def main():
    '''Run every test in ``sent_tests`` and log both WEAT variants per test.

    For each test the encodings are produced by ``get_encodings`` (honouring
    ``args.debias`` / ``args.equalize``), scored with ``weat.run_test`` and
    again with ``binary_weat``; one summary line per test is logged at the end.
    '''
    args = parse_args()

    summary_lines = []
    for test_name, raw_encs in sent_tests.items():
        encs = get_encodings(args, raw_encs,
                             debias=args.debias, equalize=args.equalize)
        esize, pval = weat.run_test(encs,
                                    n_samples=args.n_samples,
                                    parametric=args.parametric)

        # binary_weat takes plain lists of vectors, grouped as
        # [targ1, targ2] and [attr1, attr2].
        targets = [list(encs[key]['encs'].values())
                   for key in ('targ1', 'targ2')]
        attributes = [list(encs[key]['encs'].values())
                      for key in ('attr1', 'attr2')]
        weat_score, effect_size = binary_weat(targets, attributes)

        summary_lines.append(
            "{}: esize={} pval={} | w_score={} esize={}".format(
                test_name, esize, pval, weat_score, effect_size))

    for line in summary_lines:
        logger.info(line)
def evaluate(self):
    '''Run the WEAT test on every file in ``self.filenames`` and save the results.

    Each file is loaded from ``DATA_DIR``, encoded with
    ``self.get_encodings`` and scored with ``weat.run_test``. A summary line
    per file is printed immediately and logged once all files are done; the
    per-file ``esize`` / ``pval`` values are written to
    ``<results_dir>/<output_name>`` via ``save_dict_to_json``.
    '''
    cfg = self.args
    if not os.path.exists(cfg.results_dir):
        os.makedirs(cfg.results_dir)
    destination = os.path.join(cfg.results_dir, cfg.output_name)

    summaries = []
    per_test = {}
    for name in self.filenames:
        payload = load_json(os.path.join(DATA_DIR, name))
        encodings = self.get_encodings(payload)
        effect, p_value = weat.run_test(encodings,
                                        n_samples=cfg.n_samples,
                                        parametric=cfg.parametric)

        line = "{}: esize={} pval={}".format(name, effect, p_value)
        print(name, line)
        summaries.append(line)
        per_test[name] = {"esize": effect, "pval": p_value}

    # print and save results
    for line in summaries:
        logger.info(line)
    save_dict_to_json(per_test, destination)
def evaluate(args, word_level=False):
    '''Evaluate bias level with given definitional sentence pairs.

    Runs the sentence-level WEAT tests 6, 6b, 7, 7b, 8 and 8b. In normal mode
    the effect size and p-value of every test, plus the mean absolute effect
    size, are printed, logged and saved to
    ``<args.results_dir>/<args.output_name>``. In ``args.encode_only`` mode
    the encodings of each test are pickled into ``args.results_dir`` instead
    and no statistics are computed.

    Args:
        args: parsed options. Reads ``results_dir``, ``output_name``,
            ``encode_only``, ``debias``, ``gendered_words_filename``,
            ``n_samples`` and ``parametric``; also forwarded to
            ``get_encodings``.
        word_level: accepted for backward compatibility; not used by the
            code in this function.

    Returns:
        None. Returns early (after printing a notice) when not in
        ``encode_only`` mode and the results file already exists.
    '''
    results_path = os.path.join(args.results_dir, args.output_name)
    if not args.encode_only and os.path.exists(results_path):
        print("Results already evaluated in {}".format(results_path))
        return
    # Both modes write into results_dir (pickles or the JSON summary), so
    # create it unconditionally; exist_ok avoids the check-then-create race.
    os.makedirs(args.results_dir, exist_ok=True)

    results = []
    all_tests_dict = dict()

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    print("tokenizer: {}".format(tokenizer is None))

    # The parsed word list is not used below; the file is still opened and
    # parsed so a missing or malformed file raises here exactly as before.
    with open(args.gendered_words_filename, "r") as f:
        json.load(f)

    abs_esizes = []
    for test_id in ['6', '6b', '7', '7b', '8', '8b']:
        filename = "sent-weat{}.jsonl".format(test_id)
        sent_file = os.path.join(DATA_DIR, filename)
        data = load_json(sent_file)
        encs = get_encodings(args, data)

        if args.encode_only:
            if args.debias:
                outfile_name = 'debiased_encs{}.pkl'.format(test_id)
            else:
                outfile_name = 'biased_encs{}.pkl'.format(test_id)
            with open(os.path.join(args.results_dir, outfile_name), 'wb') as outfile:
                pickle.dump(encs, outfile)
            continue

        # encs: targ1, targ2, attr1, attr2
        #   -> category
        #   -> encs
        #       -> (id1, sent1_emb), (id2, sent2_emb), ...
        esize, pval = weat.run_test(encs, n_samples=args.n_samples,
                                    parametric=args.parametric)
        abs_esizes.append(abs(esize))

        result = "{}: esize={} pval={}".format(filename, esize, pval)
        print(filename, result)
        results.append(result)
        all_tests_dict[filename] = {"esize": esize, "pval": pval}

    # In encode-only mode no effect sizes were collected; stop before
    # averaging so np.mean is never called on an empty array (which would
    # warn and report nan).
    if args.encode_only:
        return

    avg_absesize = np.mean(np.array(abs_esizes))
    print("Averge of Absolute esize: {}".format(avg_absesize))
    all_tests_dict['avg_absesize'] = avg_absesize

    # print and save results
    for result in results:
        logger.info(result)
    save_dict_to_json(all_tests_dict, results_path)
if (args.encode_only): if (args.debias): outfile_name = 'debiased_encs{}.pkl'.format(test_id) else: outfile_name = 'biased_encs{}.pkl'.format(test_id) with open(os.path.join(args.results_dir, outfile_name), 'wb') as outfile: pickle.dump(encs, outfile) continue ''' encs: targ1, targ2, attr1, attr2 -> category -> encs -> (id1, sent1_emb), (id2, sent2_emb), ... ''' esize, pval = weat.run_test(encs, n_samples=args.n_samples, parametric=args.parametric) # weat_score, effect_size = run_binary_weat_test(encs) # results.append("{}: esize={} pval={} | w_score={} esize={}".format(filename, # esize, pval, weat_score, effect_size)) # test_results = {"esize": esize, "pval": pval, "weat_score": weat_score, "effect_size": effect_size} result = "{}: esize={} pval={}".format(filename, esize, pval) print(filename, result) results.append(result) test_results = {"esize": esize, "pval": pval} all_tests_dict[filename] = test_results if (args.encode_only): return # print and save results
def main(arguments):
    '''Main logic: parse args for tests to run and which models to evaluate.

    For every selected model and test, the four example sets of the test
    (targ1, targ2, attr1, attr2) are encoded -- or read back from a cached
    encodings file under ``args.exp_dir`` -- and passed to ``weat.run_test``.
    One result row is collected per (model, test) pair. Rows are logged after
    each model and, when ``args.results_path`` is set, written out as a
    tab-separated file.

    Args:
        arguments: raw arguments, forwarded unchanged to ``handle_arguments``.

    Raises:
        Exception: a path option required by a selected model is missing.
        ValueError: a model name is not recognised, or more than two GenSen
            versions were requested.
    '''
    log.basicConfig(format='%(asctime)s: %(message)s',
                    datefmt='%m/%d %I:%M:%S %p', level=log.INFO)

    args = handle_arguments(arguments)
    if args.seed >= 0:
        log.info('Seeding random number generators with {}'.format(args.seed))
        random.seed(args.seed)
        np.random.seed(args.seed)
    maybe_make_dir(args.exp_dir)
    if args.log_file:
        log.getLogger().addHandler(log.FileHandler(args.log_file))
    log.info("Parsed args: \n%s", args)

    all_tests = sorted(
        [
            entry[:-len(TEST_EXT)]
            for entry in os.listdir(args.data_dir)
            if not entry.startswith('.') and entry.endswith(TEST_EXT)
        ],
        key=test_sort_key
    )
    log.debug('Tests found:')
    for test in all_tests:
        log.debug('\t{}'.format(test))

    tests = (split_comma_and_check(args.tests, all_tests, "test")
             if args.tests is not None else all_tests)
    log.info('Tests selected:')
    for test in tests:
        log.info('\t{}'.format(test))

    models = (split_comma_and_check(args.models, MODEL_NAMES, "model")
              if args.models is not None else MODEL_NAMES)
    log.info('Models selected:')
    for model in models:
        log.info('\t{}'.format(model))

    # The four example sets every test file provides, in processing order.
    set_names = ('targ1', 'targ2', 'attr1', 'attr2')

    results = []
    for model_name in models:
        # Different models have different interfaces for things, but generally want to:
        # - if saved vectors aren't there:
        #   - load the model
        #   - load the test data
        #   - encode the vectors
        #   - dump the files into some storage
        # - else load the saved vectors
        log.info('Running tests for model {}'.format(model_name))

        # Validate model-specific options up front and build the option
        # string that becomes part of the cache file name.
        if model_name == ModelName.BOW.value:
            model_options = ''
            if args.glove_path is None:
                raise Exception('glove_path must be specified for {} model'.format(model_name))
        elif model_name == ModelName.INFERSENT.value:
            if args.glove_path is None:
                raise Exception('glove_path must be specified for {} model'.format(model_name))
            if args.infersent_dir is None:
                raise Exception('infersent_dir must be specified for {} model'.format(model_name))
            model_options = ''
        elif model_name == ModelName.GENSEN.value:
            if args.glove_h5_path is None:
                raise Exception('glove_h5_path must be specified for {} model'.format(model_name))
            if args.gensen_dir is None:
                raise Exception('gensen_dir must be specified for {} model'.format(model_name))
            gensen_version_list = split_comma_and_check(
                args.gensen_version, GENSEN_VERSIONS, "gensen_prefix")
            if len(gensen_version_list) > 2:
                raise ValueError('gensen_version can only have one or two elements')
            model_options = 'version=' + args.gensen_version
        elif model_name == ModelName.GUSE.value:
            model_options = ''
        elif model_name == ModelName.COVE.value:
            if args.cove_encs is None:
                raise Exception('cove_encs must be specified for {} model'.format(model_name))
            model_options = ''
        elif model_name == ModelName.ELMO.value:
            model_options = 'time_combine={};layer_combine={}'.format(
                args.time_combine_method, args.layer_combine_method)
        elif model_name == ModelName.BERT.value:
            model_options = 'version=' + args.bert_version
        elif model_name == ModelName.OPENAI.value:
            if args.openai_encs is None:
                raise Exception('openai_encs must be specified for {} model'.format(model_name))
            model_options = ''
        else:
            raise ValueError("Model %s not found!" % model_name)

        # Loaded lazily by the first test that needs it and then reused.
        model = None

        for test in tests:
            log.info('Running test {} for model {}'.format(test, model_name))
            enc_file = os.path.join(args.exp_dir, "%s.%s.h5" % (
                "%s;%s" % (model_name, model_options) if model_options else model_name,
                test))
            if not args.ignore_cached_encs and os.path.isfile(enc_file):
                log.info("Loading encodings from %s", enc_file)
                encs = load_encodings(enc_file)
            else:
                # load the test data
                encs = load_json(os.path.join(args.data_dir, "%s%s" % (test, TEST_EXT)))

                # load the model and do model-specific encoding procedure;
                # every branch fills set_encs with one entry per example set
                log.info('Computing sentence encodings')
                if model_name == ModelName.BOW.value:
                    set_encs = {
                        name: bow.encode(encs[name]["examples"], args.glove_path)
                        for name in set_names
                    }

                elif model_name == ModelName.INFERSENT.value:
                    if model is None:
                        model = infersent.load_infersent(
                            args.infersent_dir, args.glove_path,
                            train_data='all', use_cpu=args.use_cpu)
                    model.build_vocab(
                        [
                            example
                            for name in set_names
                            for example in encs[name]['examples']
                        ],
                        tokenize=True)
                    log.info("Encoding sentences for test %s with model %s...",
                             test, model_name)
                    set_encs = {
                        name: infersent.encode(model, encs[name]["examples"])
                        for name in set_names
                    }

                elif model_name == ModelName.GENSEN.value:
                    if model is None:
                        gensen_1 = gensen.GenSenSingle(
                            model_folder=args.gensen_dir,
                            filename_prefix=gensen_version_list[0],
                            pretrained_emb=args.glove_h5_path,
                            cuda=not args.use_cpu)
                        model = gensen_1

                        if len(gensen_version_list) == 2:
                            gensen_2 = gensen.GenSenSingle(
                                model_folder=args.gensen_dir,
                                filename_prefix=gensen_version_list[1],
                                pretrained_emb=args.glove_h5_path,
                                cuda=not args.use_cpu)
                            model = gensen.GenSen(gensen_1, gensen_2)

                    vocab = gensen.build_vocab([
                        s
                        for name in set_names
                        for s in encs[name]["examples"]
                    ])
                    model.vocab_expansion(vocab)
                    set_encs = {
                        name: gensen.encode(model, encs[name]["examples"])
                        for name in set_names
                    }

                elif model_name == ModelName.GUSE.value:
                    model = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
                    if args.use_cpu:
                        kwargs = dict(device_count={'GPU': 0})
                    else:
                        kwargs = dict()
                    config = tf.ConfigProto(**kwargs)
                    config.gpu_options.per_process_gpu_memory_fraction = 0.5  # maximum alloc gpu50% of MEM
                    config.gpu_options.allow_growth = True  # allocate dynamically
                    with tf.Session(config=config) as session:
                        session.run([tf.global_variables_initializer(), tf.tables_initializer()])

                        def guse_encode(sents):
                            # Map each input sentence to its embedding (as a list).
                            encs_node = model(sents)
                            vectors = session.run(encs_node)
                            return {
                                sents[j]: enc
                                for j, enc in enumerate(np.array(vectors).tolist())
                            }

                        set_encs = {
                            name: guse_encode(encs[name]["examples"])
                            for name in set_names
                        }

                elif model_name == ModelName.COVE.value:
                    load_encs_from = os.path.join(args.cove_encs, "%s.encs" % test)
                    encs = load_jiant_encodings(load_encs_from, n_header=1)
                    # Previously this branch never populated the per-set
                    # encodings, so the write-back below hit a NameError or
                    # reused the previous test's vectors.
                    # NOTE(review): assumes load_jiant_encodings returns the
                    # same {set: {"encs": ...}} layout the OpenAI branch
                    # reads -- confirm against that helper.
                    set_encs = {name: encs[name]["encs"] for name in set_names}

                elif model_name == ModelName.ELMO.value:
                    kwargs = dict(time_combine_method=args.time_combine_method,
                                  layer_combine_method=args.layer_combine_method)
                    set_encs = {
                        name: elmo.encode(encs[name]["examples"], **kwargs)
                        for name in set_names
                    }

                elif model_name == ModelName.BERT.value:
                    model, tokenizer = bert.load_model(args.bert_version)
                    set_encs = {
                        name: bert.encode(model, tokenizer, encs[name]["examples"])
                        for name in set_names
                    }

                elif model_name == ModelName.OPENAI.value:
                    load_encs_from = os.path.join(args.openai_encs, "%s.encs" % test)
                    encs = load_encodings(load_encs_from)
                    set_encs = {name: encs[name]["encs"] for name in set_names}

                else:
                    raise ValueError("Model %s not found!" % model_name)

                for name in set_names:
                    encs[name]["encs"] = set_encs[name]

                log.info("\tDone!")
                if not args.dont_cache_encs:
                    log.info("Saving encodings to %s", enc_file)
                    save_encodings(encs, enc_file)

            # Peek at one encoding to report the representation size without
            # copying every vector into a temporary list.
            enc = next(iter(encs["targ1"]['encs'].values()))
            d_rep = enc.size if isinstance(enc, np.ndarray) else len(enc)

            # run the test on the encodings
            log.info("Running SEAT...")
            log.info("Representation dimension: {}".format(d_rep))
            esize, pval = weat.run_test(encs, n_samples=args.n_samples,
                                        parametric=args.parametric)
            results.append(dict(
                model=model_name,
                options=model_options,
                test=test,
                p_value=pval,
                effect_size=esize,
                num_targ1=len(encs['targ1']['encs']),
                num_targ2=len(encs['targ2']['encs']),
                num_attr1=len(encs['attr1']['encs']),
                num_attr2=len(encs['attr2']['encs'])))

        log.info("Model: %s", model_name)
        log.info('Options: {}'.format(model_options))
        # results accumulates rows for every model, so restrict this model's
        # summary to its own rows instead of re-listing earlier models.
        for r in results:
            if r['model'] == model_name:
                log.info("\tTest {test}:\tp-val: {p_value:.9f}\tesize: {effect_size:.2f}".format(**r))

    if args.results_path is not None:
        if not results:
            # Nothing was run, so there is no header row to derive.
            log.warning('No results to write; skipping {}'.format(args.results_path))
        else:
            log.info('Writing results to {}'.format(args.results_path))
            # newline='' lets the csv writer control line endings itself.
            with open(args.results_path, 'w', newline='') as f:
                writer = DictWriter(f, fieldnames=list(results[0].keys()),
                                    delimiter='\t')
                writer.writeheader()
                writer.writerows(results)