def process_file(solve, input_file):
    """Run *solve* on a parsed input file, score the result, and write it out.

    Args:
        solve: callable taking the parsed world and returning a solution.
        input_file: file name, resolved relative to ./input_files.

    Returns:
        The score of the produced solution, as computed by calculate_score.
    """
    print(f"processing {input_file}")
    output_file = get_output_file(input_file)
    world = parse(input_file=os.path.join('./input_files', input_file))
    analyze_world(world)
    t0 = time.time()
    solution = solve(world)
    t1 = time.time()
    # Timings are reported separately so a slow solver and slow scoring
    # can be told apart.
    print(f"solution took {t1 - t0:.1f} sec")
    score = calculate_score(world, solution)
    t2 = time.time()
    print(f"calculate score took {t2 - t1:.1f} sec")
    # int() matches the original "%d" formatting (truncates a float score).
    print(f"SCORE: {int(score)}")
    write(solution, output_file)
    return score
def run_expander_experiment(exp_name, bert, bert_tokenizer):
    """Dispatch one expansion experiment by name and return its MAP score.

    Prints a message and returns -1 when *exp_name* matches no known
    experiment family.
    """
    seed_terms = cfg.general_config['seed']
    log_file = utils.get_output_file(seed_terms, exp_name)
    # Explicit dispatch: each experiment family is matched by exact name
    # or by its configured-sweep prefix.
    if exp_name in ("MPB1", "BB") or exp_name.startswith("sent_num"):
        expansion = MPB1.expand_with_mpb1(seed_terms, log_file, bert, bert_tokenizer)
    elif exp_name == "MPB2" or exp_name.startswith("sim_param"):
        expansion = MPB2.expand_with_mpb2(seed_terms, log_file, bert, bert_tokenizer)
    else:
        print("wrong exp name.")
        return -1
    return utils.evaluate(expansion, log_file, cfg.general_config['set_file'])
def expand_with_mpb2(seed_terms, log_output_file=None, bert=None, bert_tokenizer=None):
    """Expand *seed_terms* with the MPB2 procedure and return the ranked terms.

    Any of the model, tokenizer, or log file arguments that the caller
    omits are created on demand. Returns the candidate terms ordered as
    produced by score_candidates (scores stripped).
    """
    # Lazily build whatever model pieces were not supplied.
    if not bert or not bert_tokenizer:
        bert, bert_tokenizer = model_utils.get_model_and_tokenizer_bert("bert-large-uncased")
    if not log_output_file:
        log_output_file = utils.get_output_file(seed_terms)

    # Candidate pool: terms closest to the seed in sense2vec space.
    candidates = sence2vec_utils.get_candidates_closest_to_seed_terms(
        seed_terms,
        cfg.general_config['size_of_expanded'],
        cfg.MPB2_config['total_terms_to_consider'])
    log_output_file.write(f"Using {len(candidates)} candidates\n")

    if cfg.MPB2_config['assume_oracle_candidates']:
        # Oracle mode: guarantee the gold terms appear among the candidates.
        expected_terms = utils.get_first_syn_of_terms_from_file(
            cfg.general_config['set_file'])
        utils.print_candidate_stats_to_output_file(
            log_output_file, candidates, expected_terms, len(expected_terms))
        candidates = candidates.union(expected_terms)

    masked = utils.get_masked_sentences_for_seed(
        seed_terms, log_output_file,
        cfg.general_config['num_of_sentences'],
        cfg.general_config['use_indexer'],
        cfg.general_config['corpus_dir'],
        bert_tokenizer)
    patterns_with_positions = model_utils.get_indicative_patterns(
        bert, bert_tokenizer, masked, seed_terms,
        cfg.general_config['num_of_indicative_patterns'],
        log_output_file,
        cfg.general_config['batch_size'])
    patterns_only = [entry[0] for entry in patterns_with_positions]
    patterns_with_suggestions = model_utils.get_patterns_top_k_model_suggestions_for_each_pattern(
        bert, bert_tokenizer, patterns_only,
        cfg.MPB2_config['similarity_param'],
        cfg.general_config['batch_size'])

    # Score every candidate against the indicative patterns and log the ranking.
    scored = score_candidates(
        bert, bert_tokenizer, candidates,
        patterns_with_positions, patterns_with_suggestions)
    utils.print_expansion_with_scores_to_output_file(scored, log_output_file)
    return [entry[0] for entry in scored]
def expand_with_mpb1(seed_terms, log_output_file=None, bert=None, bert_tokenizer=None):
    """Expand *seed_terms* with the MPB1 procedure and return the suggestions.

    Model, tokenizer, and log file are created on demand when the caller
    does not provide them.
    """
    # Fill in any model pieces the caller left out.
    if not bert or not bert_tokenizer:
        bert, bert_tokenizer = model_utils.get_model_and_tokenizer_bert("bert-large-uncased")
    if not log_output_file:
        log_output_file = utils.get_output_file(seed_terms)

    masked = utils.get_masked_sentences_for_seed(
        seed_terms, log_output_file,
        cfg.general_config['num_of_sentences'],
        cfg.general_config['use_indexer'],
        cfg.general_config['corpus_dir'],
        bert_tokenizer)
    patterns_with_positions = model_utils.get_indicative_patterns(
        bert, bert_tokenizer, masked, seed_terms,
        cfg.general_config['num_of_indicative_patterns'],
        log_output_file,
        cfg.general_config['batch_size'])
    # Aggregate the model's top suggestions over the whole pattern group.
    suggestions = model_utils.get_models_top_k_suggestions_for_group_of_patterns(
        bert, bert_tokenizer, patterns_with_positions,
        cfg.general_config['size_of_expanded'],
        cfg.general_config['batch_size'])
    utils.print_expansion_to_output_file(suggestions, log_output_file)
    return suggestions
# NOTE(review): an identical expand_with_mpb1 appears earlier in this source —
# looks like a concatenation artifact; confirm which copy is canonical.
def expand_with_mpb1(seed_terms, log_output_file=None, bert=None, bert_tokenizer=None):
    """Expand *seed_terms* with the MPB1 procedure and return the suggestions.

    Model, tokenizer, and log file are created on demand when the caller
    does not provide them.
    """
    if not bert or not bert_tokenizer:
        bert, bert_tokenizer = model_utils.get_model_and_tokenizer_bert("bert-large-uncased")
    if not log_output_file:
        log_output_file = utils.get_output_file(seed_terms)

    masked = utils.get_masked_sentences_for_seed(
        seed_terms, log_output_file,
        cfg.general_config['num_of_sentences'],
        cfg.general_config['use_indexer'],
        cfg.general_config['corpus_dir'],
        bert_tokenizer)
    patterns_with_positions = model_utils.get_indicative_patterns(
        bert, bert_tokenizer, masked, seed_terms,
        cfg.general_config['num_of_indicative_patterns'],
        log_output_file,
        cfg.general_config['batch_size'])
    # Aggregate the model's top suggestions over the whole pattern group.
    suggestions = model_utils.get_models_top_k_suggestions_for_group_of_patterns(
        bert, bert_tokenizer, patterns_with_positions,
        cfg.general_config['size_of_expanded'],
        cfg.general_config['batch_size'])
    utils.print_expansion_to_output_file(suggestions, log_output_file)
    return suggestions


if __name__ == "__main__":
    # Script entry point: expand the configured seed and evaluate the result.
    seed = cfg.general_config['seed']
    output_file = utils.get_output_file(seed)
    bert_model, bert_tok = model_utils.get_model_and_tokenizer_bert("bert-large-uncased")
    expanded = expand_with_mpb1(seed, output_file, bert_model, bert_tok)
    utils.evaluate(expanded, output_file, cfg.general_config['set_file'])
# NOTE(review): this span begins mid-definition — the `def` line and the opening
# of the call these first arguments belong to are outside this view, so the
# fragment is reproduced unchanged.
        cfg.general_config['use_indexer'], cfg.general_config['corpus_dir'], bert_tokenizer)
    # Mine indicative masked patterns (each paired with its mask position).
    indicative_patterns_and_max_positions = model_utils.get_indicative_patterns(
        bert, bert_tokenizer, masked_sentences, seed_terms,
        cfg.general_config['num_of_indicative_patterns'], log_output_file,
        cfg.general_config['batch_size'])
    indicative_patterns = [i[0] for i in indicative_patterns_and_max_positions]
    # Top model suggestions per pattern; used as a similarity signal below.
    indicative_patterns_and_top_suggestions = model_utils.get_patterns_top_k_model_suggestions_for_each_pattern\
        (bert, bert_tokenizer, indicative_patterns, cfg.MPB2_config['similarity_param'],
         cfg.general_config['batch_size'])
    # Score every candidate against the indicative patterns and log the ranking.
    candidate_score_list = score_candidates(
        bert, bert_tokenizer, candidates, indicative_patterns_and_max_positions,
        indicative_patterns_and_top_suggestions)
    utils.print_expansion_with_scores_to_output_file(candidate_score_list,
                                                     log_output_file)
    # Return just the terms, dropping the scores.
    return [i[0] for i in candidate_score_list]


if __name__ == "__main__":
    # Seed may be configured inline as a list or as a path to a seeds file.
    if type(cfg.general_config['seed']) is list:
        seed = cfg.general_config['seed']
    else:
        seed = utils.read_seeds(cfg.general_config['seed'])
    # output_file = utils.get_output_file(seed)
    output_file = utils.get_output_file(seed, description='race_without_black_white')
    model, tokenizer = model_utils.get_model_and_tokenizer_bert(
        "bert-large-uncased")
    expanded = expand_with_mpb2(seed, output_file, model, tokenizer)
    # utils.evaluate(expanded, output_file, cfg.general_config['set_file'])