def ground_one_with_gold(s_g, gold_answers, min_fscore):
    """Ground the given graph against the knowledge base and keep the
    groundings whose denotations score above min_fscore on the gold answers."""
    grounded_graphs = [
        apply_grounding(s_g, p)
        for p in graph_queries.get_graph_groundings(s_g)
    ]
    logger.debug("Number of possible groundings: {}".format(
        len(grounded_graphs)))
    logger.debug("First one: {}".format(grounded_graphs[:1]))

    i = 0
    chosen_graphs, not_chosen_graphs = [], []
    last_f1 = 0.0
    # Evaluate the groundings one by one; stop early once a grounding
    # reaches MIN_F_SCORE_TO_STOP
    while i < len(grounded_graphs) and last_f1 < MIN_F_SCORE_TO_STOP:
        s_g = grounded_graphs[i]
        s_g.denotations = graph_queries.get_graph_denotations(s_g)
        i += 1
        retrieved_answers = s_g.denotations
        evaluation_results = evaluation.retrieval_prec_rec_f1(
            gold_answers, retrieved_answers)
        last_f1 = evaluation_results[2]
        if last_f1 > min_fscore:
            chosen_graphs.append(WithScore(s_g, evaluation_results))
        elif last_f1 < 0.05:
            # Keep clearly wrong groundings as negative examples
            not_chosen_graphs.append(WithScore(s_g, evaluation_results))
    return chosen_graphs, not_chosen_graphs
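# A minimal, self-contained sketch of the set-overlap metric that
# evaluation.retrieval_prec_rec_f1 is assumed to compute above (the real
# helper may normalise or deduplicate answers differently):
def _prec_rec_f1_sketch(gold_answers, retrieved_answers):
    gold, retrieved = set(gold_answers), set(retrieved_answers)
    if not gold or not retrieved:
        return 0.0, 0.0, 0.0
    overlap = len(gold & retrieved)
    if overlap == 0:
        return 0.0, 0.0, 0.0
    precision = overlap / len(retrieved)  # fraction of retrieved answers that are correct
    recall = overlap / len(gold)          # fraction of gold answers that were retrieved
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

# Example: _prec_rec_f1_sketch(["Q76", "Q30"], ["Q76"]) == (1.0, 0.5, 2/3)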
def test_query_graph_denotations():
    """Check the denotations returned for the grounded test graphs."""
    denotations = []
    for test_graph in test_graphs_grounded:
        result = graph_queries.get_graph_denotations(test_graph)
        denotations.append(result)
        assert len(result) > 0
    # Spot-check individual graphs against their known answers
    assert denotations[2] == ['Q76']
    assert denotations[3] == ['1972']
    assert denotations[4] == ['60655', '60601', '60827', '60601', '60827']
    assert denotations[6] == ['Q84']
    assert len(denotations[7]) == 3
    assert all(a in denotations[9] for a in {"Q21", "Q145"})
    assert 'Q36465' in denotations[10]
    assert len(denotations[12]) == 1
    # Graphs without groundings should yield empty denotations
    for test_graph in test_graphs_without_groundings:
        result = graph_queries.get_graph_denotations(test_graph)
        assert len(result) == 0
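# The assertions above expect Wikidata item IDs (e.g. 'Q76', 'Q84'), so
# get_graph_denotations presumably evaluates a grounded graph as a SPARQL
# query against Wikidata. A minimal sketch of that pattern follows; the
# endpoint, the single-edge query shape, and the helper name are assumptions
# for illustration, not the project's actual query layer.
from SPARQLWrapper import SPARQLWrapper, JSON

def _denotations_sketch(entity_id="Q183", relation_id="P36"):
    # "capital (P36) of Germany (Q183)" should return ['Q64'] (Berlin)
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(
        f"SELECT ?answer WHERE {{ wd:{entity_id} wdt:{relation_id} ?answer . }}")
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    # Strip the entity URI prefix to get bare Q-ids
    return [b["answer"]["value"].rsplit("/", 1)[-1]
            for b in results["results"]["bindings"]]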
def generate(path_to_model, config_file_path):
    config, logger = config_utils.load_config(config_file_path)
    if "evaluation" not in config:
        print("Evaluation parameters not in the config file!")
        sys.exit()

    # Load the data set as specified in the config file
    with open(config['evaluation']['questions']) as f:
        webquestions_questions = json.load(f)

    # Load the entity linker if specified, otherwise the entity annotations
    # from the data set will be used
    entitylinker = None
    if 'entity.linking' in config:
        PATH_EL = "../../entity-linking/"
        sys.path.insert(0, PATH_EL)
        from entitylinking import core
        linking_config = config['entity.linking']
        logger.info("Load entity linker")
        entitylinker = getattr(core, linking_config['linker'])(
            logger=logger, **linking_config['linker.options'], pos_tags=True)

    # Load the GloVe word embeddings extended with special tokens and set the
    # global word-to-index mapping
    _, word2idx = V.extend_embeddings_with_special_tokens(
        *_utils.load_word_embeddings(
            _utils.RESOURCES_FOLDER +
            "../../resources/embeddings/glove/glove.6B.100d.txt"))
    V.WORD_2_IDX = word2idx

    # Derive the model type from the model file name and load the PyTorch model
    model_type = path_to_model.split("/")[-1].split("_")[0]
    logger.info(f"Model type: {model_type}")
    logger.info('Loading the model from: {}'.format(path_to_model))
    dummy_net = getattr(models, model_type)()
    container = fackel.TorchContainer(torch_model=dummy_net, logger=logger)
    container.load_from_file(path_to_model)

    graph_queries.FREQ_THRESHOLD = config['evaluation'].get(
        "min.relation.freq", 500)

    logger.debug('Testing')
    global_answers = []
    avg_metrics = np.zeros(3)
    data_iterator = tqdm.tqdm(webquestions_questions, ncols=100, ascii=True)
    for i, q_obj in enumerate(data_iterator):
        q = q_obj.get('utterance', q_obj.get('question'))
        q_index = q_obj['questionid']

        # Entity linking: either run the linker or use the gold annotations
        if entitylinker:
            sent = entitylinker.link_entities_in_raw_input(q, element_id=q_index)
            if "max.num.entities" in config['evaluation']:
                sent.entities = sent.entities[:config['evaluation']
                                              ["max.num.entities"]]
            sent = sentence.Sentence(input_text=sent.input_text,
                                     tagged=sent.tagged,
                                     entities=sent.entities)
        else:
            tagged = _utils.get_tagged_from_server(q, caseless=q.islower())
            sent = sentence.Sentence(input_text=q,
                                     tagged=tagged,
                                     entities=q_obj['entities'])

        chosen_graphs = staged_generation.generate_with_model(
            sent, container, beam_size=config['evaluation'].get("beam.size", 10))

        # Take the answers of the first graph with a non-empty denotation
        model_answers = []
        g = ({}, )
        if chosen_graphs:
            j = 0
            while not model_answers and j < len(chosen_graphs):
                g = chosen_graphs[j]
                model_answers = graph_queries.get_graph_denotations(g.graph)
                j += 1

        gold_answers = webquestions_io.get_answers_from_question(q_obj)
        metrics = evaluation.retrieval_prec_rec_f1(gold_answers, model_answers)
        global_answers.append((q_index, list(metrics), model_answers, [
            (c_g.graph, float(c_g.scores[2])) for c_g in chosen_graphs[:10]
        ]))
        avg_metrics += metrics
        precision, recall, f1 = tuple(avg_metrics / (i + 1))
        data_iterator.set_postfix(prec=precision, rec=recall, f1=f1)

        # Save intermediate results every 100 questions
        if i > 0 and i % 100 == 0:
            with open(config['evaluation']["save.answers.to"], 'w') as answers_out:
                json.dump(global_answers,
                          answers_out,
                          sort_keys=True,
                          indent=4,
                          cls=sentence.SentenceEncoder)

    print("Average metrics: {}".format(
        (avg_metrics / (len(webquestions_questions)))))
    logger.debug('Testing is finished')

    # Save final model output
    with open(config['evaluation']["save.answers.to"], 'w') as answers_out:
        json.dump(global_answers,
                  answers_out,
                  sort_keys=True,
                  indent=4,
                  cls=sentence.SentenceEncoder)
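# A minimal configuration covering the keys that generate() reads above,
# shown as the dict config_utils.load_config is assumed to return (the
# on-disk format, e.g. YAML, and all paths/names are placeholders):
_EXAMPLE_CONFIG = {
    "evaluation": {
        "questions": "data/webquestions.examples.test.json",
        "save.answers.to": "output/answers.json",
        "beam.size": 10,           # optional, defaults to 10
        "min.relation.freq": 500,  # optional, defaults to 500
        "max.num.entities": 3,     # optional cap on linked entities
    },
    # Optional: run an entity linker instead of the gold entity annotations
    "entity.linking": {
        "linker": "HeuristicLinker",  # hypothetical class from entitylinking.core
        "linker.options": {},
    },
}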
def generate(path_to_model, config_file_path, seed, gpuid, experiment_tag):
    config, logger = config_utils.load_config(config_file_path,
                                              gpuid=gpuid,
                                              seed=seed)
    if "evaluation" not in config:
        print("Evaluation parameters not in the config file!")
        sys.exit()

    # Get the data set name and load the data set as specified in the config file
    dataset_name = config['evaluation']['questions'].split("/")[-1].split(
        ".")[0]
    logger.info(f"Dataset: {dataset_name}")
    with open(config['evaluation']['questions']) as f:
        webquestions_questions = json.load(f)

    # Load the entity linker if specified, otherwise the entity annotations
    # in the data set will be used
    entitylinker = None
    if 'entity.linking' in config:
        PATH_EL = "../../entity-linking/"
        sys.path.insert(0, PATH_EL)
        from entitylinking import core
        linking_config = config['entity.linking']
        logger.info("Load entity linker")
        entitylinker = getattr(core, linking_config['linker'])(
            logger=logger, **linking_config['linker.options'], pos_tags=True)

    # Load the GloVe word embeddings and embeddings for special tokens
    _, word2idx = V.extend_embeddings_with_special_tokens(
        *_utils.load_word_embeddings(
            _utils.RESOURCES_FOLDER +
            "../../resources/embeddings/glove/glove.6B.100d.txt"))
    # Set the global mapping from words to indices
    V.WORD_2_IDX = word2idx

    # Derive the model type and the full model name from the model file
    model_type = path_to_model.split("/")[-1].split("_")[0]
    model_name = path_to_model.split("/")[-1].replace(".pkl", "")
    logger.info(f"Model type: {model_type}")
    logger.info('Loading the model from: {}'.format(path_to_model))

    # Load the PyTorch model
    dummy_net = getattr(models, model_type)()
    container = fackel.TorchContainer(torch_model=dummy_net, logger=logger)
    container.load_from_file(path_to_model)
    model_gated = container._model._gnn.hp_gated if model_type == "GNNModel" else False

    # Load the Freebase entity set that was used to restrict the answer space
    # in previous work, if specified
    freebase_entity_set = set()
    if config['evaluation'].get('entities.list', False):
        print("Using the Freebase entity list")
        freebase_entity_set = _utils.load_blacklist(_utils.RESOURCES_FOLDER +
                                                    "freebase-entities.txt")

    # Compose a file name for the output file
    save_answer_to = config['evaluation']["save.answers.to"]
    if not save_answer_to.endswith(".json"):
        dir_name = config['evaluation'][
            "save.answers.to"] + f"{dataset_name}/{model_type.lower()}/"
        save_answer_to = dir_name + f"{dataset_name}_predictions_{'g' if model_gated else ''}{model_name.lower()}.json"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
    print(f"Save output to {save_answer_to}")

    # Init the variables to store the results
    logger.debug('Testing')
    graph_queries.FREQ_THRESHOLD = config['evaluation'].get(
        "min.relation.freq", 500)
    global_answers = []
    avg_metrics = np.zeros(4)

    # Iterate over the questions in the dataset
    data_iterator = tqdm.tqdm(webquestions_questions, ncols=100, ascii=True)
    for i, q_obj in enumerate(data_iterator):
        q = q_obj.get('utterance', q_obj.get('question'))
        q_index = q_obj['questionid']

        # Entity linking: either run the linker or use the gold annotations
        if entitylinker:
            sent = entitylinker.link_entities_in_raw_input(q, element_id=q_index)
            if "max.num.entities" in config['evaluation']:
                sent.entities = sent.entities[:config['evaluation']
                                              ["max.num.entities"]]
            sent = sentence.Sentence(input_text=sent.input_text,
                                     tagged=sent.tagged,
                                     entities=sent.entities)
        else:
            tagged = _utils.get_tagged_from_server(q, caseless=q.islower())
            sent = sentence.Sentence(input_text=q,
                                     tagged=tagged,
                                     entities=q_obj['entities'])

        chosen_graphs = staged_generation.generate_with_model(
            sent, container, beam_size=config['evaluation'].get("beam.size", 10))

        # Take the first graph whose denotation is a valid answer set
        model_answers = []
        g = ({}, )
        j = -1
        if chosen_graphs:
            j = 0
            valid_answer_set = False
            while not valid_answer_set and j < len(chosen_graphs):
                g = chosen_graphs[j]
                model_answers = graph_queries.get_graph_denotations(g.graph)
                if model_answers:
                    valid_answer_set = True
                    if freebase_entity_set:
                        # Accept the answer set only if every answer has a
                        # label that appears in the Freebase entity list
                        labeled_answers = {
                            l.lower()
                            for _, labels in queries.get_labels_for_entities(
                                model_answers).items() for l in labels
                        }
                        valid_answer_set = len(
                            labeled_answers
                            & freebase_entity_set) > len(model_answers) - 1
                j += 1

        gold_answers = webquestions_io.get_answers_from_question(q_obj)
        metrics = evaluation.retrieval_prec_rec_f1(gold_answers, model_answers)
        global_answers.append((q_index, list(metrics), model_answers, [
            (c_g.graph, float(c_g.scores[2])) for c_g in chosen_graphs[:10]
        ]))
        # Track the rank j of the selected graph alongside the metrics
        avg_metrics += metrics + (j, )
        precision, recall, f1, g_j = tuple(avg_metrics / (i + 1))
        data_iterator.set_postfix(prec=precision, rec=recall, f1=f1, g_j=g_j)

        # Save intermediate results
        if i > 0 and i % 100 == 0:
            with open(save_answer_to, 'w') as answers_out:
                json.dump(global_answers,
                          answers_out,
                          sort_keys=True,
                          indent=4,
                          cls=sentence.SentenceEncoder)

    avg_metrics = avg_metrics / (len(webquestions_questions))
    print("Average metrics: {}".format(avg_metrics))

    # Fine-grained results, if there is a mapping of questions to the number
    # of relations needed to find the correct answer
    results_by_hops = {}
    if "qid2hop" in config['evaluation']:
        with open(config['evaluation']['qid2hop']) as f:
            q_index2hop = json.load(f)
        print("Results by hop: ")
        hops_dist = Counter([q_index2hop[p[0]] for p in global_answers])
        results_by_hops = {
            i: np.zeros(3)
            for i in range(max(hops_dist.keys()) + 1)
        }
        for p in global_answers:
            metrics = tuple(p[1])
            results_by_hops[q_index2hop[p[0]]] += metrics
        for m in results_by_hops:
            if hops_dist[m] > 0:
                results_by_hops[m] = results_by_hops[m] / hops_dist[m]
        print(results_by_hops)
    # Add results to the results file
    if "add.results.to" in config['evaluation']:
        print(f"Adding results to {config['evaluation']['add.results.to']}")
        with open(config['evaluation']["add.results.to"], 'a+') as results_out:
            results_out.write(",".join([
                model_name, model_type, "Gated" if model_gated else "Simple",
                str(seed), dataset_name, "full",
                "EntityList" if freebase_entity_set else "NoEntityList"
            ] + [str(el) for el in avg_metrics[:3]]))
            results_out.write("\n")
            # Include fine-grained results if available
            if results_by_hops:
                for i in range(max(results_by_hops.keys()) + 1):
                    results_out.write(",".join([
                        model_name, model_type,
                        "Gated" if model_gated else "Simple",
                        container.description, str(seed), dataset_name,
                        str(i),
                        "EntityList" if freebase_entity_set else "NoEntityList"
                    ] + [str(el) for el in results_by_hops[i]] +
                                      [experiment_tag]))
                    results_out.write("\n")

    # Save final model output
    with open(save_answer_to, 'w') as answers_out:
        json.dump(global_answers,
                  answers_out,
                  sort_keys=True,
                  indent=4,
                  cls=sentence.SentenceEncoder)
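# For reference, a full-dataset row appended to "add.results.to" above has
# this comma-separated shape (values are illustrative):
#
#   GNNModel_2019-01-01,GNNModel,Gated,42,webqsp,full,NoEntityList,0.71,0.69,0.70
#
# i.e. model name, model type, gating, seed, dataset, hop bucket ("full" or a
# hop index), entity-list flag, then precision/recall/F1; the per-hop rows
# additionally carry container.description and the experiment tag.
#
# Usage sketch (assumed entry point; the project may expose this through its
# own CLI instead, and the paths are placeholders):
#
#   generate(path_to_model="trainedmodels/GNNModel_2019-01-01.pkl",
#            config_file_path="configs/eval_config.yaml",
#            seed=42, gpuid=0, experiment_tag="baseline")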