def calculate_recall_for_kw_candidates(data_dir, recreate_ontology=False, verbose=False):
    """
    Generate keyword candidates for files in a given directory
    and compute their recall in reference to ground truth answers
    :param data_dir: directory with .txt and .key files
    :param recreate_ontology: boolean flag for recreating the ontology
    :param verbose: whether to print computation times

    :return average_recall: float
    """
    average_recall = 0.0
    total_kw_number = 0

    ontology = get_ontology(recreate=recreate_ontology)
    docs = get_documents(data_dir)
    considered_keywords = set(get_keywords())
    total_docs = 0

    start_time = time.clock()
    for doc in docs:
        kw_candidates = {kw.get_canonical_form()
                         for kw in generate_keyword_candidates(doc, ontology)}
        answers = get_answers_for_doc(
            doc.filename,
            data_dir,
            filtered_by=considered_keywords,
        )

        # float() avoids Python 2 integer division, which would truncate
        # the ratio to 0 whenever the candidates do not cover all answers.
        recall = 1 if not answers else float(len(kw_candidates & answers)) / len(answers)

        if verbose:
            print
            print("Paper: " + doc.filename)
            print("Candidates: " + str(len(kw_candidates)))
            print("Recall: " + unicode(recall * 100) + "%")

        average_recall += recall
        total_kw_number += len(kw_candidates)
        total_docs += 1

    # Guard against an empty data directory: dividing by zero documents
    # would raise ZeroDivisionError. With no documents the recall stays 0.0.
    if total_docs > 0:
        average_recall /= total_docs

    if verbose:
        print
        print("Total # of keywords: " + str(total_kw_number))
        print("Time elapsed: " + str(time.clock() - start_time))

    return average_recall
def remove_not_considered_keywords(candidates):
    """
    Filter the candidates, dropping the ones that we should not consider.

    Note: despite operating on a set, this does NOT mutate the input in
    place -- a new filtered set is built and returned.

    :param candidates: set of KeywordTokens

    :return: new set containing only the considered KeywordTokens
    """
    # A negative NO_OF_LABELS means "consider every label": skip filtering.
    if NO_OF_LABELS < 0:
        return candidates

    considered_kw = set(get_keywords())
    return {kt for kt in candidates if kt.get_canonical_form() in considered_kw}
def test(
        testset_path,
        ontology=ONTOLOGY_PATH,
        model=MODEL_PATH,
        recreate_ontology=False,
        verbose=True,
):
    """
    Test the trained model on a set under a given path.
    :param testset_path: path to the directory with the test set
    :param ontology: path to the ontology
    :param model: path where the model is pickled
    :param recreate_ontology: boolean flag whether to recreate the ontology
    :param verbose: whether to print computation times

    :return tuple of three floats (precision, recall, f1_score)
    """
    # Both the model and the ontology may be passed either as ready
    # objects or as filesystem paths (str/unicode); load paths here.
    if type(model) in [str, unicode]:
        model = load_from_disk(model)

    if type(ontology) in [str, unicode]:
        ontology = get_ontology(path=ontology, recreate=recreate_ontology)

    keywords = get_keywords()
    keyword_indices = {kw: i for i, kw in enumerate(keywords)}

    # Discover the available metric names by running the metric
    # computation once on dummy data, then collect per-document values.
    metric_names = calculate_basic_metrics([range(5)]).keys()
    collected = {name: [] for name in metric_names}

    for doc in get_documents(testset_path, as_generator=True):
        feature_matrix, answers, kw_vector = build_test_matrices(
            [doc],
            model,
            testset_path,
            ontology,
        )

        y_true = build_y_true(answers, keyword_indices, doc.doc_id)

        # Predict a ranking and reorder the ground truth accordingly
        ranking = model.scale_and_predict(feature_matrix.as_matrix())
        y_pred = y_true[0][ranking[::-1]]

        doc_metrics = calculate_basic_metrics([y_pred])
        for name, value in doc_metrics.iteritems():
            collected[name].append(value)

    # Average every metric over all documents in the test set.
    return {name: np.mean(values) for name, values in collected.iteritems()}
def generate_keyword_candidates(document, ontology):
    """
    Produce the set of KeywordToken candidates for a document.

    :param document: Document object containing the text that we generate
    generate keywords from
    :param ontology: Ontology object on which we match the keywords

    :return: set of KeywordTokens
    """
    # With a small label space, every known keyword becomes a candidate,
    # regardless of the configured strategy.
    if NO_OF_LABELS <= 500:
        candidates = set()
        for index, label in enumerate(get_keywords()):
            candidates.add(KeywordToken(
                index,
                canonical_label=label,
                parsed_label=ontology.parse_label(label),
            ))
        return candidates

    if STRATEGY == 'NGRAMS':
        return generate_ngram_candidates(document, ontology)
    if STRATEGY == 'SUBGRAPH':
        return generate_subgraph_candidates(document, ontology)
    if STRATEGY == 'ENSEMBLE':
        subgraph_kws = generate_subgraph_candidates(document, ontology)
        ngram_kws = generate_ngram_candidates(document, ontology)
        return subgraph_kws | ngram_kws

    raise ValueError("Unknown STRATEGY = " + STRATEGY)
def generate_keyword_candidates(document, ontology):
    """
    Produce the set of KeywordToken candidates for a document.

    :param document: Document object containing the text that we generate
    generate keywords from
    :param ontology: Ontology object on which we match the keywords

    :return: set of KeywordTokens
    """
    # NOTE(review): this function is defined twice in this file with
    # identical bodies; the definition that appears later in the module
    # wins at import time. One of the copies should be removed.
    if NO_OF_LABELS <= 500:
        # Small label space: every known keyword is a candidate,
        # whatever STRATEGY is configured.
        return {
            KeywordToken(i,
                         canonical_label=kw,
                         parsed_label=ontology.parse_label(kw))
            for i, kw in enumerate(get_keywords())
        }

    if STRATEGY == 'NGRAMS':
        return generate_ngram_candidates(document, ontology)
    elif STRATEGY == 'SUBGRAPH':
        return generate_subgraph_candidates(document, ontology)
    elif STRATEGY == 'ENSEMBLE':
        from_subgraph = generate_subgraph_candidates(document, ontology)
        from_ngrams = generate_ngram_candidates(document, ontology)
        return from_subgraph | from_ngrams
    else:
        raise ValueError("Unknown STRATEGY = " + STRATEGY)
def calculate_recall_for_kw_candidates(data_dir, recreate_ontology=False, verbose=False):
    """
    Generate keyword candidates for files in a given directory
    and compute their recall in reference to ground truth answers
    :param data_dir: directory with .txt and .key files
    :param recreate_ontology: boolean flag for recreating the ontology
    :param verbose: whether to print computation times

    :return average_recall: float
    """
    average_recall = 0.0
    total_kw_number = 0

    ontology = get_ontology(recreate=recreate_ontology)
    docs = get_documents(data_dir)
    considered_keywords = set(get_keywords())
    total_docs = 0

    start_time = time.clock()
    for doc in docs:
        kw_candidates = {
            kw.get_canonical_form()
            for kw in generate_keyword_candidates(doc, ontology)
        }
        answers = get_answers_for_doc(
            doc.filename,
            data_dir,
            filtered_by=considered_keywords,
        )

        # float() sidesteps Python 2 integer division, which would
        # truncate any partial recall down to 0.
        recall = 1 if not answers else float(len(kw_candidates & answers)) / len(answers)

        if verbose:
            print
            print("Paper: " + doc.filename)
            print("Candidates: " + str(len(kw_candidates)))
            print("Recall: " + unicode(recall * 100) + "%")

        average_recall += recall
        total_kw_number += len(kw_candidates)
        total_docs += 1

    # An empty data directory would otherwise raise ZeroDivisionError;
    # report 0.0 recall in that case instead.
    if total_docs > 0:
        average_recall /= total_docs

    if verbose:
        print
        print("Total # of keywords: " + str(total_kw_number))
        print("Time elapsed: " + str(time.clock() - start_time))

    return average_recall