Example #1
0
def calculate_recall_for_kw_candidates(data_dir, recreate_ontology=False, verbose=False):
    """
    Generate keyword candidates for files in a given directory
    and compute their recall in reference to ground truth answers
    :param data_dir: directory with .txt and .key files
    :param recreate_ontology: boolean flag for recreating the ontology
    :param verbose: whether to print computation times

    :return average_recall: float, 0.0 if the directory holds no documents
    """
    average_recall = 0.0
    total_kw_number = 0

    ontology = get_ontology(recreate=recreate_ontology)
    docs = get_documents(data_dir)
    considered_keywords = set(get_keywords())
    total_docs = 0

    # time.time() instead of time.clock(): the latter was deprecated and
    # removed in Python 3.8 (wall-clock elapsed time is what is reported).
    start_time = time.time()
    for doc in docs:
        kw_candidates = {kw.get_canonical_form() for kw
                         in generate_keyword_candidates(doc, ontology)}

        answers = get_answers_for_doc(doc.filename, data_dir, filtered_by=considered_keywords)

        # float() guards against Python 2 integer division, which would
        # silently truncate every partial recall to 0.
        recall = 1 if not answers else len(kw_candidates & answers) / float(len(answers))
        if verbose:
            print("")
            print("Paper: " + doc.filename)
            print("Candidates: " + str(len(kw_candidates)))
            print("Recall: " + str(recall * 100) + "%")

        average_recall += recall
        total_kw_number += len(kw_candidates)
        total_docs += 1

    # Avoid ZeroDivisionError when the directory contains no documents.
    if total_docs:
        average_recall /= total_docs

    if verbose:
        print("")
        print("Total # of keywords: " + str(total_kw_number))
        print("Time elapsed: " + str(time.time() - start_time))

    return average_recall
def remove_not_considered_keywords(candidates):
    """
    Drop every candidate whose canonical form is not among the keywords
    that we are allowed to consider.
    :param candidates: set of KeywordTokens

    :return: filtered set of KeywordTokens
    """
    # A negative label count means "no filtering": keep every candidate.
    if NO_OF_LABELS < 0:
        return candidates
    allowed = set(get_keywords())
    return {token for token in candidates
            if token.get_canonical_form() in allowed}
Example #3
0
def test(
    testset_path,
    ontology=ONTOLOGY_PATH,
    model=MODEL_PATH,
    recreate_ontology=False,
    verbose=True,
):
    """
    Test the trained model on a set under a given path.
    :param testset_path: path to the directory with the test set
    :param ontology: path to the ontology, or an already-built Ontology object
    :param model: path where the model is pickled, or a loaded model object
    :param recreate_ontology: boolean flag whether to recreate the ontology
    :param verbose: whether to print computation times
        (NOTE(review): currently unused in this function body)

    :return: dict mapping each metric name (as produced by
        calculate_basic_metrics) to its mean over all test documents
    """
    # NOTE(review): Python 2 idioms — `unicode` and dict.iteritems() below
    # will raise NameError/AttributeError on Python 3.
    # Accept either a filesystem path or an already-loaded model.
    if type(model) in [str, unicode]:
        model = load_from_disk(model)

    # Likewise accept either a path or a ready Ontology object.
    if type(ontology) in [str, unicode]:
        ontology = get_ontology(path=ontology, recreate=recreate_ontology)

    keywords = get_keywords()
    # Map each keyword to its column index in the label matrix.
    keyword_indices = {kw: i for i, kw in enumerate(keywords)}

    # Probe with a dummy ranking just to learn which metric names exist,
    # then prepare one accumulator list per metric.
    all_metrics = calculate_basic_metrics([range(5)]).keys()
    metrics_agg = {m: [] for m in all_metrics}

    # Stream documents one at a time to keep memory bounded.
    for doc in get_documents(testset_path, as_generator=True):
        x, answers, kw_vector = build_test_matrices(
            [doc],
            model,
            testset_path,
            ontology,
        )

        y_true = build_y_true(answers, keyword_indices, doc.doc_id)

        # Predict
        # NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0;
        # .values / .to_numpy() is the modern equivalent — confirm pandas version.
        ranking = model.scale_and_predict(x.as_matrix())

        # Reorder the ground-truth row by descending predicted rank.
        y_pred = y_true[0][ranking[::-1]]

        metrics = calculate_basic_metrics([y_pred])

        # Accumulate this document's metrics into the per-metric lists.
        for k, v in metrics.iteritems():
            metrics_agg[k].append(v)

    # Average each metric over all documents.
    return {k: np.mean(v) for k, v in metrics_agg.iteritems()}
Example #4
0
def generate_keyword_candidates(document, ontology):
    """
    Produce the set of KeywordToken candidates for a document.

    :param document: Document object containing the text that we generate
    generate keywords from
    :param ontology: Ontology object on which we match the keywords
    :return:
    """
    # With a small label space, every known keyword becomes a candidate.
    if NO_OF_LABELS <= 500:
        tokens = set()
        for index, label in enumerate(get_keywords()):
            tokens.add(KeywordToken(index,
                                    canonical_label=label,
                                    parsed_label=ontology.parse_label(label)))
        return tokens

    if STRATEGY == 'ENSEMBLE':
        return (generate_subgraph_candidates(document, ontology)
                | generate_ngram_candidates(document, ontology))
    if STRATEGY == 'NGRAMS':
        return generate_ngram_candidates(document, ontology)
    if STRATEGY == 'SUBGRAPH':
        return generate_subgraph_candidates(document, ontology)
    raise ValueError("Unknown STRATEGY = " + STRATEGY)
def generate_keyword_candidates(document, ontology):
    """
    Build keyword candidates for a document by matching it against the ontology.

    :param document: Document object containing the text that we generate
    generate keywords from
    :param ontology: Ontology object on which we match the keywords
    :return:
    """
    # Small label space: candidates are simply all known keywords.
    if NO_OF_LABELS <= 500:
        return {
            KeywordToken(position,
                         canonical_label=label,
                         parsed_label=ontology.parse_label(label))
            for position, label in enumerate(get_keywords())
        }

    # Dispatch on the configured candidate-generation strategy.
    strategies = {
        'NGRAMS': lambda: generate_ngram_candidates(document, ontology),
        'SUBGRAPH': lambda: generate_subgraph_candidates(document, ontology),
        'ENSEMBLE': lambda: (generate_subgraph_candidates(document, ontology)
                             | generate_ngram_candidates(document, ontology)),
    }
    maker = strategies.get(STRATEGY)
    if maker is None:
        raise ValueError("Unknown STRATEGY = " + STRATEGY)
    return maker()
Example #6
0
def calculate_recall_for_kw_candidates(data_dir,
                                       recreate_ontology=False,
                                       verbose=False):
    """
    Generate keyword candidates for files in a given directory
    and compute their recall in reference to ground truth answers
    :param data_dir: directory with .txt and .key files
    :param recreate_ontology: boolean flag for recreating the ontology
    :param verbose: whether to print computation times

    :return average_recall: float, 0.0 if the directory holds no documents
    """
    average_recall = 0.0
    total_kw_number = 0

    ontology = get_ontology(recreate=recreate_ontology)
    docs = get_documents(data_dir)
    considered_keywords = set(get_keywords())
    total_docs = 0

    # time.time() rather than time.clock(): the latter was deprecated and
    # removed in Python 3.8 (wall-clock elapsed time is what is reported).
    start_time = time.time()
    for doc in docs:
        kw_candidates = {
            kw.get_canonical_form()
            for kw in generate_keyword_candidates(doc, ontology)
        }

        answers = get_answers_for_doc(doc.filename,
                                      data_dir,
                                      filtered_by=considered_keywords)

        # float() guards against Python 2 integer division, which would
        # silently truncate every partial recall to 0.
        recall = 1 if not answers else (len(kw_candidates & answers)
                                        / float(len(answers)))
        if verbose:
            print("")
            print("Paper: " + doc.filename)
            print("Candidates: " + str(len(kw_candidates)))
            print("Recall: " + str(recall * 100) + "%")

        average_recall += recall
        total_kw_number += len(kw_candidates)
        total_docs += 1

    # Avoid ZeroDivisionError when the directory contains no documents.
    if total_docs:
        average_recall /= total_docs

    if verbose:
        print("")
        print("Total # of keywords: " + str(total_kw_number))
        print("Time elapsed: " + str(time.time() - start_time))

    return average_recall