# Example #1 (score: 0)
def extract_sentences_for_names_injection(titles, number_of_sentences,
                                          characters_lists_dir_path,
                                          novels_texts_dir_path,
                                          training_set_dir):
    """Extract sentences mentioning replaceable character names, one file per novel.

    The total budget *number_of_sentences* is split evenly across *titles*;
    for each title, up to that many matching sentences are sampled at random
    and written to ``<training_set_dir>\\extracted_sentences\\<title>``.

    :param titles: novel titles, also used as file names
    :param number_of_sentences: total number of sentences to extract overall
    :param characters_lists_dir_path: directory holding one characters list per title
    :param novels_texts_dir_path: directory holding one plain-text novel per title
    :param training_set_dir: output root directory
    """
    # Integer budget per novel; // replaces the original float division
    # that was later truncated with int().
    sentences_per_novel = number_of_sentences // len(titles)
    nlp = spacy.load("en_core_web_sm")

    for title in titles:
        characters = read_file_to_list(characters_lists_dir_path + title)
        novel_text = read_file(novels_texts_dir_path + title)
        names_to_be_replaced = get_names_to_be_replaced(characters)
        doc = nlp(novel_text)

        # Keep only sentences that mention at least one replaceable name.
        potential_sentences = [
            sentence.text for sentence in doc.sents
            if any(name in sentence.text for name in names_to_be_replaced)
        ]

        # random.sample already returns a list (no wrapper comprehension
        # needed); cap k at the number of available sentences.
        selected_sentences = random.sample(
            potential_sentences,
            k=min(sentences_per_novel, len(potential_sentences)))
        test_sample = "\n".join(selected_sentences)

        write_text_to_file(
            training_set_dir + "\\extracted_sentences\\" + title, test_sample)
def main(titles_path, not_recognized_named_entities_person_file_path, novels_texts_file_path, sentences_per_entity, training_set_dir):
    """Build and save an annotated NER training set for every listed novel.

    For each title read from *titles_path*, collects its unrecognized
    entities, builds a raw training set, annotates it, and dumps the result
    as JSON to ``<training_set_dir><title>``.
    """
    for title in read_file_to_list(titles_path):
        entities = get_not_recognized_entities(
            title, not_recognized_named_entities_person_file_path)
        raw_set = create_ner_training_set(
            title, novels_texts_file_path, entities,
            sentences_per_entity, training_set_dir)
        annotated = annotate_training_set(raw_set, entities)
        with open(training_set_dir + title, 'w') as result:
            json.dump(annotated, result)
        print("One novel done!")
def metrics_per_novel(titles_path, gold_standard_path, result_path, stats_path):
    """Compute per-novel stats, then print each novel's metrics table.

    First pass creates and pickles the stats for every title; second pass
    loads each pickle and prints it in org-table format.
    """
    novel_titles = read_file_to_list(titles_path)

    for novel in novel_titles:
        create_and_save_stats(novel, gold_standard_path, result_path, stats_path)

    for novel in novel_titles:
        table = load_from_pickle(novel, stats_path)
        print("*****************")
        print(novel)
        # First row of the pickled table is the header.
        print(tabulate.tabulate(table[1:], headers=table[0], tablefmt='orgtbl'))
        print("*****************")
def prepare_training_data(titles_path, training_set_1_dir_path,
                          training_set_2_path):
    """Merge spaCy training examples from two sources into one list.

    Loads one JSON file per title from *training_set_1_dir_path*, then
    appends the examples from the single file *training_set_2_path*.

    :return: combined list of spaCy training examples
    """
    train_data = [
        example
        for title in read_file_to_list(titles_path)
        for example in json_to_spacy_train_data(training_set_1_dir_path + title)
    ]
    train_data.extend(json_to_spacy_train_data(training_set_2_path))
    return train_data
# Example #5 (score: 0)
def main(titles_path, characters_lists_dir_path, novels_texts_dir_path,
         sentences_per_common_name, training_set_dir):
    """Create a common-names NER training set by name injection.

    Extracts enough sentences per novel to cover *sentences_per_common_name*
    occurrences of each common name, collects the replaceable character
    names, injects common names into the sentences, and dumps the resulting
    training set as JSON to ``<training_set_dir>training_set.json``.
    """
    titles = read_file_to_list(titles_path)
    common_names = get_common_names()
    number_sentences_to_extracted = sentences_per_common_name * len(
        common_names)
    extract_sentences_for_names_injection(titles,
                                          number_sentences_to_extracted,
                                          characters_lists_dir_path,
                                          novels_texts_dir_path,
                                          training_set_dir)
    sentences = []
    names_to_be_replaced = []
    for title in titles:
        data = read_sentences_from_file(training_set_dir +
                                        "\\extracted_sentences\\" + title)
        sentences.extend(data)
        characters = read_file_to_list(characters_lists_dir_path + title)
        names_to_be_replaced.extend(get_names_to_be_replaced(characters))

    # The second return value (updated sentences) is not used here,
    # so it is deliberately discarded.
    training_set, _ = inject_common_names(
        common_names, sentences, names_to_be_replaced)
    with open(training_set_dir + "training_set.json", 'w+') as result:
        json.dump(training_set, result)
def ner_metrics(titles_path, gold_standard_path, result_path, stats_path):
    """Compute NER stats per novel and overall, then print a LaTeX table.

    First pass creates and pickles per-title NER stats; second pass loads
    each pickle and collects one summary row per title, followed by an
    overall-results row.
    """
    titles = read_file_to_list(titles_path)
    for title in titles:
        create_and_save_stats(title, gold_standard_path, result_path, stats_path, ner=True)

    metrics_table = []
    headers = ["Novel title", "precision", "recall", "F-measure", "support"]

    for title in titles:
        metrics = load_from_pickle(title, stats_path)
        # Use the + operator instead of calling list.__add__ directly.
        metrics_table.append([title] + [m[0] for m in metrics])

    metrics = create_overall_stats(titles, gold_standard_path, result_path, stats_path, ner=True)
    metrics_table.append(["*** overall results ***"] + [m[0] for m in metrics])
    print(tabulate.tabulate(metrics_table, headers=headers, tablefmt='latex'))
# Example #7 (score: 0)
def main(titles_path,
         names_gold_standard_dir_path,
         testing_data_dir_path,
         generated_data_dir,
         ner_model_dir_path=None):
    """Run the NER model over each novel's test data and save annotations.

    Generates generalized data for all titles, then for each title runs the
    NER model on its test sentences and dumps the result as JSON under
    ``<generated_data_dir>ner_model_annotated\\``.

    :param ner_model_dir_path: optional path to a trained NER model;
        semantics of None are decided by ``test_ner``.
    """
    titles = read_file_to_list(titles_path)
    generate_generalized_data(titles, names_gold_standard_dir_path,
                              generated_data_dir)

    for title in titles:
        test_data = read_sentences_from_file(testing_data_dir_path + title)
        ner_result = test_ner(test_data, ner_model_dir_path)

        path = generated_data_dir + "ner_model_annotated\\" + title

        # exist_ok avoids the check-then-create race of the original
        # os.path.exists() + os.makedirs() pair.
        os.makedirs(os.path.dirname(path), exist_ok=True)

        with open(path, 'w+') as result:
            json.dump(ner_result, result)
# Example #8 (score: 0)
def run_matcher(titles_path,
                model_path,
                characters_lists_dir_path,
                texts_dir_path,
                results_dir,
                precision=75,
                tests_variant=True):
    """Match character names against novel texts for every listed title.

    :param precision: matching precision threshold passed to NamesMatcher
    :param tests_variant: when True, use the per-novel test data; otherwise
        use the complete novel data.
    """
    names_matcher = NamesMatcher(precision, model_path)
    titles = read_file_to_list(titles_path)
    # Initialize so the final print does not raise NameError when the
    # titles list is empty.
    matches_table = []
    for title in titles:
        if tests_variant:
            characters, text = get_test_data_for_novel(
                title, characters_lists_dir_path, texts_dir_path)
        else:
            characters, text = get_complete_data_about_novel(
                title, characters_lists_dir_path, texts_dir_path)
        matches_table = names_matcher.match_names_for_text(characters,
                                                           text,
                                                           results_dir,
                                                           title,
                                                           tests_variant,
                                                           save_ratios=True)

    # NOTE(review): matches_table is overwritten every iteration, so only
    # the LAST novel's table is printed — confirm this is intended.
    print(tabulate(matches_table, tablefmt='orgtbl'))
# Example #9 (score: 0)
def main(titles_path, novels_texts_dir_path, number_of_sentences,
         generated_data_dir):
    """Generate sample test data for every novel listed in *titles_path*."""
    novel_titles = read_file_to_list(titles_path)
    generate_sample_test_data(novel_titles, number_of_sentences,
                              novels_texts_dir_path, generated_data_dir)
def get_not_recognized_entities(title, not_recognized_named_entities_person_file_path):
    """Return the list of unrecognized person entities stored for *title*."""
    return read_file_to_list(
        not_recognized_named_entities_person_file_path + title)
# Example #11 (score: 0)
def get_common_names(common_names_path=COMMON_NAMES_FILE):
    """Return the list of common names read from *common_names_path*."""
    return read_file_to_list(common_names_path)
def overall_metrics(titles_path, gold_standard_path, result_path, stats_path):
    """Compute overall stats across all listed novels and print them.

    The first row of the stats table is used as the header row.
    """
    novel_titles = read_file_to_list(titles_path)
    table = create_overall_stats(novel_titles, gold_standard_path,
                                 result_path, stats_path)
    print(tabulate.tabulate(table[1:], headers=table[0], tablefmt='orgtbl'))
# Example #13 (score: 0)
def get_complete_data_about_novel(title, characters_lists_dir_path,
                                  novels_texts_dir_path):
    """Load the characters list and the full text for *title*.

    :return: (characters, novel_text) tuple
    """
    return (read_file_to_list(characters_lists_dir_path + title),
            read_file(novels_texts_dir_path + title))
# Example #14 (score: 0)
def get_test_data_for_novel(title, characters_lists_dir_path,
                            testing_sets_dir_path):
    """Load the characters list and the test sentences for *title*.

    :return: (characters, text) tuple, where text is the sentence list
        read from the testing-set file.
    """
    return (read_file_to_list(characters_lists_dir_path + title),
            read_sentences_from_file(testing_sets_dir_path + title))