# Standard-library and third-party imports used by the functions below.
# Project-specific helpers (read_file_to_list, read_file, write_text_to_file,
# get_names_to_be_replaced, inject_common_names, NamesMatcher, etc.) are
# assumed to be imported from elsewhere in the repository.
import json
import os
import random

import spacy
import tabulate


def extract_sentences_for_names_injection(titles, number_of_sentences, characters_lists_dir_path, novels_texts_dir_path, training_set_dir):
    """Sample sentences that mention character names from each novel and save them per title."""
    sentences_per_novel = int(number_of_sentences / len(titles))
    nlp = spacy.load("en_core_web_sm")
    for title in titles:
        characters = read_file_to_list(characters_lists_dir_path + title)
        novel_text = read_file(novels_texts_dir_path + title)
        names_to_be_replaced = get_names_to_be_replaced(characters)
        doc = nlp(novel_text)
        # Keep only sentences that contain at least one of the character names.
        potential_sentences = [sentence.text for sentence in doc.sents
                               if any(name in sentence.text for name in names_to_be_replaced)]
        selected_sentences = random.sample(potential_sentences,
                                           k=min(sentences_per_novel, len(potential_sentences)))
        test_sample = "\n".join(selected_sentences)
        write_text_to_file(training_set_dir + "\\extracted_sentences\\" + title, test_sample)
def main(titles_path, not_recognized_named_entities_person_file_path, novels_texts_file_path, sentences_per_entity, training_set_dir):
    """Build and annotate an NER training set for every novel, then dump it as JSON."""
    titles = read_file_to_list(titles_path)
    for title in titles:
        named_entities = get_not_recognized_entities(title, not_recognized_named_entities_person_file_path)
        ner_training_set = create_ner_training_set(title, novels_texts_file_path, named_entities,
                                                   sentences_per_entity, training_set_dir)
        training_set = annotate_training_set(ner_training_set, named_entities)
        with open(training_set_dir + title, 'w') as result:
            json.dump(training_set, result)
        print("One novel done!")
def metrics_per_novel(titles_path, gold_standard_path, result_path, stats_path):
    titles = read_file_to_list(titles_path)
    for title in titles:
        create_and_save_stats(title, gold_standard_path, result_path, stats_path)
    for title in titles:
        metrics = load_from_pickle(title, stats_path)
        print("*****************")
        print(title)
        print(tabulate.tabulate(metrics[1:], headers=metrics[0], tablefmt='orgtbl'))
        print("*****************")
def prepare_training_data(titles_path, training_set_1_dir_path, training_set_2_path):
    titles = read_file_to_list(titles_path)
    train_data = []
    for title in titles:
        data_slice = json_to_spacy_train_data(training_set_1_dir_path + title)
        train_data.extend(data_slice)
    data_second_set = json_to_spacy_train_data(training_set_2_path)
    train_data.extend(data_second_set)
    return train_data
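
# Hedged usage sketch (not part of the original code): illustrates how the
# (text, {"entities": [...]}) pairs returned by prepare_training_data could be
# consumed by a spaCy 2.x-style NER update loop. The blank "en" pipeline, the
# "PERSON" label, the dropout rate and the iteration count are illustrative
# assumptions, not settings taken from this project.
def train_ner_sketch(train_data, n_iter=10):
    nlp = spacy.blank("en")          # empty English pipeline (assumption)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    ner.add_label("PERSON")          # assumed entity label
    optimizer = nlp.begin_training()
    for _ in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            nlp.update([text], [annotations], drop=0.5, sgd=optimizer, losses=losses)
    return nlp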
def main(titles_path, characters_lists_dir_path, novels_texts_dir_path, sentences_per_common_name, training_set_dir):
    """Extract sentences containing character names and inject common names to build a training set."""
    titles = read_file_to_list(titles_path)
    common_names = get_common_names()
    number_of_sentences_to_extract = sentences_per_common_name * len(common_names)
    extract_sentences_for_names_injection(titles, number_of_sentences_to_extract,
                                          characters_lists_dir_path, novels_texts_dir_path,
                                          training_set_dir)
    sentences = []
    names_to_be_replaced = []
    for title in titles:
        data = read_sentences_from_file(training_set_dir + "\\extracted_sentences\\" + title)
        sentences.extend(data)
        characters = read_file_to_list(characters_lists_dir_path + title)
        names_to_be_replaced.extend(get_names_to_be_replaced(characters))
    training_set, updated_sentences = inject_common_names(common_names, sentences, names_to_be_replaced)
    with open(training_set_dir + "training_set.json", 'w+') as result:
        json.dump(training_set, result)
def ner_metrics(titles_path, gold_standard_path, result_path, stats_path):
    titles = read_file_to_list(titles_path)
    for title in titles:
        create_and_save_stats(title, gold_standard_path, result_path, stats_path, ner=True)
    metrics_table = []
    headers = ["Novel title", "precision", "recall", "F-measure", "support"]
    for title in titles:
        metrics = load_from_pickle(title, stats_path)
        metrics_table.append([title] + [m[0] for m in metrics])
    metrics = create_overall_stats(titles, gold_standard_path, result_path, stats_path, ner=True)
    metrics_table.append(["*** overall results ***"] + [m[0] for m in metrics])
    print(tabulate.tabulate(metrics_table, headers=headers, tablefmt='latex'))
def main(titles_path, names_gold_standard_dir_path, testing_data_dir_path, generated_data_dir, ner_model_dir_path=None):
    """Run the NER model on each novel's test sentences and save the annotations as JSON."""
    titles = read_file_to_list(titles_path)
    generate_generalized_data(titles, names_gold_standard_dir_path, generated_data_dir)
    for title in titles:
        test_data = read_sentences_from_file(testing_data_dir_path + title)
        ner_result = test_ner(test_data, ner_model_dir_path)
        path = generated_data_dir + "ner_model_annotated\\" + title
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        with open(path, 'w+') as result:
            json.dump(ner_result, result)
def run_matcher(titles_path, model_path, characters_lists_dir_path, texts_dir_path, results_dir, precision=75, tests_variant=True):
    """Match character names against each novel's text and print the resulting matches table."""
    names_matcher = NamesMatcher(precision, model_path)
    titles = read_file_to_list(titles_path)
    for title in titles:
        if tests_variant:
            characters, text = get_test_data_for_novel(title, characters_lists_dir_path, texts_dir_path)
        else:
            characters, text = get_complete_data_about_novel(title, characters_lists_dir_path, texts_dir_path)
        matches_table = names_matcher.match_names_for_text(characters, text, results_dir, title,
                                                           tests_variant, save_ratios=True)
        print(tabulate.tabulate(matches_table, tablefmt='orgtbl'))
def main(titles_path, novels_texts_dir_path, number_of_sentences, generated_data_dir):
    titles = read_file_to_list(titles_path)
    generate_sample_test_data(titles, number_of_sentences, novels_texts_dir_path, generated_data_dir)
def get_not_recognized_entities(title, not_recognized_named_entities_person_file_path):
    named_entities = read_file_to_list(not_recognized_named_entities_person_file_path + title)
    return named_entities
def get_common_names(common_names_path=COMMON_NAMES_FILE):
    common_names = read_file_to_list(common_names_path)
    return common_names
def overall_metrics(titles_path, gold_standard_path, result_path, stats_path):
    titles = read_file_to_list(titles_path)
    metrics = create_overall_stats(titles, gold_standard_path, result_path, stats_path)
    print(tabulate.tabulate(metrics[1:], headers=metrics[0], tablefmt='orgtbl'))
def get_complete_data_about_novel(title, characters_lists_dir_path, novels_texts_dir_path):
    characters = read_file_to_list(characters_lists_dir_path + title)
    novel_text = read_file(novels_texts_dir_path + title)
    return characters, novel_text
def get_test_data_for_novel(title, characters_lists_dir_path, testing_sets_dir_path):
    characters = read_file_to_list(characters_lists_dir_path + title)
    text = read_sentences_from_file(testing_sets_dir_path + title)
    return characters, text