from politiquices.nlp.classifiers.entity_linking.entitly_linking_clf import EntityLinking


def test_fuzzy_match_one_candidate_substring_matches_case_1():
    expanded = ['Marques Mendes']
    candidates = [{
        'wiki': 'Q550243',
        'label': 'Luís Marques Mendes',
        'aliases': ['Luís Manuel Gonçalves Marques Mendes']
    }]
    assert EntityLinking.fuzzy_match(expanded[0], candidates[0]) is True


def test_fuzzy_match_one_candidate_clean_string_matches_case_1():
    expanded = ['José Pedro Aguiar-Branco']
    candidates = [{
        'wiki': 'Q1555060',
        'label': 'José Pedro Aguiar Branco',
        'aliases': ['José Pedro Correia de Aguiar Branco']
    }]
    assert EntityLinking.fuzzy_match(expanded[0], candidates[0]) is True


def test_fuzzy_match_one_candidate_substring_matches_case_3():
    expanded = ['Ribeiro e Castro']
    candidates = [{
        'wiki': 'Q1386216',
        'label': 'José Ribeiro e Castro',
        'aliases': ['José Duarte de Almeida Ribeiro e Castro']
    }]
    assert EntityLinking.fuzzy_match(expanded[0], candidates[0]) is True


def test_fuzzy_match_one_candidate_substring_matches_case_2():
    expanded = ['Morais Castro']
    candidates = [{
        'wiki': 'Q934980',
        'label': 'José Morais e Castro',
        'aliases': None
    }]
    assert EntityLinking.fuzzy_match(expanded[0], candidates[0]) is True


def test_fuzzy_match_one_candidate_substring_matches_case_4():
    expanded = ['António Marinho']
    candidates = [{
        'wiki': 'Q611182',
        'label': 'Marinho Pinto',
        'aliases': [
            'António Marinho Pinto',
            'António Marinho e Pinto',
            'António de Sousa Marinho e Pinto'
        ]
    }]
    assert EntityLinking.fuzzy_match(expanded[0], candidates[0]) is True
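
# The tests above all expect True. Below is a minimal sketch of matching logic
# consistent with them, assuming the real EntityLinking.fuzzy_match is more
# elaborate; fuzzy_match_sketch and its helper are hypothetical names, not the
# repo's implementation:
def fuzzy_match_sketch(expanded, candidate):
    def clean(text):
        # unify hyphens, drop stray quotes, collapse whitespace
        return " ".join(text.replace("-", " ").replace("”", "").split())

    surface = clean(expanded)
    targets = [candidate["label"]] + (candidate["aliases"] or [])
    for target in (clean(t) for t in targets):
        # contiguous substring: 'Ribeiro e Castro' in 'José Ribeiro e Castro'
        if surface in target:
            return True
        # token subset: 'Morais Castro' vs 'José Morais e Castro'
        if set(surface.split()) <= set(target.split()):
            return True
    return False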
def test_merge_substrings():
    # honorifics ('Dr.', 'doutor') are dropped in favour of the full name
    result = EntityLinking.merge_substrings(
        ['Luís Filipe Menezes', 'Dr. Menezes', 'doutor Menezes'])
    assert result == ['Luís Filipe Menezes']

    # stray quote characters are ignored
    result = EntityLinking.merge_substrings(
        ['Pedro Silva Pereira', '”Pedro Silva Pereira'])
    assert result == ['Pedro Silva Pereira']

    # shorter variants are merged into the longest surface string
    result = EntityLinking.merge_substrings(
        ['Luís Marques Mendes', 'Marques Mendes'])
    assert result == ['Luís Marques Mendes']

    result = EntityLinking.merge_substrings(
        ['Filipe Anacoreta Correia', 'Anacoreta Correia'])
    assert result == ['Filipe Anacoreta Correia']

    result = EntityLinking.merge_substrings(
        ['Freitas do Amaral', 'Diogo Freitas do Amaral'])
    assert result == ['Diogo Freitas do Amaral']

    result = EntityLinking.merge_substrings(['George Bush', 'George W. Bush'])
    assert result == ['George W. Bush']

    # typos and missing accents also merge into the canonical form
    result = EntityLinking.merge_substrings(['Víto Gaspar', 'Vítor Gaspar'])
    assert result == ['Vítor Gaspar']

    result = EntityLinking.merge_substrings(
        ['Jerónimo Sousa', 'Jerónimo de Sousa'])
    assert result == ['Jerónimo de Sousa']

    result = EntityLinking.merge_substrings(
        ['Nicolas Maduro', 'Nicolás Maduro'])
    assert result == ['Nicolás Maduro']
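
# A hedged sketch of the merging idea these tests exercise: when one surface
# string subsumes another, keep only the longest variant. This toy version
# (merge_substrings_sketch is a hypothetical name) covers only the
# token-containment and quote/honorific cases; the typo ('Víto Gaspar') and
# accent-only ('Nicolas Maduro') merges would additionally need edit-distance
# and accent-folding comparisons:
def merge_substrings_sketch(names):
    def tokens(name):
        # drop quotes and honorifics before comparing
        drop = {'Dr.', 'doutor'}
        return {t for t in name.replace("”", "").split() if t not in drop}

    merged = list(names)
    for a in names:
        for b in names:
            if a == b or a not in merged or b not in merged:
                continue
            ta, tb = tokens(a), tokens(b)
            # a is redundant if its tokens are a strict subset of b's,
            # or if the token sets tie and a is the noisier raw string
            if ta < tb or (ta == tb and len(a) > len(b)):
                merged.remove(a)
    return merged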
def main():
    args = parse_args()

    if args.publico:
        f_name = args.publico
    if args.chave:
        f_name = args.chave
    if args.arquivo:
        f_name = args.arquivo

    # load the relationship classification model
    print("Loading relationship classifier...")
    relationship_clf = joblib.load(MODELS + "SVC_2021_06_19_03_12.joblib")
    tf_idf_vectorizer = joblib.load(MODELS + "tf_idf_weights_2021_06_19_03_12.joblib")

    print("Loading NER classifier")
    ner = get_ner()

    # ToDo: load named-entities that should be ignored in the NER model itself
    with open('../classifiers/ner/names_ignore.txt', 'rt') as f_in:
        ner_ignore = [line.strip() for line in f_in]

    print("Loading relation direction classifier")
    direction_clf = DirectionClassifier()

    print("Loading Entity Linking")
    articles_db = ArticlesDB()
    mappings = {
        "Cavaco": "Aníbal Cavaco Silva",
        "Marques Mendes": "Luís Marques Mendes",
    }
    el = EntityLinking(ner, articles_db, mappings)

    # log everything for error analysis
    ner_ignored = jsonlines.open("ner_ignored.jsonl", mode="w")
    no_entities = jsonlines.open("titles_processed_no_entities.jsonl", mode="w")
    more_entities = jsonlines.open("titles_processed_more_entities.jsonl", mode="w")
    processed = jsonlines.open("titles_processed.jsonl", mode="w")
    ner_linked = jsonlines.open("ner_linked.jsonl", mode="w")
    processing_errors = jsonlines.open("processing_errors.jsonl", mode="w")

    # helper that builds the textual context fed to the relationship
    # classifier (hoisted here from inside the loop)
    from politiquices.nlp.classifiers.relationship.train_clf_linear import get_text_tokens

    count = 0
    with open(f_name, 'rt') as f_in:
        for line in f_in:
            if args.publico:
                entry = line.split('\t')
                date, url, title = entry[0], entry[1], entry[2]
            elif args.arquivo or args.chave:
                entry = json.loads(line)
                title = entry["title"]
                url = entry["linkToArchive"]
                date = entry["tstamp"]

            count += 1
            if count % 1000 == 0:
                print(count)

            try:
                cleaned_title = clean_title_re(title)
            except Exception as e:
                print(e, title)
                continue

            # named-entity recognition
            persons = ner.tag(cleaned_title)

            # ignore certain 'person' entities
            # ToDo: move this to the ner object
            if any(person in persons for person in ner_ignore):
                ner_ignored.write({"title": cleaned_title, "entities": persons})
                continue

            if len(persons) <= 1:
                no_entities.write({"title": cleaned_title, "entities": persons})
                continue

            if len(persons) > 2:
                more_entities.write({"title": cleaned_title, "entities": persons})
                continue

            # entity linking
            entity1_wiki = el.entity_linking(persons[0], url)
            entity2_wiki = el.entity_linking(persons[1], url)

            # relationship extraction
            labels = ['opposes', 'other', 'supports']
            sample = {'title': cleaned_title, 'ent1': persons[0], 'ent2': persons[1]}
            try:
                textual_context = get_text_tokens([sample], tokenized=True)
            except TypeError:
                processing_errors.write(sample)
                continue

            tf_idf_weights = tf_idf_vectorizer.transform(textual_context)
            predicted_probs = relationship_clf.predict_proba(tf_idf_weights)
            rel_type_scores = {
                label: float(pred)
                for label, pred in zip(labels, predicted_probs[0])
            }
            pred_rel = max(rel_type_scores, key=rel_type_scores.get)

            if pred_rel != 'other':
                # detect relationship direction
                pred, pattern, context, pos_tags = direction_clf.detect_direction(
                    cleaned_title, persons[0], persons[1])
                pred_rel = pred.replace("rel", pred_rel)

            result = {
                "title": cleaned_title,
                "entities": persons,
                "ent_1": entity1_wiki,
                "ent_2": entity2_wiki,
                "scores": rel_type_scores,
                "pred_rel": pred_rel,
                "url": url,
                "date": date,
            }

            if entity1_wiki and entity2_wiki:
                processed.write(result)
                ner_linked.write({"ner": persons[0], "wiki": result['ent_1'], "url": url})
                ner_linked.write({"ner": persons[1], "wiki": result['ent_2'], "url": url})
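
# If the script ends here, it presumably closes with the usual entry-point
# guard; the module file name in the usage sketch below is an assumption:
#
#   python process_titles.py --arquivo titles.jsonl
#
if __name__ == "__main__":
    main()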
from collections import defaultdict

from sklearn.metrics import accuracy_score

from politiquices.nlp.classifiers.entity_linking.entitly_linking_clf import EntityLinking
from politiquices.nlp.data_sources.articles_db import ArticlesDB
from politiquices.nlp.extraction_pipeline.extract_relationships import get_ner
from politiquices.nlp.utils.utils import read_ground_truth, write_iterator_to_file

mappings = {
    "Cavaco": "Aníbal Cavaco Silva",
    "Marques Mendes": "Luís Marques Mendes",
}

articles_db = ArticlesDB()
ner = get_ner()
el = EntityLinking(ner, articles_db, mappings)

all_ent_surface_string = []
ent_surface_string_with_wiki = []
ent_surface_string_without_wiki = []
ent_true = []
ent_pred = []
freqs = defaultdict(list)


def evaluate_one(entity_str, entity_id, url):
    res = el.entity_linking(entity_str, url)
    # use the string 'None' so that accuracy_score() can handle unlinked entities
    true = entity_id.split("/")[-1] if entity_id else 'None'
    pred = res['wiki_id'] if res else 'None'
    ent_true.append(true)
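
# The snippet cuts off inside evaluate_one(); presumably it also records the
# prediction (ent_pred.append(pred)). Below is a hedged sketch of how the
# accumulated lists would then yield an overall score; evaluate_all_sketch and
# the ground-truth record keys are assumptions, not the repo's API:
def evaluate_all_sketch(ground_truth):
    for sample in ground_truth:  # e.g. dicts with 'ent1', 'ent1_id', 'url'
        evaluate_one(sample['ent1'], sample['ent1_id'], sample['url'])
        evaluate_one(sample['ent2'], sample['ent2_id'], sample['url'])
    # requires evaluate_one to append to both ent_true and ent_pred
    print("entity linking accuracy:", accuracy_score(ent_true, ent_pred))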