from collections import Counter

import pandas as pd

import file  # assumed: project-local helper module providing dataset I/O


def get_number_of_entities(dataset):
    entities = file.get_entity2id(dataset)
    # Count only entities that were successfully mapped to a Wikidata ID
    counter = 0
    for index, row in entities.iterrows():
        if row["wikiID"] != "-1":
            counter += 1
    print(f"{dataset}: {counter}")
def get_dataset_statistics(dataset):
    docs = file.get_sentences(dataset)
    labels = file.get_labels(dataset)
    words = len(file.get_vocab(dataset))
    # Label lines are tab-separated: column 1 holds the split, column 2 the class
    split_counter = Counter([t.split("\t")[1] for t in labels])
    num_classes = len(set([t.split("\t")[2] for t in labels]))
    entities = file.get_entity2id(dataset)
    entities = entities[entities["wikiID"] != "-1"]
    result = [dataset, len(docs), split_counter["train"], split_counter["test"], words, entities.shape[0], num_classes]
    # Print as a LaTeX table row
    result = [str(r) for r in result]
    print(" & ".join(result))
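# Hedged usage sketch: the dataset identifier below is a placeholder assumption,
# not a name taken from this repository; both helpers print to stdout.
def _demo_dataset_statistics():
    for dataset in ["example_dataset"]:  # assumed dataset name
        get_number_of_entities(dataset)  # "<dataset>: <mapped entity count>"
        get_dataset_statistics(dataset)  # one " & "-joined LaTeX table row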
def create_doc_id_mappings():
    vocab_id = file.get_entity2id()  # DataFrame with "word" and "wikiID" columns
    doc_nouns_norm = file.get_normalized_nouns()  # Array with all nouns per doc; must be split
    mappings = []
    for current_doc, doc in enumerate(doc_nouns_norm):
        if doc == "":
            continue
        doc_words = doc.split(" ")
        doc_map = vocab_id[vocab_id["word"].isin(doc_words)]
        doc_ids = doc_map["wikiID"].tolist()
        for word_id in doc_ids:
            if word_id != "-1":
                mappings.append([current_doc, word_id])
        assert len(set(doc_words)) == len(doc_ids)
    file.save_doc2id(mappings)
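# Minimal self-contained sketch (toy data, assumed "word"/"wikiID" layout) of the
# lookup used in create_doc_id_mappings: the assert there only holds when every
# unique document noun matches exactly one vocabulary row.
def _demo_doc_id_lookup():
    vocab_id = pd.DataFrame({"word": ["cat", "dog"], "wikiID": ["Q146", "-1"]})
    doc_words = "cat dog".split(" ")
    doc_ids = vocab_id[vocab_id["word"].isin(doc_words)]["wikiID"].tolist()
    assert len(set(doc_words)) == len(doc_ids)  # 2 unique nouns, 2 matched rows
    mappings = [[0, word_id] for word_id in doc_ids if word_id != "-1"]
    print(mappings)  # [[0, 'Q146']] -- the unmapped "-1" entry is skipped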
def analyze_relations(dataset, id1, id2):
    doc2id = file.get_doc2id(dataset)
    vocab = file.get_entity2id(dataset)
    docs = file.get_sentences(dataset)
    triples = file.get_document_triples(dataset)
    filtered = file.get_filtered_triples(dataset)
    all_props = file.get_all_relations()

    connections = triples[(triples["doc1"] == id1) & (triples["doc2"] == id2)]
    details = connections["detail"].tolist()[0].split("+")
    assert len(details) == connections["relations"].tolist()[0]

    doc1 = docs[id1]
    doc2 = docs[id2]
    doc1_ids = doc2id[doc2id["doc"] == id1]["wikiID"].tolist()
    doc2_ids = doc2id[doc2id["doc"] == id2]["wikiID"].tolist()
    # Unique entity IDs across both documents, order preserved
    all_relations = doc1_ids + doc2_ids
    all_relations = list(dict.fromkeys(all_relations))

    entities_doc1 = []
    entities_doc2 = []
    # Relations can point in either direction between the two documents
    result1 = filtered[(filtered["entity1"].isin(doc1_ids)) & (filtered["entity2"].isin(doc2_ids))]
    result2 = filtered[(filtered["entity2"].isin(doc1_ids)) & (filtered["entity1"].isin(doc2_ids))]
    merged_results = pd.concat([result1, result2]).reset_index(drop=True)
    assert merged_results.shape[0] == connections["relations"].tolist()[0]

    for relation in doc1_ids:
        word = vocab[vocab["wikiID"] == relation]["word"].tolist()
        entities_doc1.append([word, relation])
    for relation in doc2_ids:
        word = vocab[vocab["wikiID"] == relation]["word"].tolist()
        entities_doc2.append([word, relation])

    count1 = 0
    count2 = 0
    # Highlight matched entity words in the raw documents (LaTeX \hl macro)
    for w in entities_doc1:
        word = w[0][0]
        if word in doc1 and len(word) > 1:
            count1 += 1
            doc1 = doc1.replace(word, "\\hl{" + word + "}")
    for w in entities_doc2:
        word = w[0][0]
        if word in doc2 and len(word) > 1:
            count2 += 1
            doc2 = doc2.replace(word, "\\hl{" + word + "}")

    print(doc1)
    print("\n\n\n")
    print(doc2)
    print(entities_doc1)
    print(merged_results)

    # Resolve entity IDs and relation IDs to human-readable labels
    labeled_array = []
    for index, row in merged_results.iterrows():
        entity1 = row["entity1"]
        entity2 = row["entity2"]
        rel = row["relations"]
        word1 = vocab[vocab["wikiID"] == entity1]["word"].tolist()[0]
        word2 = vocab[vocab["wikiID"] == entity2]["word"].tolist()[0]
        desc = all_props[all_props["ID"] == rel]["label"].tolist()[0]
        labeled_array.append([word1, desc, word2])
    labeled_df = pd.DataFrame(labeled_array)
    print(labeled_df)
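# Tiny runnable sketch of the order-preserving deduplication idiom used above:
# list(dict.fromkeys(...)) keeps first occurrences in order, unlike set().
def _demo_ordered_dedup():
    ids = ["Q1", "Q2", "Q1", "Q3"]
    assert list(dict.fromkeys(ids)) == ["Q1", "Q2", "Q3"]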
def get_vocab_ids():
    entity2id_df = file.get_entity2id()
    unmapped_entities = entity2id_df[entity2id_df["wikiID"] == "-1"].index
    entity2id_df.drop(unmapped_entities, inplace=True)
    return entity2id_df["wikiID"].to_numpy()
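# Self-contained sketch (toy data, assumed column layout) of the filtering in
# get_vocab_ids: rows with wikiID == "-1" are unmapped entities and are dropped.
def _demo_vocab_id_filtering():
    df = pd.DataFrame({"word": ["cat", "xyz"], "wikiID": ["Q146", "-1"]})
    df = df.drop(df[df["wikiID"] == "-1"].index)
    print(df["wikiID"].to_numpy())  # ['Q146']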
def get_detailed_relations(id1, id2, dataset):
    filtered_triples = file.get_filtered_triples(dataset)  # Triples
    all_entities = file.get_entity2id(dataset)
    all_relations = file.get_all_relations()
    ids = file.get_doc2id(dataset)
    doc_nouns_norm = file.get_normalized_nouns(dataset)  # Array with all nouns per doc; must be split

    nouns1 = doc_nouns_norm[id1]
    nouns2 = doc_nouns_norm[id2]
    all_nouns = [nouns1, nouns2]
    triples = []
    triples_detail = []
    indices = [id1, id2]

    for noun_index, doc in enumerate(all_nouns):
        doc_index = indices[noun_index]
        if doc == "":
            continue

        # All IDs of the normalized nouns in the current document
        doc_ids = ids[ids["doc"] == doc_index]["wikiID"].tolist()
        assert len(doc_ids) <= len(doc.split(" ")), f"{len(doc.split(' '))} vs. {len(doc_ids)} in {dataset}"

        # Graph edges pointing to other entities
        triples_out = filtered_triples[filtered_triples["entity1"].isin(doc_ids)]
        triples_in = filtered_triples[filtered_triples["entity2"].isin(doc_ids)]
        # Relabel incoming edges so "entity1" is always the entity in this doc
        triples_in.columns = ["entity2", "relations", "entity1"]
        triples_total = pd.concat([triples_out, triples_in])

        doc_pointers = {}
        for index, row in triples_total.iterrows():
            entity1 = row["entity1"]
            relation = row["relations"]
            entity2 = row["entity2"]
            # Look in which documents entity2 appears
            pointer = ids[ids["wikiID"] == entity2]["doc"].tolist()
            assert entity1 in doc_ids
            for doc_id in pointer:
                # Ignore doc2doc edges to doc itself
                if doc_id <= doc_index:
                    continue
                # Only keep the edge between the two requested documents
                if not (doc_id == id2 and doc_index == id1):
                    continue
                triples_detail.append([entity1, relation, entity2, doc_index, doc_id])
                if doc_id in doc_pointers:
                    doc_pointers[doc_id].append(relation)
                else:
                    doc_pointers[doc_id] = [relation]

        for key in doc_pointers.keys():
            # Filter out all docs with length below 2
            if len(doc_pointers[key]) > 1:
                triples.append([doc_index, key, len(doc_pointers[key]), "+".join(doc_pointers[key])])

    assert len(triples) == 1
    triples = triples[0]
    doc1 = triples[0]
    doc2 = triples[1]

    # Cross-check against the precomputed document triples
    check = file.get_document_triples(dataset)
    selected = check[(check["doc1"] == doc1) & (check["doc2"] == doc2)]
    detailed_results = pd.DataFrame(triples_detail, columns=["entity1", "relation", "entity2", "doc1", "doc2"])
    assert detailed_results.shape[0] == selected["relations"].tolist()[0]
    assert detailed_results["relation"].tolist() == selected["detail"].tolist()[0].split("+")

    # Resolve IDs to human-readable words and relation labels
    triples_readable = []
    for index, row in detailed_results.iterrows():
        assert row["doc1"] == id1 and row["doc2"] == id2
        word1 = all_entities[all_entities["wikiID"] == row["entity1"]]["word"].tolist()
        assert len(word1) > 0
        word2 = all_entities[all_entities["wikiID"] == row["entity2"]]["word"].tolist()
        assert len(word2) > 0
        relation_detail = all_relations[all_relations["ID"] == row["relation"]]["label"].tolist()
        assert len(relation_detail) == 1
        relation_detail = relation_detail[0]
        triples_readable.append([", ".join(word1), relation_detail, ", ".join(word2)])

    readable_triples = pd.DataFrame(triples_readable, columns=["entity1", "relation", "entity2"])
    is_equal, stats = get_relation_statistics(id1, id2, readable_triples, detailed_results, dataset)
    return is_equal, stats
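# Hedged usage sketch: arguments are placeholder assumptions; the function returns
# whether the readable and raw triples agree, plus summary statistics from the
# (externally defined) get_relation_statistics helper.
def _demo_detailed_relations():
    is_equal, stats = get_detailed_relations(0, 1, "example_dataset")  # assumed IDs/name
    print(is_equal, stats)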