Example #1
from collections import Counter

# "file" is the project-specific data-access module used throughout these examples.

def get_number_of_entities(dataset):
    # Count the entities of a dataset that were successfully mapped to a Wikidata ID
    entities = file.get_entity2id(dataset)
    counter = 0
    for index, row in entities.iterrows():
        if row["wikiID"] != "-1":
            counter += 1

    print(f"{dataset}: {counter}")

def get_dataset_statistics(dataset):
    docs = file.get_sentences(dataset)
    labels = file.get_labels(dataset)
    num_words = len(file.get_vocab(dataset))

    # Each label line is tab-separated: field 1 holds the train/test split, field 2 the class label
    split_counter = Counter([t.split("\t")[1] for t in labels])
    num_labels = len(set([t.split("\t")[2] for t in labels]))

    # Keep only entities that were mapped to a Wikidata ID
    entities = file.get_entity2id(dataset)
    entities = entities[entities["wikiID"] != "-1"]

    # Print the statistics as a single LaTeX table row
    result = [dataset, len(docs), split_counter["train"], split_counter["test"], num_words, entities.shape[0], num_labels]
    result = [str(r) for r in result]
    print(" & ".join(result))
Example #3
def create_doc_id_mappings():
    vocab_id = file.get_entity2id()  # DataFrame with "word" and "wikiID" columns
    doc_nouns_norm = file.get_normalized_nouns()  # Array with all nouns per doc // must be split
    mappings = []
    for current_doc, doc in enumerate(doc_nouns_norm):
        if doc == "":
            continue
        doc_words = doc.split(" ")
        doc_map = vocab_id[vocab_id["word"].isin(doc_words)]
        doc_ids = doc_map["wikiID"].tolist()
        for word_id in doc_ids:
            if word_id != "-1":
                mappings.append([current_doc, word_id])
        # Every distinct noun in the document must have exactly one entry in entity2id
        assert len(set(doc_words)) == len(doc_ids)

    file.save_doc2id(mappings)
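For reference, a minimal sketch of the doc-to-entity structure this function saves, using made-up values; the column names "doc" and "wikiID" match how the get_doc2id results are queried in the later examples, but the loading step itself is an assumption:

import pandas as pd

# Assumed illustration of the saved structure: each row is [document index, Wikidata ID]
example_mappings = [[0, "Q42"], [0, "Q5"], [2, "Q42"]]
doc2id = pd.DataFrame(example_mappings, columns=["doc", "wikiID"])
print(doc2id)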
Example #4
import pandas as pd

def analyze_relations(dataset, id1, id2):
    doc2id = file.get_doc2id(dataset)
    vocab = file.get_entity2id(dataset)
    docs = file.get_sentences(dataset)
    triples = file.get_document_triples(dataset)
    filtered = file.get_filtered_triples(dataset)
    all_props = file.get_all_relations()

    # Pre-computed doc-to-doc connection between id1 and id2;
    # "detail" stores the individual relation IDs joined by "+"
    connections = triples[(triples["doc1"] == id1) & (triples["doc2"] == id2)]
    details = connections["detail"].tolist()[0].split("+")
    assert len(details) == connections["relations"].tolist()[0]

    doc1 = docs[id1]
    doc2 = docs[id2]

    doc1_ids = doc2id[doc2id["doc"] == id1]["wikiID"].tolist()
    doc2_ids = doc2id[doc2id["doc"] == id2]["wikiID"].tolist()

    # Deduplicated, order-preserving union of both documents' entity IDs
    all_entity_ids = doc1_ids + doc2_ids
    all_entity_ids = list(dict.fromkeys(all_entity_ids))

    entities_doc1 = []
    entities_doc2 = []

    # Filtered triples whose two entities span the two documents, in either direction
    result1 = filtered[(filtered["entity1"].isin(doc1_ids)) & (filtered["entity2"].isin(doc2_ids))]
    result2 = filtered[(filtered["entity2"].isin(doc1_ids)) & (filtered["entity1"].isin(doc2_ids))]
    merged_results = pd.concat([result1, result2]).reset_index(drop=True)
    assert merged_results.shape[0] == connections["relations"].tolist()[0]

    # Resolve each entity ID to its surface word(s) in the vocabulary
    for entity_id in doc1_ids:
        word = vocab[vocab["wikiID"] == entity_id]["word"].tolist()
        entities_doc1.append([word, entity_id])

    for entity_id in doc2_ids:
        word = vocab[vocab["wikiID"] == entity_id]["word"].tolist()
        entities_doc2.append([word, entity_id])

    count1 = 0
    count2 = 0

    # Wrap every entity word that literally occurs in the document text in LaTeX \hl{...}
    for w in entities_doc1:
        word = w[0][0]
        if word in doc1 and len(word) > 1:
            count1 += 1
            doc1 = doc1.replace(word, "\\hl{" + word + "}")

    for w in entities_doc2:
        word = w[0][0]
        if word in doc2 and len(word) > 1:
            count2 += 1
            doc2 = doc2.replace(word, "\\hl{" + word + "}")

    print(doc1)
    print("\n\n\n")
    print(doc2)

    print(entities_doc1)
    print(merged_results)

    # Resolve every merged triple into a human-readable (word, relation label, word) row
    labeled_array = []
    for index, row in merged_results.iterrows():
        entity1 = row["entity1"]
        entity2 = row["entity2"]
        rel = row["relations"]

        word1 = vocab[vocab["wikiID"] == entity1]["word"].tolist()[0]
        word2 = vocab[vocab["wikiID"] == entity2]["word"].tolist()[0]
        desc = all_props[all_props["ID"] == rel]["label"].tolist()[0]
        labeled_array.append([word1, desc, word2])

    labeled_df = pd.DataFrame(labeled_array)
    print(labeled_df)
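A minimal usage sketch; the dataset name and document IDs below are placeholders chosen for illustration:

# Hypothetical call: highlight and explain the shared relations between documents 0 and 17
analyze_relations("r8", 0, 17)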
Example #5
import pandas as pd

def get_vocab_ids():
    # Return the Wikidata IDs of all entities that were successfully mapped
    entity2id_df = file.get_entity2id()
    unmapped_entities = entity2id_df[entity2id_df["wikiID"] == "-1"].index
    entity2id_df.drop(unmapped_entities, inplace=True)
    return entity2id_df["wikiID"].to_numpy()

def get_detailed_relations(id1, id2, dataset):
    filtered_triples = file.get_filtered_triples(dataset)  # Triples
    all_entities = file.get_entity2id(dataset)
    all_relations = file.get_all_relations()
    ids = file.get_doc2id(dataset)

    doc_nouns_norm = file.get_normalized_nouns(dataset)  # Array with all nouns per doc // must be split
    nouns1 = doc_nouns_norm[id1]
    nouns2 = doc_nouns_norm[id2]
    all_nouns = [nouns1, nouns2]

    triples = []
    triples_detail = []
    indices = [id1, id2]
    for noun_index, doc in enumerate(all_nouns):
        doc_index = indices[noun_index]
        if doc == "":
            continue

        # All IDs of the normalized nouns in the current document
        doc_ids = ids[ids["doc"] == doc_index]["wikiID"].tolist()
        assert len(doc_ids) <= len(doc.split(" ")), f"{len(doc.split(' '))} vs. {len(doc_ids)} in {dataset}"

        # Graph edges leaving or entering the current document's entities
        triples_out = filtered_triples[filtered_triples["entity1"].isin(doc_ids)]
        triples_in = filtered_triples[filtered_triples["entity2"].isin(doc_ids)].copy()
        # Swap the column labels so incoming edges line up with outgoing ones after concatenation
        triples_in.columns = ["entity2", "relations", "entity1"]

        triples_total = pd.concat([triples_out, triples_in])

        doc_pointers = {}
        for index, row in triples_total.iterrows():
            entity1 = row["entity1"]
            relation = row["relations"]
            entity2 = row["entity2"]

            # Look up the documents in which entity2 appears
            pointer = ids[ids["wikiID"] == entity2]["doc"].tolist()
            assert entity1 in doc_ids

            for doc_id in pointer:
                # Only keep edges pointing forward to a later document (skips self-loops and duplicates)
                if doc_id <= doc_index:
                    continue

                # Only keep the edge between the two requested documents
                if not (doc_id == id2 and doc_index == id1):
                    continue

                triples_detail.append([entity1, relation, entity2, doc_index, doc_id])

                if doc_id in doc_pointers:
                    doc_pointers[doc_id].append(relation)
                else:
                    doc_pointers[doc_id] = [relation]

        for key in doc_pointers.keys():
            # Keep only document pairs that share at least two relations
            if len(doc_pointers[key]) > 1:
                triples.append([doc_index, key, len(doc_pointers[key]), "+".join(doc_pointers[key])])

    # Exactly one aggregated doc-to-doc triple is expected for the pair (id1, id2)
    assert len(triples) == 1
    doc_triple = triples[0]
    doc1 = doc_triple[0]
    doc2 = doc_triple[1]

    check = file.get_document_triples(dataset)
    selected = check[(check["doc1"] == doc1) & (check["doc2"] == doc2)]

    detailed_results = pd.DataFrame(triples_detail, columns=["entity1", "relation", "entity2", "doc1", "doc2"])

    assert detailed_results.shape[0] == selected["relations"].tolist()[0]
    assert detailed_results["relation"].tolist() == selected["detail"].tolist()[0].split("+")

    triples_readable = []
    for index, row in detailed_results.iterrows():
        assert row["doc1"] == id1 and row["doc2"] == id2

        word1 = all_entities[all_entities["wikiID"] == row["entity1"]]["word"].tolist()
        assert len(word1) > 0

        word2 = all_entities[all_entities["wikiID"] == row["entity2"]]["word"].tolist()
        assert len(word2) > 0

        relation_detail = all_relations[all_relations["ID"] == row["relation"]]["label"].tolist()
        assert len(relation_detail) == 1
        relation_detail = relation_detail[0]

        triples_readable.append([", ".join(word1), relation_detail, ", ".join(word2)])

    readable_triples = pd.DataFrame(triples_readable, columns=["entity1", "relation", "entity2"])

    is_equal, stats = get_relation_statistics(id1, id2, readable_triples, detailed_results, dataset)
    return is_equal, stats
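A minimal usage sketch; the document IDs and dataset name below are placeholders, and get_relation_statistics must be available from the surrounding module:

# Hypothetical call: compare the detailed relations between documents 3 and 7
is_equal, stats = get_detailed_relations(3, 7, "r8")
print(is_equal, stats)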