def create_phrase_doc_id_map(df, tokenizer, phrase_id_map):
    """Map each detected phrase to the set of document indices that contain it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a "text" column holding each document's raw text.
    tokenizer : object
        Fitted tokenizer exposing a ``word_index`` dict (word -> integer id).
    phrase_id_map : dict
        Maps phrase string -> integer id.

    Returns
    -------
    dict
        phrase -> set of ``df`` row indices in which the phrase occurs.
    """
    # Inverted lookups needed by detect_phrase: id -> word, id -> phrase.
    index_word = {idx: w for w, idx in tokenizer.word_index.items()}
    id_phrase_map = {pid: ph for ph, pid in phrase_id_map.items()}

    phrase_docid = {}
    for i, row in df.iterrows():
        # detect_phrase is a project-level helper defined elsewhere; it
        # returns the phrases found in this document's text.
        phrases = detect_phrase(row["text"], tokenizer, index_word, id_phrase_map, i)
        for ph in phrases:
            # setdefault replaces the original bare `except:` which could
            # silently mask unrelated errors, not just the missing-key case.
            phrase_docid.setdefault(ph, set()).add(i)
    return phrase_docid
# Example #2
# 0
def get_graph_metapaths(df, tokenizer, id_phrase_map):
    """Build a heterogeneous StellarGraph over docs, phrases, authors and years.

    Nodes of each type get a single dummy feature ``[-1]``; node ids are
    "doc<i>", "phrase<i>", "author<i>", "year<i>" over consecutive id ranges
    produced by the ``make_*_map`` helpers. Edges connect each document to the
    phrases detected in its abstract, its authors, and its publication year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have "abstract", "authors" (comma-separated string) and "year"
        columns.
    tokenizer : object
        Fitted tokenizer exposing a ``word_index`` dict (word -> integer id).
    id_phrase_map : dict
        Maps integer phrase id -> phrase string.

    Returns
    -------
    tuple
        (StellarGraph, list of metapaths), each metapath a doc-X-doc triple.
    """
    # Inverted lookup needed by detect_phrase: id -> word.
    index_word = {idx: w for w, idx in tokenizer.word_index.items()}

    # `pid` instead of the original `id`, which shadowed the builtin.
    existing_fnusts = {"fnust" + str(pid) for pid in id_phrase_map}

    doc_index = ["doc" + str(i) for i in range(len(df))]
    doc_nodes = IndexedArray(np.array([[-1]] * len(df)), index=doc_index)

    # Phrase node ids start where the doc ids end; each make_*_map helper
    # returns the next free node id so the ranges are contiguous.
    fnust_id, id_fnust, fnust_graph_node_count = make_phrases_map(df, tokenizer, index_word, id_phrase_map)
    phrase_index = ["phrase" + str(i) for i in range(len(df), fnust_graph_node_count)]
    phrase_nodes = IndexedArray(np.array([[-1]] * len(phrase_index)), index=phrase_index)
    # NOTE(review): diagnostic left in place intentionally — reports how many
    # known phrases were not assigned a graph node; remove once verified.
    print(len(existing_fnusts - set(fnust_id.keys())))

    author_id, id_author, auth_graph_node_count = make_authors_map(df, fnust_graph_node_count)
    author_index = ["author" + str(i) for i in range(fnust_graph_node_count, auth_graph_node_count)]
    author_nodes = IndexedArray(np.array([[-1]] * len(author_index)), index=author_index)

    year_id, id_year, year_graph_node_count = make_years_map(df, auth_graph_node_count)
    year_index = ["year" + str(i) for i in range(auth_graph_node_count, year_graph_node_count)]
    year_nodes = IndexedArray(np.array([[-1]] * len(year_index)), index=year_index)

    source_nodes_list = []
    target_nodes_list = []
    for i, row in df.iterrows():
        # doc -> phrase edges for every phrase detected in the abstract.
        phrases = detect_phrase(row["abstract"], tokenizer, index_word, id_phrase_map, i)
        for ph in phrases:
            source_nodes_list.append("doc" + str(i))
            target_nodes_list.append("phrase" + str(fnust_id[ph]))

        # doc -> author edges; skip empty fragments from the comma split.
        for auth in row["authors"].split(","):
            if not auth:
                continue
            source_nodes_list.append("doc" + str(i))
            target_nodes_list.append("author" + str(author_id[auth]))

        # doc -> year edge (exactly one per document).
        source_nodes_list.append("doc" + str(i))
        target_nodes_list.append("year" + str(year_id[row["year"]]))

    edges = pd.DataFrame({
        "source": source_nodes_list,
        "target": target_nodes_list
    })

    graph = StellarGraph({"doc": doc_nodes, "phrase": phrase_nodes, "author": author_nodes, "year": year_nodes}, edges)
    # Symmetric doc-centric metapaths for random-walk based embedding methods.
    metapaths = [
        ["doc", "phrase", "doc"],
        ["doc", "author", "doc"],
        ["doc", "year", "doc"],
    ]
    return graph, metapaths