def create_phrase_doc_id_map(df, tokenizer, phrase_id_map):
    """Map each detected phrase to the set of document row indices containing it.

    Args:
        df: DataFrame with a ``"text"`` column holding one document per row.
        tokenizer: fitted tokenizer exposing ``word_index`` (word -> int id).
        phrase_id_map: mapping of phrase string -> phrase id.

    Returns:
        dict mapping phrase -> set of ``df`` row indices in which it was detected.
    """
    # Invert the tokenizer vocabulary (id -> word) for detect_phrase.
    index_word = {idx: word for word, idx in tokenizer.word_index.items()}
    # Invert phrase_id_map (id -> phrase) for detect_phrase.
    id_phrase_map = {pid: phrase for phrase, pid in phrase_id_map.items()}

    phrase_docid = {}
    for i, row in df.iterrows():
        abstract_str = row["text"]
        phrases = detect_phrase(abstract_str, tokenizer, index_word, id_phrase_map, i)
        for ph in phrases:
            # setdefault replaces the original bare ``except:`` grouping,
            # which could silently mask unrelated errors.
            phrase_docid.setdefault(ph, set()).add(i)
    return phrase_docid
def get_graph_metapaths(df, tokenizer, id_phrase_map):
    """Build a heterogeneous doc/phrase/author/year graph and its metapaths.

    Nodes carry placeholder ``[-1]`` feature vectors — only the graph
    structure matters downstream. Edges connect each document to the
    phrases detected in its abstract, its authors, and its year.

    Args:
        df: DataFrame with ``"abstract"``, ``"authors"`` (comma-separated
            string), and ``"year"`` columns.
        tokenizer: fitted tokenizer exposing ``word_index`` (word -> int id).
        id_phrase_map: mapping of phrase id -> phrase string.

    Returns:
        Tuple ``(graph, metapaths)`` where ``graph`` is a StellarGraph and
        ``metapaths`` is the list of doc-X-doc metapath templates.
    """
    # Invert the tokenizer vocabulary (id -> word) for detect_phrase.
    index_word = {idx: word for word, idx in tokenizer.word_index.items()}

    # Known "fnust<id>" labels; used only for the diagnostic print below.
    # NOTE: loop variable renamed from ``id``, which shadowed the builtin.
    existing_fnusts = {"fnust" + str(pid) for pid in id_phrase_map}

    doc_index = ["doc" + str(i) for i in range(len(df))]
    doc_nodes = IndexedArray(np.array([[-1]] * len(df)), index=doc_index)

    fnust_id, id_fnust, fnust_graph_node_count = make_phrases_map(
        df, tokenizer, index_word, id_phrase_map
    )
    # Phrase node ids continue the numbering after the document nodes.
    phrase_index = ["phrase" + str(i) for i in range(len(df), fnust_graph_node_count)]
    phrase_nodes = IndexedArray(np.array([[-1]] * len(phrase_index)), index=phrase_index)
    # Diagnostic: how many known fnust labels were never detected in any doc.
    print(len(existing_fnusts - set(fnust_id.keys())))

    author_id, id_author, auth_graph_node_count = make_authors_map(df, fnust_graph_node_count)
    author_index = ["author" + str(i) for i in range(fnust_graph_node_count, auth_graph_node_count)]
    author_nodes = IndexedArray(np.array([[-1]] * len(author_index)), index=author_index)

    year_id, id_year, year_graph_node_count = make_years_map(df, auth_graph_node_count)
    year_index = ["year" + str(i) for i in range(auth_graph_node_count, year_graph_node_count)]
    year_nodes = IndexedArray(np.array([[-1]] * len(year_index)), index=year_index)

    source_nodes_list = []
    target_nodes_list = []
    for i, row in df.iterrows():
        # doc -> phrase edges for every phrase detected in the abstract.
        phrases = detect_phrase(row["abstract"], tokenizer, index_word, id_phrase_map, i)
        for ph in phrases:
            source_nodes_list.append("doc" + str(i))
            target_nodes_list.append("phrase" + str(fnust_id[ph]))
        # doc -> author edges; skip empty fragments produced by the split.
        for auth in row["authors"].split(","):
            if len(auth) == 0:
                continue
            source_nodes_list.append("doc" + str(i))
            target_nodes_list.append("author" + str(author_id[auth]))
        # doc -> year edge.
        source_nodes_list.append("doc" + str(i))
        target_nodes_list.append("year" + str(year_id[row["year"]]))

    edges = pd.DataFrame({
        "source": source_nodes_list,
        "target": target_nodes_list
    })
    graph = StellarGraph(
        {"doc": doc_nodes, "phrase": phrase_nodes, "author": author_nodes, "year": year_nodes},
        edges,
    )
    metapaths = [
        ["doc", "phrase", "doc"],
        ["doc", "author", "doc"],
        ["doc", "year", "doc"],
    ]
    return graph, metapaths