def make_sim_graph(akw, col_graph):
    """Build a keyword-similarity graph for the authors of one school.

    Args:
        akw: Mapping of author -> record. Each record must provide a
            "keywords" entry holding an iterable of hashable keywords.
        col_graph: Collaboration graph for the school. It is handed to
            add_sim_graph_node and queried with has_edge to flag pairs
            that are real coauthors.

    Returns:
        The similarity graph (an nx.Graph). Two authors are linked when
        the similarity ratio reported by tu.check_kw_sim exceeds 0.2.
        Each edge carries "num_collabs" (the similarity ratio) and
        "sim_kw" (the matched keywords), plus "areCoauthors" = True when
        the pair also shares an edge in col_graph.
    """
    sim_graph = nx.Graph()
    sim_threshold = 0.2

    # Materialise the items up front: dict views cannot be indexed on
    # Python 3, and we need positional access to pair each author only with
    # the authors that come after it.
    profiles = []
    for author, record in list(akw.items()):
        # De-duplicate the keywords. A set does NOT preserve the original
        # order; the code below only relies on the stemmed list lining up
        # index-for-index with this de-duplicated list.
        # NOTE(review): that alignment is assumed of tu.stem_word_list - confirm.
        keywords = list(set(record["keywords"]))
        # Stem once per author instead of once per pairing. A copy is passed,
        # as before, in case the helper modifies its argument.
        stemmed = tu.stem_word_list(keywords[:])
        profiles.append((author, keywords, stemmed))

    for pos, (author1, keywords1, stemmed1) in enumerate(profiles):
        add_sim_graph_node(author1, keywords1, sim_graph, col_graph)

        # Compare this author against every author that follows it.
        for author2, keywords2, stemmed2 in profiles[pos + 1:]:
            # NOTE(review): kept at the original call site (once per pairing)
            # because add_sim_graph_node's side effects are not visible here;
            # hoist it if it is confirmed to be idempotent.
            add_sim_graph_node(author2, keywords2, sim_graph, col_graph)

            sim = tu.check_kw_sim(stemmed1, stemmed2)
            ratio = sim[0]  # the similarity score

            # Only pairs above the threshold get an edge.
            if ratio > sim_threshold:
                # sim[1] holds indices into the longer of the two keyword
                # lists (the second list wins a tie, as before).
                if len(keywords1) > len(keywords2):
                    longest = keywords1
                else:
                    longest = keywords2
                matched_words = [longest[idx] for idx in sim[1]]

                # Keyword attributes are accepted by every NetworkX version,
                # whereas a positional attribute dict is rejected by 2.x+.
                sim_graph.add_edge(
                    author1,
                    author2,
                    num_collabs=ratio,
                    sim_kw=matched_words,
                )

                # Indicate whether the authors are actual coauthors.
                if col_graph.has_edge(author1, author2):
                    sim_graph[author1][author2]["areCoauthors"] = True

    return sim_graph
def test_check_sim(self):
    """check_kw_sim returns the similarity ratio and the matched indices."""
    shorter_list = ["java", "python", "django"]
    longer_list = ["python", "programming", "django", "graphs"]

    score_and_indices = tu.check_kw_sim(shorter_list, longer_list)

    # "python" and "django" are shared; they sit at positions 0 and 2 of the
    # longer list, and the expected ratio for this pair is 0.5.
    self.assertEqual(score_and_indices[0], 0.5)
    self.assertEqual(score_and_indices[1], [0, 2])