def test_generic_weighted_projected_graph(self):
    """Check projections of a five-node path onto nodes 0, 2 and 4.

    Three projections are exercised in turn: the undirected path with a
    custom weight function, the same path with the default weight, and a
    directed path with the default weight.  Each must keep exactly the
    nodes ``[0, 2, 4]`` and join consecutive ones with weight 1.
    """
    kept_nodes = [0, 2, 4]
    expected_edges = [(0, 2, {'weight': 1}), (2, 4, {'weight': 1})]

    def shared(unbrs, vnbrs):
        # Weight = number of elements the two arguments have in common.
        common = unbrs & vnbrs
        return len(common)

    def check(projection):
        assert_equal(sorted(projection.nodes()), kept_nodes)
        assert_equal(projection.edges(data=True), expected_edges)

    undirected = nx.path_graph(5)
    check(bipartite.generic_weighted_projected_graph(
        undirected, kept_nodes, weight_function=shared))
    check(bipartite.generic_weighted_projected_graph(undirected, kept_nodes))

    directed = nx.DiGraph()
    directed.add_path(list(range(5)))
    check(bipartite.generic_weighted_projected_graph(directed, kept_nodes))
def test_generic_weighted_projected_graph_simple(self):
    """Project a five-node path onto nodes 0, 2, 4 and verify the result.

    The projection is checked with a custom ``(G, u, v)`` weight function,
    with the default weight, and for a directed path.
    """
    def shared(G, u, v):
        # Count the neighbours that u and v have in common.
        return len(set(G[u]).intersection(G[v]))

    expected_edges = [(0, 2, {'weight': 1}), (2, 4, {'weight': 1})]

    def verify(projection):
        assert_nodes_equal(list(projection), [0, 2, 4])
        assert_edges_equal(list(projection.edges(data=True)), expected_edges)

    path = nx.path_graph(5)
    verify(bipartite.generic_weighted_projected_graph(
        path, [0, 2, 4], weight_function=shared))
    verify(bipartite.generic_weighted_projected_graph(path, [0, 2, 4]))

    dipath = nx.DiGraph()
    dipath.add_path(list(range(5)))
    verify(bipartite.generic_weighted_projected_graph(dipath, [0, 2, 4]))
def misc_1():
    """Print a step-by-step trace of projecting a weighted K(2,2) graph.

    Builds ``nx.complete_bipartite_graph(2, 2)``, numbers its edges
    1, 2, 3, ... through the ``weight`` attribute, then prints the projection
    onto nodes ``[0, 1]`` twice: once with the default weight and once with
    the verbose ``my_weight`` function defined below.
    """
    def jaccard(G, u, v):
        # Not called below; kept from the original scratch code.
        first = set(G[u])
        second = set(G[v])
        return float(len(first & second)) / len(first | second)

    def my_weight(G, u, v, weight='weight'):
        # Sum the edge weights from u and from v to each shared neighbour,
        # printing every intermediate value along the way.
        total = 0
        print('@@@@@@@@@@@@@@@@')
        print(G)
        print(G[u])
        print(type(u))
        print(G.edges())
        common = set(G[u]) & set(G[v])
        print(common)
        for nbr in common:
            print('{{{{{{{{{{{{{{{{{{')
            print(nbr)
            total += G[u][nbr][weight] + G[v][nbr][weight]
            print('w=', total)
        return total

    B = nx.complete_bipartite_graph(2, 2)
    print('iiiiiiiiiiiiiiiiii')
    for edge in B.edges(data=True):
        print(edge)

    # The original notes that assigning through the yielded data dict has the
    # same effect as assigning B[u][v]['weight'] directly.
    for position, (_, _, attrs) in enumerate(B.edges(data=True), 1):
        print('///////////////')
        attrs['weight'] = position
    for edge in B.edges(data=True):
        print(edge)

    G = bipartite.generic_weighted_projected_graph(B, [0, 1])
    print(G.edges(data=True))
    for edge in G.edges(data=True):
        print(edge)

    G = bipartite.generic_weighted_projected_graph(
        B, [0, 1], weight_function=my_weight)
    print('Final value')
    print(G.edges(data=True))
def splitBipartiteGexf(inputGexf, outputGexfPath):
    """Split a bipartite GEXF graph into its two weighted projections.

    Reads ``inputGexf``, converts it to an undirected graph, computes the two
    node sets with ``bipartite.sets`` and writes one projection per set to
    ``<outputGexfPath>/<input base name>.x.gexf`` and ``.y.gexf``.

    Progress is printed to stdout and the module-level ``jr`` mapping
    receives the keys ``input_gexf``, ``outputGexfPath``, ``numOfNodes``,
    ``numOfEdges`` and ``output_gexf``.

    :param inputGexf: path of the GEXF file to read
    :param outputGexfPath: directory in which the two output files are created
    :return: None; failures are reported through ``throwError``
    """
    outputGexfPath = outputGexfPath + os.sep
    jr["input_gexf"] = inputGexf
    jr["outputGexfPath"] = outputGexfPath

    # output files: same base name as the input, inside the output directory
    outputBase = os.path.join(dirname(outputGexfPath),
                              basename(splitext(inputGexf)[0]))
    xgexf = outputBase + ".x.gexf"
    ygexf = outputBase + ".y.gexf"

    # `except Exception` (not a bare `except`) so that Ctrl-C and SystemExit
    # are not mistaken for an unreadable file.
    try:
        graph = nx.readwrite.gexf.read_gexf(inputGexf)
    except Exception:
        throwError("unable to read gexf file")
        return

    # bug in networkx, we need to make the directed graph as undirected
    graph = graph.to_undirected()

    jr["numOfNodes"] = len(graph.nodes())
    jr["numOfEdges"] = len(graph.edges())

    X, Y = bipartite.sets(graph)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("biparte.sets...")
    print(X)
    print(Y)

    xgr = bipartite.generic_weighted_projected_graph(graph, X)
    print("biparte.xgr...")
    print(len(xgr.nodes()))
    print(len(xgr.edges()))
    try:
        nx.readwrite.gexf.write_gexf(xgr, xgexf)
    except Exception:
        throwError("unable to write file, path:'" + xgexf + "'")
        return

    ygr = bipartite.generic_weighted_projected_graph(graph, Y)
    print("biparte.ygr...")
    print(len(ygr.nodes()))
    print(len(ygr.edges()))
    try:
        nx.readwrite.gexf.write_gexf(ygr, ygexf)
    except Exception:
        # NOTE(review): unlike the two handlers above there is no `return`
        # here, so execution continues unless throwError itself raises or
        # exits -- confirm this is intended.
        throwError("unable to write file, path:'" + ygexf + "'")

    jr['output_gexf'] = [xgexf, ygexf]

    # "%s" formatting reproduces the output of the former
    # `print "label", value` statements.
    print("nodes in X %s" % (xgr.nodes(),))
    print("edges in X %s" % (list(xgr.edges()),))
    print("nodes in Y %s" % (ygr.nodes(),))
def splitBipartiteGexf(inputGexf, outputGexfPath):
    """Write the two weighted projections of a bipartite GEXF graph.

    The input graph is read from ``inputGexf``, made undirected, and split
    with ``bipartite.sets``.  The projection onto each node set is saved as
    ``<input base name>.x.gexf`` / ``.y.gexf`` inside ``outputGexfPath``.

    Side effects: prints progress to stdout and stores ``input_gexf``,
    ``outputGexfPath``, ``numOfNodes``, ``numOfEdges`` and ``output_gexf``
    in the module-level ``jr`` mapping.

    :param inputGexf: path of the GEXF file to read
    :param outputGexfPath: directory that receives the two output files
    :return: None; failures are reported through ``throwError``
    """
    outputGexfPath = outputGexfPath + os.sep
    jr["input_gexf"] = inputGexf
    jr["outputGexfPath"] = outputGexfPath

    # output files share one stem, computed once
    stem = os.path.join(dirname(outputGexfPath),
                        basename(splitext(inputGexf)[0]))
    xgexf = stem + ".x.gexf"
    ygexf = stem + ".y.gexf"

    # Catch Exception rather than everything: a bare `except` would also
    # swallow KeyboardInterrupt and SystemExit.
    try:
        graph = nx.readwrite.gexf.read_gexf(inputGexf)
    except Exception:
        throwError("unable to read gexf file")
        return

    # bug in networkx, we need to make the directed graph as undirected
    graph = graph.to_undirected()

    jr["numOfNodes"] = len(graph.nodes())
    jr["numOfEdges"] = len(graph.edges())

    X, Y = bipartite.sets(graph)
    # print(<one value>) behaves identically under Python 2 and Python 3.
    print("biparte.sets...")
    print(X)
    print(Y)

    xgr = bipartite.generic_weighted_projected_graph(graph, X)
    print("biparte.xgr...")
    print(len(xgr.nodes()))
    print(len(xgr.edges()))
    try:
        nx.readwrite.gexf.write_gexf(xgr, xgexf)
    except Exception:
        throwError("unable to write file, path:'" + xgexf + "'")
        return

    ygr = bipartite.generic_weighted_projected_graph(graph, Y)
    print("biparte.ygr...")
    print(len(ygr.nodes()))
    print(len(ygr.edges()))
    try:
        nx.readwrite.gexf.write_gexf(ygr, ygexf)
    except Exception:
        # NOTE(review): no `return` follows this handler (the earlier ones
        # return) -- confirm whether throwError raises/exits on its own.
        throwError("unable to write file, path:'" + ygexf + "'")

    jr['output_gexf'] = [xgexf, ygexf]

    # Same text as the former `print "label", value` statements.
    print("nodes in X %s" % (xgr.nodes(),))
    print("edges in X %s" % (list(xgr.edges()),))
    print("nodes in Y %s" % (ygr.nodes(),))
def generic_weighted_projected_graph(self):
    """Plot and print the generic weighted projection of ``self.B``.

    The projection is taken onto the first node set returned by
    ``bipartite.sets(self.B)``; it is handed to ``self.plot_graph_2`` and its
    edge count and edges are printed.
    """
    node_sets = bipartite.sets(self.B)
    projection = bipartite.generic_weighted_projected_graph(
        self.B, node_sets[0])
    self.plot_graph_2(projection, 'generic_weighted_projected_graph')

    edge_count = projection.number_of_edges()
    print('generic_weighted_projected_graph:number of edges:', edge_count)
    print(projection.edges())
    print(list(projection.edges(data=True)))
def test_generic_weighted_projected_graph_simple(self):
    """Verify nodes and weights when projecting a path onto nodes 0, 2, 4.

    Runs the projection with a custom ``(G, u, v)`` weight function, with the
    default weight, and on a directed path; all three must give the same
    nodes and weight-1 edges.
    """
    def shared(G, u, v):
        # Number of neighbours common to u and v.
        u_nbrs, v_nbrs = set(G[u]), set(G[v])
        return len(u_nbrs & v_nbrs)

    targets = [0, 2, 4]
    want_edges = [(0, 2, {"weight": 1}), (2, 4, {"weight": 1})]

    graph = nx.path_graph(5)
    result = bipartite.generic_weighted_projected_graph(
        graph, targets, weight_function=shared)
    assert_equal(sorted(result.nodes()), [0, 2, 4])
    assert_equal(result.edges(data=True), want_edges)

    result = bipartite.generic_weighted_projected_graph(graph, targets)
    assert_equal(sorted(result.nodes()), [0, 2, 4])
    assert_equal(result.edges(data=True), want_edges)

    graph = nx.DiGraph()
    graph.add_path(list(range(5)))
    result = bipartite.generic_weighted_projected_graph(graph, targets)
    assert_equal(sorted(result.nodes()), [0, 2, 4])
    assert_equal(result.edges(data=True), want_edges)
def test_generic_weighted_projected_graph(self):
    """Verify the projection of a five-node path onto nodes 0, 2 and 4.

    The same expectations (nodes ``[0, 2, 4]``, two weight-1 edges) are
    asserted for a custom weight function, the default weight, and a
    directed input graph.
    """
    def shared(unbrs, vnbrs):
        # Size of the overlap between the two arguments.
        return len(unbrs.__and__(vnbrs))

    want_nodes = [0, 2, 4]
    want_edges = [(0, 2, {'weight': 1}), (2, 4, {'weight': 1})]

    path = nx.path_graph(5)
    projected = bipartite.generic_weighted_projected_graph(
        path, [0, 2, 4], weight_function=shared)
    assert_equal(sorted(projected.nodes()), want_nodes)
    assert_equal(projected.edges(data=True), want_edges)

    projected = bipartite.generic_weighted_projected_graph(path, [0, 2, 4])
    assert_equal(sorted(projected.nodes()), want_nodes)
    assert_equal(projected.edges(data=True), want_edges)

    path = nx.DiGraph()
    path.add_path(list(range(5)))
    projected = bipartite.generic_weighted_projected_graph(path, [0, 2, 4])
    assert_equal(sorted(projected.nodes()), want_nodes)
    assert_equal(projected.edges(data=True), want_edges)
def projected_graph(self):
    """Return the weighted projection of ``self.B`` onto its first node set.

    ``self.create_bipartite_graph()`` is called first when ``self.B`` is
    falsy.  Edge weights come from ``self.projection_weight``.
    """
    if not self.B:
        self.create_bipartite_graph()

    node_sets = bipartite.sets(self.B)
    return bipartite.generic_weighted_projected_graph(
        self.B,
        node_sets[0],
        weight_function=self.projection_weight,
    )
def project(graph, nodes):
    """
    Project the graph onto ``nodes``.

    Edge weights are computed by ``my_weight``; afterwards every node of the
    projection gets a ``"weight"`` attribute set to
    ``degrees(prj_graph, node)``.

    :param graph: graph to project
    :param nodes: nodes to project onto
    :return: the projected graph
    """
    prj_graph = bipartite.generic_weighted_projected_graph(
        graph, nodes, weight_function=my_weight)

    node_view = prj_graph.nodes
    for name in node_view:
        node_view[name]["weight"] = degrees(prj_graph, name)

    return prj_graph
def test_generic_weighted_projected_graph_custom(self):
    """Check custom weight functions on a weighted K(2,2) projection.

    The four edges of the complete bipartite graph get weights 1..4 in
    iteration order; the projection onto nodes 0 and 1 must then yield
    weight 1.0 with ``jaccard``, 10 with ``my_weight`` and 2 by default.
    """
    def jaccard(G, u, v):
        # |N(u) & N(v)| / |N(u) | N(v)|
        first, second = set(G[u]), set(G[v])
        overlap = len(first & second)
        return float(overlap) / len(first | second)

    def my_weight(G, u, v, weight='weight'):
        # Sum, over shared neighbours, of both incident edge weights
        # (a missing weight counts as 1).
        return sum(
            G.edge[u][nbr].get(weight, 1) + G.edge[v][nbr].get(weight, 1)
            for nbr in set(G[u]) & set(G[v])
        )

    B = nx.complete_bipartite_graph(2, 2)
    for position, (u, v) in enumerate(B.edges(), 1):
        B.edge[u][v]['weight'] = position

    cases = (
        ({'weight_function': jaccard}, 1.0),
        ({'weight_function': my_weight}, 10),
        ({}, 2),
    )
    for options, wanted in cases:
        G = bipartite.generic_weighted_projected_graph(B, [0, 1], **options)
        assert_edges_equal(G.edges(data=True), [(0, 1, {'weight': wanted})])
def test_generic_weighted_projected_graph_custom(self):
    """Project a weighted K(2,2) onto nodes 0, 1 with several weightings.

    Edge weights 1..4 are assigned in iteration order.  Expected projected
    weight: 1.0 for ``jaccard``, 10 for ``my_weight``, 2 for the default.
    """
    def jaccard(G, u, v):
        # Ratio of shared neighbours to all neighbours of u and v.
        nbrs_u = set(G[u])
        nbrs_v = set(G[v])
        return float(len(nbrs_u.intersection(nbrs_v))) / len(nbrs_u.union(nbrs_v))

    def my_weight(G, u, v, weight="weight"):
        # Add both incident edge weights for every shared neighbour; an edge
        # without the attribute contributes 1.
        total = 0
        shared_nbrs = set(G[u]) & set(G[v])
        for nbr in shared_nbrs:
            from_u = G.edge[u][nbr].get(weight, 1)
            from_v = G.edge[v][nbr].get(weight, 1)
            total += from_u + from_v
        return total

    B = nx.complete_bipartite_graph(2, 2)
    for index, (u, v) in enumerate(B.edges()):
        B.edge[u][v]["weight"] = index + 1

    by_jaccard = bipartite.generic_weighted_projected_graph(
        B, [0, 1], weight_function=jaccard)
    assert_equal(by_jaccard.edges(data=True), [(0, 1, {"weight": 1.0})])

    by_sum = bipartite.generic_weighted_projected_graph(
        B, [0, 1], weight_function=my_weight)
    assert_equal(by_sum.edges(data=True), [(0, 1, {"weight": 10})])

    by_default = bipartite.generic_weighted_projected_graph(B, [0, 1])
    assert_equal(by_default.edges(data=True), [(0, 1, {"weight": 2})])
def fold_network(network, nodes, mode='multi'):
    """
    Fold a two-mode (bipartite) network into a one-mode network.

    :param network: networkx.Graph
        Bipartite graph to be folded
    :param nodes: list of nodes
        The node set to keep
    :param mode: str, optional
        'multi' weights edges with ``multi_weight_function``,
        'single' weights edges with ``single_weight_function``.
        Defaults to 'multi'.
    :return: networkx.Graph
        The folded network, or None when ``mode`` is neither value
    """
    if mode == 'multi':
        weight_function = multi_weight_function
    elif mode == 'single':
        weight_function = single_weight_function
    else:
        # Unknown mode: nothing is projected.
        return None

    return bipartite.generic_weighted_projected_graph(
        network, nodes, weight_function=weight_function)
def project_graph(name='bipartite_reader_network.pickle', method="Count"):
    """
    Create the weighted projection of a stored bipartite graph and save it.

    The graph is loaded with ``read(name)`` and projected onto the nodes
    whose ``bipartite`` attribute is not 0 (the books, according to the
    original comments).  The result is saved as
    ``projection_graph_<method>.pickle`` and ``projection_graph_<method>.gml``.

    :param name: name passed to ``read`` to load the bipartite graph.
    :param method: how to weight the edges.
        "Count" uses ``generic_weighted_projected_graph`` (number of
        co-reads), "Collaboration" uses Newman's collaboration metric,
        "Overlap" uses the proportion of shared neighbours.
        "Average Weight" and "Divergence" are still todo and currently
        fall back to the collaboration projection.
    :return: A nx graph.
    :raises Exception: if the graph is not bipartite or ``method`` is not
        one of the names above.
    """
    print("Projecting Graph with {} method.".format(method))
    bi_graph = read(name)
    if not bipartite.is_bipartite(bi_graph):
        raise Exception("Projecting non-bipartite graphs is felony.")

    # Top nodes (users) are projected down onto the bottom nodes (books)
    top_nodes = set()
    for node, attrs in bi_graph.nodes(data=True):
        if attrs['bipartite'] == 0:
            top_nodes.add(node)
    bottom_nodes = set(bi_graph) - top_nodes

    # Pick the projection routine for the requested method
    if method == "Count":
        # Count the number of co-reads
        projector = bipartite.generic_weighted_projected_graph
    elif method == "Overlap":
        # Proportion of neighbors that are shared
        projector = bipartite.overlap_weighted_projected_graph
    elif method in ("Collaboration", "Average Weight", "Divergence"):
        # Newman's collaboration metric; the last two names are todo
        # placeholders that reuse it for now.
        projector = bipartite.collaboration_weighted_projected_graph
    else:
        raise Exception("{} is not a valid projection method".format(method))
    proj_graph = projector(bi_graph, bottom_nodes)

    # Save
    pickle_name = "projection_graph_{}.pickle".format(method)
    print("Saving " + pickle_name)
    overwrite(proj_graph, pickle_name)

    gml_name = "projection_graph_{}.gml".format(method)
    print("Saving " + gml_name)
    nx.write_gml(proj_graph, gml_name)

    return proj_graph
def creazioneProiezione(g):
    """
    Generate the projection onto users from the bipartite (User-Tag) graph.

    The bipartite graph is read from the GraphML file
    ``pathOutput + fileNameGraphML + ".graphml"``; nothing is done unless it
    is connected.  Results are printed, not returned.

    :param g: not used by the code shown here -- NOTE(review): confirm
    :type g: Graph
    :return: None
    """
    def edge_weight_sum(G, u, v, weight='weight'):
        # For every neighbour shared by u and v add both incident edge
        # weights (1 when the attribute is missing).
        return sum(
            G.edge[u][nbr].get(weight, 1) + G.edge[v][nbr].get(weight, 1)
            for nbr in set(G[u]) & set(G[v])
        )

    print("\nVado a creare prima il grafo bipartito e poi la proiezione sugli utenti!")

    # Load the graph through NetworkX
    B = nx.read_graphml(path=pathOutput + fileNameGraphML + ".graphml")
    if not nx.is_connected(B):
        return

    # NOTE(review): top_nodes holds (node, gender) tuples, so the set
    # difference below only removes nodes that are themselves such tuples --
    # confirm that bottom_nodes is meant to be computed this way.
    top_nodes = {
        (node, attrs["gender"])
        for node, attrs in B.nodes(data=True)
        if attrs['bipartite'] == 1
    }
    bottom_nodes = set(B) - top_nodes

    print("\nGrafo bipartito?: {}".format(nx.is_bipartite(B)))
    print("NODI: {}".format(list(top_nodes)[:10]))

    G = bipartite.generic_weighted_projected_graph(
        B, bottom_nodes, weight_function=edge_weight_sum)
    print("\nArchi: {}".format(G.edges(data=True)[:10]))
def main():
    """Build and export one co-inventor network per university.

    For every entry of ``UNIVERSITIES`` this issues a single GET request to
    ``BASE_URL`` for patents dated in ``[DATE_START, DATE_END)``, builds an
    inventor/patent bipartite graph, projects it onto the inventors, fills
    in gender attributes and writes the result to
    ``<cwd>/OUTPUT_FOLDER/<university name>.graphml``.
    """
    # initialize gender detector
    gender_detector = gender.Detector()

    # loop over universities
    for university in UNIVERSITIES:

        # format query
        params = {
            'q':
            '{"_and":[{"_gte":{"patent_date":"%s"}},{"_lt":{"patent_date":"%s"}},{"assignee_id":"%s"}]}'
            % (DATE_START, DATE_END, university["assignee_id"]),
            'f':
            '["patent_number","patent_date","patent_title","inventor_id","inventor_first_name","inventor_last_name", "assignee_organization", "cited_patent_number", "citedby_patent_number"]',
            'o':
            '{"per_page":%s}' % (PER_PAGE, )
        }

        # make api request
        # NOTE(review): only this one request is made per university; no
        # paging over further result pages is visible here.
        request = requests.get(BASE_URL, params=params)

        # save response as dict
        data = json.loads(request.text)

        # pull patent data
        patent_data = data["patents"]

        # pull response information (read but not used further below)
        page_count = data["count"]
        total_patent_count = data["total_patent_count"]

        # initialize containers to hold data
        INVENTORS = {}  # inventor_id -> attribute dict
        EDGES_2MODE = set()  # (inventor_id, patent_number) pairs

        # loop over patents to pull network data
        for d in patent_data:

            # extract data for each patent (only patent_number and inventors
            # are used below)
            patent_number = d["patent_number"]
            patent_date = d["patent_date"]
            patent_title = d["patent_title"]
            inventors = d["inventors"]
            assignees = d["assignees"]
            cited_patents = d["cited_patents"]
            citedby_patents = d["citedby_patents"]

            # loop over inventors
            for inventor in inventors:

                # save inventor data
                if inventor["inventor_id"] not in INVENTORS:

                    # get full name
                    inventor_full_name = "%s %s" % (
                        inventor["inventor_first_name"],
                        inventor["inventor_last_name"])

                    # get gender: the first whitespace-separated token of the
                    # first name that the detector classifies decides it, and
                    # the "mostly_" prefix is dropped
                    inventor_gender = None
                    for inventor_first_name_token in inventor[
                            "inventor_first_name"].split():
                        if gender_detector.get_gender(
                                inventor_first_name_token
                        ) in ("male", "mostly_male", "female",
                              "mostly_female") and inventor_gender is None:
                            inventor_gender = gender_detector.get_gender(
                                inventor_first_name_token).replace(
                                    "mostly_", "")
                    if inventor_gender is None:
                        inventor_gender = "UNKNOWN"
                    assert inventor_gender in ("male", "female", "UNKNOWN")

                    # add to dictionary
                    INVENTORS[inventor["inventor_id"]] = {
                        "inventor_first_name":
                        inventor["inventor_first_name"],
                        "inventor_last_name": inventor["inventor_last_name"],
                        "inventor_full_name": inventor_full_name,
                        "inventor_gender": inventor_gender
                    }

                # save edge data
                EDGES_2MODE.add((inventor["inventor_id"], patent_number))

        # create a bipartite graph in networkx
        # (bipartite=0 marks inventors, bipartite=1 marks patents)
        B = nx.Graph()
        B.add_nodes_from([n[0] for n in EDGES_2MODE], bipartite=0)
        B.add_nodes_from([n[1] for n in EDGES_2MODE], bipartite=1)
        B.add_edges_from(EDGES_2MODE)

        # project the network to a unipartite representation (inventors only)
        G = bipartite.generic_weighted_projected_graph(
            B, [n[0] for n in EDGES_2MODE])

        # add inventor attributes
        nx.set_node_attributes(G, INVENTORS)

        # set some graph attributes
        G.graph["assignee_id"] = university["assignee_id"]
        G.graph["name"] = university["name"]

        # get rid of node attributes we don't need
        # (presumably carried over from B by the projection -- TODO confirm)
        for node in G.nodes:
            del G.nodes[node]["bipartite"]

        # impute gender randomly in proportion to distribution in the network
        # gender_distribution holds one "male"/"female" entry per inventor of
        # known gender; random.choice raises IndexError if it is empty.
        nmale = len([
            i for i in G.nodes.data("inventor_gender") if i[1] == "male"
        ]) * ["male"]
        nfemale = len([
            i for i in G.nodes.data("inventor_gender") if i[1] == "female"
        ]) * ["female"]
        gender_distribution = nmale + nfemale
        for node in G.nodes(data=True):
            if G.nodes[node[0]]["inventor_gender"] == "UNKNOWN":
                G.nodes[node[0]]["inventor_gender"] = random.choice(
                    gender_distribution)

        # export the graph
        path = os.path.join(os.path.realpath('.'), OUTPUT_FOLDER,
                            "%s.graphml" % (G.graph["name"], ))
        nx.write_graphml(G, path)

        # print every node and check that no gender is left as "UNKNOWN"
        for node in G.nodes(data=True):
            print(G.graph["assignee_id"], G.graph["name"], node)
            assert node[1]["inventor_gender"] in ("male", "female")
# Split the (user, item) edge list into its two node sets.
users = set()
items = set()
for (u, v) in given_graph_edges:
    users.add(u)
    items.add(v)

#----------------------------------------
# creating a bipartite graph (node attribute named "bipartite" with values 0 or 1 is to identify the sets each node belongs to)
given_graph.add_nodes_from(list(users), bipartite=0)  #set of 'users' nodes
given_graph.add_nodes_from(list(items), bipartite=1)  #set of 'items' nodes
given_graph.add_edges_from(given_graph_edges)
# NOTE(review): the result of this check is discarded.
nx.is_bipartite(given_graph)

#-------------- Projected-Item-Item-Graph -------------------
Prog_graph = bipartite.generic_weighted_projected_graph(given_graph, items)

#----------------------------- Ground truth -------------
GT = defaultdict(
    list
)  #dictionary, key=User_id, value=list of items recommended to that user
g_t = list()  #list of tuples, (user_id,item_id)
# Each line is parsed as tab-separated integers.
with open(path1 + "/Part_2_1/dataset/Ground_Truth___UserID__ItemID.tsv") as f:
    for line in f:
        g_t.append(tuple(map(int, line.rstrip('\n').split('\t'))))

# NOTE(review): within this view the loop only computes `temp`; it
# presumably continues beyond the visible fragment.
for u, i in g_t:  #user_id, item_id in (user_id,item_id)
    temp = set()
    if u in GT.keys():
        temp = set(GT[u])  #set of items for user u in ground truth
# Read a headerless, tab-separated two-column edge list and build a bipartite
# graph from it: column 0 values get bipartite=0, column 1 values bipartite=1.
nodes = pd.read_csv(options.input_filename, sep='\t', header=None)
B = Graph()
for row in nodes.iterrows():
    # iterrows() yields (index, row) pairs, hence row[1][...]
    B.add_node(row[1][0], bipartite=0)
    B.add_node(row[1][1], bipartite=1)
    B.add_edge(row[1][0], row[1][1])

top_nodes = set(n for n, d in B.nodes(data=True) if d['bipartite'] == 0)
bottom_nodes = set(B) - top_nodes
top = list(top_nodes)
bottom = list(bottom_nodes)

print "Generating network projection"
# Project onto the column-0 nodes with the metric chosen on the command line.
# NOTE(review): if options.metric matches none of the branches, G is never
# assigned and the write below fails with a NameError.
if options.metric == "hypergeometric":
    G = bipartite.generic_weighted_projected_graph(
        B, top_nodes, weight_function=hypergeometric)  #HYPERGEOMETRIC
elif options.metric == "jaccard":
    G = bipartite.generic_weighted_projected_graph(
        B, top_nodes, weight_function=jaccard)  #Jaccard
elif options.metric == "PCC":
    G = bipartite.generic_weighted_projected_graph(
        B, top_nodes, weight_function=pcc_weight)  #PCC
elif options.metric == "simpson":
    G = bipartite.overlap_weighted_projected_graph(B,
                                                   top_nodes,
                                                   jaccard=False)  #Simpson

# Write the projection as a tab-delimited weighted edge list.
write_weighted_edgelist(G, options.output_filename, delimiter="\t")
print "Execution finished"
def get_reviewer_recommendation(repo_name,
                                access_token,
                                open_pr_id=None,
                                similarity_threshold=0.2,
                                limit_pr=None,
                                limit_recomm=5):
    """Print recommended reviewers for an open pull request.

    Builds a reviewer/pull-request bipartite graph from the comments on the
    repository's closed PRs, keeps the PRs selected by similarity to the open
    PR, projects the graph onto the reviewers and ranks them with PageRank.

    :param repo_name: repository name passed to ``Github.get_repo``
    :param access_token: GitHub API access token
    :param open_pr_id: number of the open PR to analyse; when None the first
        open PR returned by the API is used
    :param similarity_threshold: fraction of the closed PRs kept after the
        similarity step (``int(count * similarity_threshold)`` entries)
    :param limit_pr: if not None, only the first ``limit_pr`` closed PRs
        returned by the API are considered
    :param limit_recomm: if not None, maximum number of reviewers printed
    :return: None; the current and recommended reviewers are printed
    :raises Exception: if there are no open or no closed PRs, if
        ``open_pr_id`` does not match an open PR, or if the projected
        reviewer graph ends up empty
    """
    # Get the access to Github API
    # NOTE(review): GitHub's REST API documents a maximum page size of 100;
    # confirm that per_page=300 has the intended effect.
    client = Github(access_token, per_page=300)
    print("[✔️] Connected to Github API.")

    # Get the repository object from Github API
    repo = client.get_repo(repo_name)

    # Get the maintainer of the repo (owner part of "owner/name")
    repo_maintainer = repo.full_name.split("/")[0]

    # Get the list of open PRs
    open_prs = list(repo.get_pulls(state='open', sort='created'))
    if len(open_prs) == 0:
        raise Exception(
            "Insufficient number of open pull requests. Use different repository."
        )

    # Get the first open PR
    open_pr = open_prs[0]

    # If Id is provided in function, choose this one
    if open_pr_id is not None:
        for pr in open_prs:
            if open_pr_id == pr.number:
                open_pr = pr
        if open_pr_id != open_pr.number:
            raise Exception("Open PR not found. Change Open PR ID.")
    print("[✔️] Using PR ID #", open_pr.number)

    # Get all the closed pull requests
    closed_prs = list(repo.get_pulls(state='closed'))
    if len(closed_prs) < 1:
        raise Exception(
            "Insufficient number of closed pull requests. Use different repository."
        )

    # Limit number of pull requests if limit_pr is set
    if limit_pr is not None and limit_pr < len(closed_prs):
        closed_prs = closed_prs[:limit_pr]
    print("[✔️] Parsed closed PRs.")

    # Initialize a graph
    graphz = nx.Graph()

    # It inserts all the reviewers node we add to graph
    closed_prs_reviewers = []

    # Save the data loaded from API for future use
    closed_prs_meta = []

    # Iterate through all the closed pull requests
    for pr in closed_prs:
        # Fetch the comments once and reuse them below
        comments = pr.get_issue_comments()

        # If PR doesnt have comments continue with next
        if comments.totalCount == 0:
            continue

        # Get the user who submitted this PR
        pull_requester = pr.user.login

        # Get the PR number
        pr_number = 'PR #' + str(pr.number)

        # Insert PR into graph node
        graphz.add_node(pr_number, type='Pull Request', bipartite=0)

        # Get the meta data from PR and insert in closed_prs_meta
        closed_prs_meta.append({
            'id': pr_number,
            'title': pr.title,
            'body': pr.body,
            'comments': comments,
        })

        # Iterate through all the comments
        for comment in comments:
            # Skip comments without an author
            if comment.user is None:
                continue
            reviewer = comment.user.login

            # Exclude users whose login contains "bot", the maintainer, and
            # the PR submitter
            if ('bot' in reviewer or repo_maintainer == reviewer
                    or pull_requester == reviewer):
                continue

            # Insert reviewer into graph node and closed_prs_reviewers list
            if reviewer not in closed_prs_reviewers:
                closed_prs_reviewers.append(reviewer)
                graphz.add_node(reviewer, type='user', bipartite=1)

            # The edge weight counts this reviewer's comments on the PR
            if graphz.has_edge(reviewer, pr_number):
                graphz[reviewer][pr_number]['weight'] += 1
            else:
                graphz.add_edge(reviewer,
                                pr_number,
                                weight=1,
                                type='reviews')
    print("[✔️] Built a bipartite graph.")

    # Generate document corpus for closed pull requests
    closed_prs_corpus = {}
    for pr in closed_prs_meta:
        doc = str(pr['title']) + " " + str(pr['body'])
        doc += "".join(comment.body for comment in pr['comments'])

        # Remove the code, mentions and URLS
        doc = re.sub('`.*`', '', doc)
        doc = re.sub(r"(?:\@|#|https?\://)\S+", "", doc)

        # insert document into corpus with index of corpus id
        closed_prs_corpus[pr['id']] = doc
    print("[✔️] Closed PRs corpus generated.")

    # Fetch the open PR's comments once; they feed both the corpus and the
    # list of current reviewers
    open_pr_comments = list(open_pr.get_issue_comments())

    # Get corpus document for open PR
    open_pr_corpus = str(open_pr.title) + "\n" + str(open_pr.body)
    open_pr_corpus += "".join(comment.body for comment in open_pr_comments)

    # Remove the code, mentions and URLS
    open_pr_corpus = re.sub('`.*`', '', open_pr_corpus)
    open_pr_corpus = re.sub(r"(?:\@|#|https?\://)\S+", "", open_pr_corpus)
    print("[✔️] Open PR corpus generated.")

    # Get the open PR submitter
    open_pr_requester = open_pr.user.login

    # Get the actual reviewers of open PR
    open_pr_reviewers = []
    for comment in open_pr_comments:
        # Skip comments without an author, as the closed-PR loop does
        if comment.user is None:
            continue
        reviewer = comment.user.login

        # Exclude bot, maintainer and PR submitter
        if (open_pr_requester != reviewer
                and reviewer not in open_pr_reviewers
                and 'bot' not in reviewer and repo_maintainer != reviewer):
            open_pr_reviewers.append(reviewer)

    # Remove the open PR reviewers that are not in our graph.  A new list is
    # built because removing items from a list while iterating over it skips
    # the element that follows each removal.
    open_pr_reviewers = [
        reviewer for reviewer in open_pr_reviewers
        if reviewer in closed_prs_reviewers
    ]

    # Get the similarity matrix between all the closed PRs and open PR
    similarity_matrix = lda_cosine_sim(closed_prs_meta, closed_prs_corpus,
                                       open_pr_corpus)
    print("[✔️] Calculated cosine similarity.")

    # Sort the similarity matrix in reverse order
    # NOTE(review): the scores are sorted on their own and then assigned to
    # the PRs in their original order below.  If lda_cosine_sim returns one
    # score per entry of closed_prs_meta, this detaches each score from its
    # PR -- confirm against lda_cosine_sim before relying on the ranking.
    similarity_matrix = sorted(similarity_matrix, reverse=True)

    # Get all similarity matrix filtered with threshold
    top_similarity_matrix = {}
    for i, pr in enumerate(closed_prs_meta):
        top_similarity_matrix[pr['id']] = similarity_matrix[i]

    # Get top similarity matrix using similarity threshold value
    top_sim_length = int(len(top_similarity_matrix) * similarity_threshold)
    top_similarity_matrix = dict(
        itertools.islice(top_similarity_matrix.items(), top_sim_length))
    print("[✔️] Selected top ", similarity_threshold * 100,
          "% PRs using similarity threshold.")

    # Copy the bipartite graph into new one
    copied_barpartite_graphz = graphz.copy()

    # Get the top PR from similarity rank (only used for membership tests)
    pr_nodes = set(top_similarity_matrix)

    # Remove PR nodes other than top selected PR nodes
    for node in list(copied_barpartite_graphz.nodes):
        if ('PR #' in node and node not in pr_nodes
                and copied_barpartite_graphz.has_node(node)):
            copied_barpartite_graphz.remove_node(node)

    # Insert similarity scores in PR nodes for further use in custom weight
    for node in copied_barpartite_graphz.nodes:
        if node in pr_nodes:
            copied_barpartite_graphz.nodes[node][
                'similarity'] = top_similarity_matrix[node]
    print("[✔️] Generated subgraph.")

    # Project the copied bipartite graph into reviewers graph considering
    # the weights
    projected_graphz = bipartite.generic_weighted_projected_graph(
        copied_barpartite_graphz,
        closed_prs_reviewers,
        weight_function=custom_weight)

    # Remove isolated nodes from the projected graph
    for node in list(nx.isolates(projected_graphz)):
        projected_graphz.remove_node(node)

    if len(projected_graphz.nodes) == 0:
        raise Exception("Use more similarity threshold.")
    print("[✔️] Subgraph projected into reviewer's graph.")

    # Run page rank algorithm in projected graph
    pagerank = nx.pagerank(projected_graphz,
                           alpha=0.85,
                           personalization=None,
                           max_iter=100,
                           tol=1e-06,
                           nstart=None,
                           weight='weight',
                           dangling=None)
    print("[✔️] Page rank calculated.")

    # Sort the page rank result by score, highest first
    pagerank = sorted(pagerank.items(), reverse=True, key=lambda x: x[1])

    # Get only users from page rank result
    pagerank_reviewers = [pg[0] for pg in pagerank]

    # If there is recommendation limitation, limit it
    if limit_recomm is not None:
        pagerank_reviewers = pagerank_reviewers[:limit_recomm]
    print("[✔️] Success.")

    # Print the current reviewers
    print("Current reviewers", open_pr_reviewers)

    # Print the recommended reviewers
    print("Recommended reviewers", pagerank_reviewers)
def get_projected_graph(graph, items):
    """Return the generic weighted projection of ``graph`` onto ``items``.

    ``items`` is converted to a set before it is handed to NetworkX.
    """
    item_nodes = set(items)
    return bipartite.generic_weighted_projected_graph(graph, item_nodes)
# In[14]: with open("User_Item_BIPARTITE_GRAPH___UserID__ItemID.tsv") as tsvfile: tsvreader = csv.reader(tsvfile, delimiter="\t") for line in tsvreader: B.add_nodes_from([''.join(line[:1])], bipartite=0) B.add_nodes_from([''.join(line[1:])], bipartite=1) B.add_edge(''.join(line[:1]), ''.join(line[1:])) #print (int(''.join(line[:1])))#user # print (int(''.join(line[1:])))#item #print(''.join(line[:1]),''.join(line[1:])) #user-item # In[15]: G = bipartite.generic_weighted_projected_graph( B, [n for n, d in B.nodes(data=True) if d['bipartite'] == 1]) # In[16]: M = nx.to_scipy_sparse_matrix(G, nodelist=G.nodes(), weight='weight', dtype=float) # In[17]: Mnorm = csr_matrix(M.T / M.sum(axis=1).T) # In[275]: