import numpy as np
from networkx.algorithms import link_prediction
from sklearn.metrics import roc_auc_score


def drop_edge(graph):
    """Hold out 20% of edges, then score Adamic-Adar and Jaccard link
    prediction over all non-edges with ROC AUC."""
    edges = list(graph.edges)
    indices = np.random.permutation(len(edges))
    bound = int(len(edges) * 0.8)
    training_idx, test_idx = indices[:bound], indices[bound:]
    test_edges = np.array(edges)[test_idx].tolist()
    # non_zero = [(u, v, k) for u, v, k in edges if k != 0]
    # self_loop = [(u, v) for u, v, k in edges if u == v]
    # pdb.set_trace()
    for u, v in test_edges:
        graph.remove_edge(u, v)
    # With no ebunch given, the predictors score every non-edge of the graph,
    # which now includes the held-out test edges.
    pred_adamic = list(link_prediction.adamic_adar_index(graph))
    unpre_edges = [(u, v) for u, v, p in pred_adamic]
    score_adamic = [p for u, v, p in pred_adamic]
    pred_jaccard = list(link_prediction.jaccard_coefficient(graph))
    score_jaccard = [p for u, v, p in pred_jaccard]
    # Use an order-insensitive set lookup: non_edges may yield a held-out pair
    # as (v, u), which a plain [u, v] list test would mislabel as 0.
    test_edge_set = {frozenset(e) for e in test_edges}
    label = [1 if frozenset((u, v)) in test_edge_set else 0
             for u, v in unpre_edges]
    adamic_results = roc_auc_score(label, score_adamic)
    jaccard_results = roc_auc_score(label, score_jaccard)
    return adamic_results, jaccard_results
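# Hypothetical usage (not from the original source): evaluate both predictors
# on a small Erdos-Renyi graph; the graph size, density, and seed are
# illustrative choices only.
import networkx as nx

if __name__ == "__main__":
    demo_graph = nx.erdos_renyi_graph(100, 0.1, seed=42)
    adamic_auc, jaccard_auc = drop_edge(demo_graph)
    print("Adamic-Adar AUC: %.3f, Jaccard AUC: %.3f" % (adamic_auc, jaccard_auc))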
import logging

from networkx.algorithms.link_prediction import jaccard_coefficient

# ARTIST_MODE and TAG_MODE are integer module-level constants; nodes are
# (mode, label) tuples, hence the u[1]/v[1] unpacking below.


def jaccard_sims(g, bipartite_mode, pairs):
    '''
    Return a generator that yields tuples of the form (label1, label2, sim)
    for all similarities in the given node pairs.

    :param g: the artists-tags graph
    :param bipartite_mode: which set of nodes to calculate similarity for:
        ARTIST_MODE or TAG_MODE
    :param pairs: iterable of pairs of artist or tag nodes (an ebunch in networkx)
    '''
    if bipartite_mode not in [ARTIST_MODE, TAG_MODE]:
        logging.error('invalid value for bipartite mode: %d' % bipartite_mode)
        return
    for counter, (a1, a2) in enumerate(pairs):
        sim_iter = jaccard_coefficient(g, [(a1, a2)])
        (u, v, sim) = next(sim_iter)  # next() instead of Python 2's .next()
        yield (u[1], v[1], sim)
        if counter % 10000 == 0:
            logging.info('Calculated similarity for pair %d, mode %d'
                         % (counter, bipartite_mode))
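# Hypothetical usage (not from the original source): ARTIST_MODE and TAG_MODE
# are assumed to be integer constants and nodes (mode, label) tuples, to match
# the unpacking above.
import networkx as nx

ARTIST_MODE, TAG_MODE = 0, 1

g = nx.Graph()
g.add_edges_from([
    ((ARTIST_MODE, 'artist_a'), (TAG_MODE, 'rock')),
    ((ARTIST_MODE, 'artist_b'), (TAG_MODE, 'rock')),
])
pairs = [((ARTIST_MODE, 'artist_a'), (ARTIST_MODE, 'artist_b'))]
for label1, label2, sim in jaccard_sims(g, ARTIST_MODE, pairs):
    print(label1, label2, sim)  # -> artist_a artist_b 1.0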
import gc
import logging

import pandas as pd
from networkx import (
    DiGraph,
    Graph,
    degree_centrality,
    in_degree_centrality,
    out_degree_centrality,
)
from networkx.algorithms import link_prediction as lp
from tqdm import tqdm

# add_degree_features, shortest_path, source_avg_nbr_degree, sink_avg_nbr_degree,
# node_boundary_size, source_katz, sink_katz, source_page_rank, sink_page_rank,
# link_efficiency, is_followed_back, source_reciprocity, sink_reciprocity, and
# the red/blue log-colouring functions are project-local helpers.


def extract_features(
    edge_list: list,
    G: Graph,
    DiG: DiGraph,
    page_rank: dict,
    katz: dict,
    parameters: dict,
) -> pd.DataFrame:
    """Extracts features for a list of edges on an undirected graph.

    Args:
        edge_list: a list of edges.
        G: a NetworkX undirected graph.
        DiG: a NetworkX directed graph.
        page_rank: dictionary containing PageRank measures.
        katz: dictionary containing Katz centrality measures.
        parameters: parameters defined in parameters.yml.

    Returns:
        Pandas dataframe with edge features.
    """
    # Initialise logger and progress bar
    log = logging.getLogger(__name__)
    tqdm.pandas()

    # DEBUG ONLY: calculate features for a subset
    subset = parameters["features"]["subset"]
    if subset:
        edges = edge_list[:subset]
        log.warning(red("Calculating features on first {} edges.".format(subset)))
    else:
        edges = edge_list
        log.warning(red("Calculating features on all {} edges.".format(len(edges))))

    # Calculate edge features
    try:
        # Initialise feature matrix
        log.info(blue("Initialising feature matrix..."))
        df = pd.DataFrame(dict(edge=edges))

        # Degree features
        log.info(blue("Calculating degree features..."))
        df = add_degree_features(DiG, df)

        # Undirected similarity
        log.info(blue("Calculating undirected similarity..."))
        df["RA_undirected"] = [x for u, v, x in lp.resource_allocation_index(G, df.edge)]
        df["JC_undirected"] = [x for u, v, x in lp.jaccard_coefficient(G, df.edge)]
        df["AA_undirected"] = [x for u, v, x in lp.adamic_adar_index(G, df.edge)]
        df["PA_undirected"] = [x for u, v, x in lp.preferential_attachment(G, df.edge)]

        # Shortest path
        log.info(blue("Calculating shortest path..."))
        df["shortest_path"] = df.edge.progress_apply(shortest_path, G=DiG)

        # Assortativity
        log.info(blue("Calculating average neighbor degree..."))
        df["source_avg_nbr_degree"] = df.edge.progress_apply(source_avg_nbr_degree, G=DiG)
        df["sink_avg_nbr_degree"] = df.edge.progress_apply(sink_avg_nbr_degree, G=DiG)

        # Boundary size
        log.info(blue("Calculating boundary size..."))
        df["node_boundary_size"] = df.edge.progress_apply(node_boundary_size, G=DiG)

        # Centrality
        log.info(blue("Calculating centrality..."))
        centrality = degree_centrality(DiG)
        df["source_centrality"] = df.edge.progress_apply(lambda e: centrality[e[0]])
        df["sink_centrality"] = df.edge.progress_apply(lambda e: centrality[e[1]])

        log.info(blue("Calculating in-degree centrality..."))
        in_centrality = in_degree_centrality(DiG)
        df["source_in_centrality"] = df.edge.progress_apply(lambda e: in_centrality[e[0]])
        df["sink_in_centrality"] = df.edge.progress_apply(lambda e: in_centrality[e[1]])

        log.info(blue("Calculating out-degree centrality..."))
        out_centrality = out_degree_centrality(DiG)
        df["source_out_centrality"] = df.edge.progress_apply(lambda e: out_centrality[e[0]])
        df["sink_out_centrality"] = df.edge.progress_apply(lambda e: out_centrality[e[1]])

        log.info(blue("Calculating Katz centrality..."))
        df["source_katz"] = df.edge.progress_apply(source_katz, katz=katz)
        df["sink_katz"] = df.edge.progress_apply(sink_katz, katz=katz)

        # Clustering (disabled)
        # log.info(blue("Calculating source clustering..."))
        # df["source_clustering"] = df.edge.progress_apply(source_clustering, G=DiG)
        # log.info(blue("Calculating sink clustering..."))
        # df["sink_clustering"] = df.edge.progress_apply(sink_clustering, G=DiG)

        # PageRank
        log.info(blue("Calculating PageRank..."))
        df["source_page_rank"] = df.edge.progress_apply(source_page_rank, page_rank=page_rank)
        df["sink_page_rank"] = df.edge.progress_apply(sink_page_rank, page_rank=page_rank)

        # Efficiency
        log.info(blue("Calculating link efficiency..."))
        df["link_efficiency"] = df.edge.progress_apply(link_efficiency, G=G)

        # Reciprocity
        log.info(blue("Calculating reciprocity metrics..."))
        df["is_followed_back"] = df.edge.progress_apply(is_followed_back, G=DiG)
        df["source_reciprocity"] = df.edge.progress_apply(source_reciprocity, G=DiG)
        df["sink_reciprocity"] = df.edge.progress_apply(sink_reciprocity, G=DiG)

        # TOO SLOW: connectivity (disabled)
        # log.info(blue("Calculating connectivity..."))
        # C = parameters["features"]["connectivity"]["cutoff"]
        # df["edge_connectivity"] = df.edge.progress_apply(connectivity, G=DiG, cutoff=C)

        # TOO SLOW: dispersion (disabled)
        # log.info(blue("Calculating link dispersion..."))
        # df["link_dispersion"] = df.edge.progress_apply(link_dispersion, G=G)

        # Remove edge column
        df = df.drop("edge", axis=1)
    except:
        # Free the partial frame before re-raising.
        del df
        gc.collect()
        raise

    return df
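# The per-edge PageRank helpers used above are project-local; a minimal sketch
# of what they plausibly do (an assumption, not the original implementation):
def source_page_rank(edge, page_rank):
    # PageRank score of the edge's source node.
    return page_rank[edge[0]]


def sink_page_rank(edge, page_rank):
    # PageRank score of the edge's sink node.
    return page_rank[edge[1]]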
import networkx as nx
from networkx.algorithms.link_prediction import jaccard_coefficient
from sklearn.metrics import roc_auc_score
from toolz.curried import nth, take  # curried helpers: nth(2) extracts the score

from ptsplitter.utils import positive_edges, negative_edges, iter_get_scores_networkx

print("Reading in dataset.")
# NOTE: connected_component_subgraphs was removed in NetworkX 2.4; this script
# needs an older NetworkX (or the connected_components/subgraph equivalent).
G = max(
    nx.connected_component_subgraphs(nx.read_edgelist("data_input/CA-AstroPh.txt")),
    key=len,
)
sample_number = G.number_of_edges() // 2
G_original = nx.Graph(G)
# Hold out half of the edges as positives and sample as many non-edges.
positive_samples = list(take(sample_number, positive_edges(G)))
negative_samples = list(take(sample_number, negative_edges(G)))
G.remove_edges_from(positive_samples)
positive_scores_non_persona = list(
    map(nth(2), jaccard_coefficient(G, positive_samples))
)
negative_scores_non_persona = list(
    map(nth(2), jaccard_coefficient(G, negative_samples))
)
print(sum(positive_scores_non_persona))
print(sum(negative_scores_non_persona))
print(
    roc_auc_score(
        [1] * len(positive_samples) + [0] * len(negative_samples),
        positive_scores_non_persona + negative_scores_non_persona,
    )
)
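# A comparison sketch (not in the original script): the same held-out split
# can be scored with another NetworkX predictor such as Adamic-Adar.
from networkx.algorithms.link_prediction import adamic_adar_index

positive_scores_aa = [s for _, _, s in adamic_adar_index(G, positive_samples)]
negative_scores_aa = [s for _, _, s in adamic_adar_index(G, negative_samples)]
print(
    roc_auc_score(
        [1] * len(positive_samples) + [0] * len(negative_samples),
        positive_scores_aa + negative_scores_aa,
    )
)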
from networkx.algorithms.link_prediction import (
    adamic_adar_index,
    cn_soundarajan_hopcroft,
    jaccard_coefficient,
    preferential_attachment,
    ra_index_soundarajan_hopcroft,
    resource_allocation_index,
)

print('sample negative edges')
# Sample one negative (non-existing) edge per positive test edge;
# sample_negative_edges is project-local.
#G.add_edges_from(target_test_edges)
target_neg_edges = sample_negative_edges(G, target_test_edges, 1)
print(len(target_neg_edges))
print(len(target_test_edges))
G.remove_edges_from(target_test_edges)

print('generate the models')
# Calculate the scores for all testing edges. The Soundarajan-Hopcroft
# variants read each node's community from the 'node_type' attribute.
testing_tuples = [
    cn_soundarajan_hopcroft(G, target_test_edges, 'node_type'),
    ra_index_soundarajan_hopcroft(G, target_test_edges, 'node_type'),
    adamic_adar_index(G, target_test_edges),
    resource_allocation_index(G, target_test_edges),
    jaccard_coefficient(G, target_test_edges),
    preferential_attachment(G, target_test_edges),
]
#testing_tuples = [resource_allocation_index(G, test_edges[0:1000])]

# Calculate the scores for all non-existing edges.
neg_tuples = [
    cn_soundarajan_hopcroft(G, target_neg_edges, 'node_type'),
    ra_index_soundarajan_hopcroft(G, target_neg_edges, 'node_type'),
    adamic_adar_index(G, target_neg_edges),
    resource_allocation_index(G, target_neg_edges),
    jaccard_coefficient(G, target_neg_edges),
    preferential_attachment(G, target_neg_edges),
]
#neg_tuples = [resource_allocation_index(G, neg_edges)]
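# sample_negative_edges is not shown in this snippet; a plausible sketch of
# its behaviour (an assumption, not the original implementation): uniformly
# sample `ratio` non-edges per positive edge.
import random

def sample_negative_edges_sketch(graph, pos_edges, ratio=1):
    nodes = list(graph.nodes)
    negatives = []
    while len(negatives) < ratio * len(pos_edges):
        u, v = random.sample(nodes, 2)
        if not graph.has_edge(u, v):
            negatives.append((u, v))
    return negatives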
import time

import networkx as nx
from networkx.algorithms.link_prediction import (
    jaccard_coefficient,
    preferential_attachment,
    resource_allocation_index,
)

# graphs, test_edges, and sample_negative_edges come from earlier in the script.
G = nx.compose_all(graphs)

print('sample negative edges')
# Sample negative edges and time how long the sampling takes.
start = time.time()
neg_edges = sample_negative_edges(G, test_edges[0:1000])
end = time.time()
print(end - start)
print(len(neg_edges))
print(len(test_edges))
G.remove_edges_from(test_edges)

print('generate the models')
# Calculate the scores for all testing edges.
testing_tuples = [
    resource_allocation_index(G, test_edges[0:1000]),
    jaccard_coefficient(G, test_edges[0:1000]),
    preferential_attachment(G, test_edges[0:1000]),
]
#testing_tuples = [resource_allocation_index(G, test_edges[0:1000])]

# Calculate the scores for all non-existing edges.
neg_tuples = [
    resource_allocation_index(G, neg_edges),
    jaccard_coefficient(G, neg_edges),
    preferential_attachment(G, neg_edges),
]
#neg_tuples = [resource_allocation_index(G, neg_edges)]

# List of methods, in the same order as the score tuples above.
models = [
    'resource_allocation_index',
    'jaccard_coefficient',
    'preferential_attachment',
]
#models = ['resource_allocation_index']

fout = open('baseline_performance.txt', 'w')
fout.write('Method\tAverage Precision Score\tAUROC\tAUPR\n')
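# A hedged sketch (not the original continuation) of how the score tuples
# might be reduced to the metrics named in the header row, using sklearn;
# AUPR is taken as the area under the precision-recall curve via auc().
from sklearn.metrics import average_precision_score, precision_recall_curve, auc, roc_auc_score

for name, pos_iter, neg_iter in zip(models, testing_tuples, neg_tuples):
    pos_scores = [s for _, _, s in pos_iter]
    neg_scores = [s for _, _, s in neg_iter]
    y_true = [1] * len(pos_scores) + [0] * len(neg_scores)
    y_score = pos_scores + neg_scores
    ap = average_precision_score(y_true, y_score)
    auroc = roc_auc_score(y_true, y_score)
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    aupr = auc(recall, precision)
    fout.write('%s\t%f\t%f\t%f\n' % (name, ap, auroc, aupr))
fout.close()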