def common_neighbors(G, fn, t = 0.5):
    """Write node pairs whose Jaccard coefficient is at least ``t`` to ``fn``.

    ``G`` is converted to an undirected graph first.  If ``fn`` already
    exists it is read back as an edge list; the pairs stored there, together
    with the existing edges of ``G``, are excluded from scoring and new
    results are appended.  Otherwise the file is created and every non-edge
    of ``G`` is scored.

    Each result line has the form ``u v coefficient``.  A progress line is
    printed whenever the first node of a written pair changes.

    :param G: networkx graph to analyse.
    :param fn: path of the output edge-list file.
    :param t: minimum Jaccard coefficient a pair needs to be written.
    """
    G = G.to_undirected()
    if os.path.isfile(fn):
        # Resume: treat pairs already on disk as edges so they are skipped.
        H = G.copy()
        found = nx.read_edgelist(fn, nodetype=int, data=False)
        # edges() is available in networkx 1.x and 2.x+; edges_iter() is
        # 1.x only.
        H.add_edges_from(found.edges())
        jacc_iter = nx.jaccard_coefficient(G, nx.non_edges(H))
        print("Appending to %s" % fn)
        mode = 'a'
        i = found.number_of_nodes()
    else:
        jacc_iter = nx.jaccard_coefficient(G)
        mode = 'w'
        i = 0

    # buffering=1 keeps the file line-buffered; the with-block guarantees the
    # handle is closed even if scoring raises part-way through.
    with open(fn, mode, 1) as outfile:
        outfile.write("#vertex u; vertex v; their jaccard coef\n")
        cur = -1
        print("Starting jacc loop %s with threshold %s"
              % (time.strftime("%H:%M:%S"), t))
        for u, v, coef in jacc_iter:
            if coef >= t:
                outfile.write("%s %s %f\n" % (u, v, coef))
                if u != cur:
                    cur = u
                    i += 1
                    print("%s: %s" % (i, cur))
    print("Done writing %s" % (fn))
def get_link_pred_auc(graph, pos_test, neg_test):
    """Return the ROC-AUC of two neighbourhood-based link predictors.

    Positive test pairs are labelled 1 and negative test pairs 0; the raw
    predictor scores are used as the ranking scores.

    :param graph: networkx graph the predictors are evaluated on.
    :param pos_test: iterable of node pairs labelled as links.
    :param neg_test: iterable of node pairs labelled as non-links.
    :return: ``(jc_auc, aa_auc)`` where the first value comes from
        ``nx.jaccard_coefficient`` and the second from
        ``nx.resource_allocation_index``.
    """

    def _auc(predictor):
        # Create both prediction iterators first, then consume them in the
        # same order: positives, then negatives.
        pos_pred = predictor(graph, pos_test)
        neg_pred = predictor(graph, neg_test)
        pos_scores = [score for _, _, score in pos_pred]
        neg_scores = [score for _, _, score in neg_pred]
        labels = [1] * len(pos_scores) + [0] * len(neg_scores)
        return metrics.roc_auc_score(labels, pos_scores + neg_scores)

    jc_auc = _auc(nx.jaccard_coefficient)
    # NOTE(review): the "aa" name suggests Adamic-Adar, but the score is
    # computed with the resource allocation index -- confirm which is meant.
    aa_auc = _auc(nx.resource_allocation_index)

    return jc_auc, aa_auc
Beispiel #3
0
def compute_indexes(G: nx.Graph, method, negative, positive):
    """Score the negative and positive node pairs with the chosen method.

    :param G: graph the pairs belong to.
    :param method: one of ``'resource_allocation'``,
        ``'jaccard_coefficient'``, ``'adamic_adar'``,
        ``'preferential_attachment'``, ``'sorensen_neighbours'`` or
        ``'community'``.
    :param negative: iterable of ``(u, v)`` pairs scored first.
    :param positive: iterable of ``(u, v)`` pairs scored second.
    :return: ``(negative_scores, positive_scores)``.  The networkx-backed
        methods return whatever the networkx function returns; the other two
        return lists of ``(u, v, score)`` tuples.
    :raises NameError: if ``method`` is not one of the supported names.
    """
    # Methods that map one-to-one onto a networkx link-prediction function.
    networkx_backed = (
        ('resource_allocation', 'resource_allocation_index'),
        ('jaccard_coefficient', 'jaccard_coefficient'),
        ('adamic_adar', 'adamic_adar_index'),
        ('preferential_attachment', 'preferential_attachment'),
    )
    for method_name, nx_attr in networkx_backed:
        if method == method_name:
            scorer = getattr(nx, nx_attr)
            return scorer(G, negative), scorer(G, positive)

    if method == 'sorensen_neighbours':

        def pair_score(u, v):
            return sorensen_index(G, u, v)

    elif method == 'community':
        # Detect communities once and reuse the result for every pair.
        c = louvain(G)
        commLabels = c.communities
        comms = c.to_node_community_map()

        def pair_score(u, v):
            return community_index(G, u, v, commLabels, comms)

    else:
        raise NameError('The given method is not supported')

    return ([(u, v, pair_score(u, v)) for u, v in negative],
            [(u, v, pair_score(u, v)) for u, v in positive])
Beispiel #4
0
def main():
	# Demo script (Python 2 print syntax): loads an edge list, prints the
	# product of its adjacency matrix with its transpose, then prints Jaccard
	# predictions.
	
	# read_edgelist is called unqualified, so it must be provided elsewhere
	# in the file.
	G=read_edgelist('testdata/demo.net')
	M =  nx.to_numpy_matrix(G)
	print M.dot(M.T)
	# Prints the object returned by the call itself, not the scores.
	# NOTE(review): the ebunch here is ['a'], a list holding a one-character
	# string rather than a (u, v) pair -- unpacking it in the loop below
	# looks like it would fail; confirm the intended node pair.
	print nx.jaccard_coefficient(G, ['a'])
	for u, v, p in nx.jaccard_coefficient(G, ['a']):
		print u, v, p
def jaccard_coefficient_scores(g_train, train_test_split):
    """Evaluate Jaccard-coefficient link prediction on a train/test split.

    :param g_train: training graph; converted to undirected if it is
        directed.
    :param train_test_split: 7-tuple ``(adj_train, train_edges,
        train_edges_false, val_edges, val_edges_false, test_edges,
        test_edges_false)``.
    :return: dict with keys ``'test_roc'``, ``'test_ap'`` and ``'runtime'``
        (seconds spent building the score matrix).
    """
    if g_train.is_directed():  # Jaccard coef only works for undirected graphs
        g_train = g_train.to_undirected()

    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split  # Unpack input

    start_time = time.time()

    # Fill a symmetric score matrix; (u, v) are used directly as indices.
    jc_matrix = np.zeros(adj_train.shape)
    ebunch = get_ebunch(train_test_split)
    for u, v, p in nx.jaccard_coefficient(g_train, ebunch=ebunch):
        jc_matrix[u, v] = p
        jc_matrix[v, u] = p

    # Normalize to [0, 1].  When no pair shares a neighbour every score is 0;
    # dividing would turn the whole matrix into NaN, so leave it untouched.
    max_score = jc_matrix.max()
    if max_score > 0:
        jc_matrix = jc_matrix / max_score

    runtime = time.time() - start_time
    jc_roc, jc_ap = get_roc_score(test_edges, test_edges_false, jc_matrix)

    return {
        'test_roc': jc_roc,
        'test_ap': jc_ap,
        'runtime': runtime,
    }
    def train(self, train_graph, test_edges):
        """Score the test pairs with the Jaccard coefficient and return AUC.

        :param train_graph: networkx graph used to compute the coefficients.
        :param test_edges: ``(pos_samples, neg_samples)``; each is a sequence
            of node pairs whose entries can be converted with ``int()`` and
            used as matrix indices.
        :return: ROC-AUC of the normalised Jaccard scores, with positive
            samples labelled 1 and negative samples labelled 0.  The value
            is also printed.
        """
        pos_samples, neg_samples = test_edges
        n = train_graph.number_of_nodes()

        # The builtin float is the dtype np.float used to alias; the alias
        # itself no longer exists in current NumPy releases.
        coeff_matrix = np.zeros(shape=(n, n), dtype=float)

        samples = pos_samples + neg_samples

        for i, j, p in nx.jaccard_coefficient(train_graph, samples):
            coeff_matrix[int(i), int(j)] = p
            coeff_matrix[int(j), int(i)] = p

        # Skip normalisation when every coefficient is 0, otherwise 0/0
        # would fill the matrix with NaN and break the AUC computation.
        max_coeff = coeff_matrix.max()
        if max_coeff > 0:
            coeff_matrix = coeff_matrix / max_coeff

        ytrue = [1] * len(pos_samples) + [0] * len(neg_samples)
        y_score = [
            coeff_matrix[int(edge[0]), int(edge[1])] for edge in pos_samples
        ] + [coeff_matrix[int(edge[0]), int(edge[1])] for edge in neg_samples]

        auc = roc_auc_score(y_true=ytrue, y_score=y_score)

        print(auc)

        return auc
Beispiel #7
0
def main():
    """Build a follower graph, detect communities and plot Jaccard weights.

    Reads ``Followers.csv``, keeps rows whose ``follows`` value occurs more
    than once, builds an undirected graph from the ``follows``/``userids``
    columns, runs ``girvan_newman(G, 4)``, draws a graph weighted by the
    Jaccard coefficient to ``Graph_Weighted.png`` and writes the number of
    communities and their average size to ``Communities.txt``.
    """
    dfr = pd.read_csv('Followers.csv')

    # Ids that appear more than once in the 'follows' column.
    follow_counts = Counter(dfr['follows'].tolist())
    Followers_common = [
        userid for userid, count in follow_counts.most_common() if count > 1
    ]
    df_filtered = dfr.loc[dfr['follows'].isin(Followers_common)]

    G = nx.Graph()
    for _, row in df_filtered.iterrows():
        G.add_edge(row['follows'], row['userids'])

    result = girvan_newman(G, 4)
    community_sizes = [len(community) for community in result]
    average = sum(community_sizes) / len(result)

    # Every pair scored by jaccard_coefficient becomes a weighted edge.
    w_graph = nx.Graph()
    w_graph.add_weighted_edges_from(nx.jaccard_coefficient(G))
    nx.draw(w_graph, node_size=50, width=0.25, alpha=0.25)
    plt.savefig('Graph_Weighted.png')

    with open("Communities.txt", 'w') as outfile:
        outfile.write("%d\n%.2f\n" % (len(result), float(average)))
    def extract_features(self, prediction_set=None):
        """Compute four link-prediction features for the candidate pairs.

        For each of resource allocation, Jaccard coefficient, Adamic-Adar
        and preferential attachment (in that order) a timestamped progress
        line is printed, the scores are computed on ``self.G`` and handed to
        ``self.append_features``.

        :param prediction_set: node pairs passed to networkx as ``ebunch``;
            ``None`` is forwarded unchanged.
        :return: ``(df, feature_names)`` as produced by
            ``self.feature_dict_to_df``.
        """
        edge_features = defaultdict(dict)

        # (feature name, networkx scorer), kept in the original order.
        scorers = (
            ('res_alloc', nx.resource_allocation_index),
            ('jaccard_coef', nx.jaccard_coefficient),
            ('adamic_adar', nx.adamic_adar_index),
            ('pref_attachment', nx.preferential_attachment),
        )
        for feature_name, scorer in scorers:
            # Parenthesised single-argument print works on Python 2 and 3.
            print('{0} | extract_features: {1}'.format(str(datetime.now()),
                                                       feature_name))
            self.append_features(edge_features,
                                 feature_name=feature_name,
                                 feature_list=scorer(self.G,
                                                     ebunch=prediction_set))

        # reformat feature dictionary to a dataframe object
        df, feature_names = self.feature_dict_to_df(edge_features)

        return df, feature_names
def retweet_similarity_network(G):
    """Build a similarity graph weighted by pairwise Jaccard coefficients.

    Every unordered pair of nodes of ``G`` is scored on the undirected
    version of ``G``; pairs with a strictly positive coefficient become
    edges of the returned graph, with the coefficient stored as ``weight``.

    :param G: networkx graph (the retweet network).
    :return: undirected ``nx.Graph`` of similarity edges.
    """
    from itertools import combinations

    V = list(G.nodes())
    print(f'{len(V)} nodes in retweet network')

    # Stream the pairs instead of materialising all n*(n-1)/2 of them; the
    # nodes come straight from G, so no membership check is needed.
    n_pairs = len(V) * (len(V) - 1) // 2
    preds = nx.jaccard_coefficient(G.to_undirected(), combinations(V, 2))
    print(n_pairs, " node pairs to check Jaccard index")

    print(
        "Create similarity graph between nodes using Jacard coefficient based on retweets"
    )
    Gsim = nx.Graph()
    ne = 0
    for counter, (u, v, s) in enumerate(preds, 1):
        if s > 0:
            Gsim.add_edge(u, v, weight=s)
            ne += 1
        if counter % 1000000 == 0:
            print(counter, ne, " positive weights")
    nv = Gsim.number_of_nodes()
    ne = Gsim.number_of_edges()
    print("Gsim has %s nodes, %s edges" % (nv, ne))
    return Gsim
def L_P_JC(network):
    """Add edges to ``network`` chosen by Jaccard-coefficient probability.

    Every unordered pair of nodes without an edge is scored with the Jaccard
    coefficient.  Each pair's selection probability is its score divided by
    the sum of all scores, and ``calculate_param.prob_select_distinct`` picks
    ``num_add`` pairs that are then added as edges (mutating ``network``).

    :param network: networkx graph, modified in place.
    :return: always ``True``.
    """
    num_add = 0  # the number of edges to be added
    nodes_pair_without_edge = []  # (u, v, score) for pairs without an edge
    total_score_without_edge = 0.0  # sum of the scores of those pairs

    # Visit each unordered pair once.  The original expression
    # enumerate(list(network.nodes(), 1)) passed two arguments to list()
    # and raised TypeError on every call.
    nodes = list(network.nodes())
    for i, elei in enumerate(nodes):
        for elej in nodes[i + 1:]:
            if network.has_edge(elei, elej):
                continue
            try:
                pre = nx.jaccard_coefficient(network, [(elei, elej)])
                score = next(iter(pre))[2]
            except Exception:
                # Best effort: skip pairs networkx cannot score, without
                # swallowing KeyboardInterrupt/SystemExit.
                continue
            total_score_without_edge += score
            nodes_pair_without_edge.append((elei, elej, score))

    # With every score at 0 there is no distribution to sample from
    # (dividing would raise ZeroDivisionError), so nothing is added.
    if total_score_without_edge == 0:
        return True

    # probabilities of the pairs of nodes to be added
    probability_add = [
        score / total_score_without_edge
        for _, _, score in nodes_pair_without_edge
    ]
    # select edges to be added according to probabilities
    edges_add = calculate_param.prob_select_distinct(nodes_pair_without_edge,
                                                     probability_add, num_add)
    for a, b, _ in edges_add:
        network.add_edge(a, b)  # add selected edges

    return True
Beispiel #11
0
def networkx_call(M):
    """Compute the networkx Jaccard coefficient for every edge listed in M.

    :param M: edge table with source column ``'0'``, target column ``'1'``
        and a ``'weight'`` column.
    :return: ``(src, dst, coeff)`` -- three parallel lists, one entry per
        edge, in sorted edge order.
    """
    sources = M['0']
    destinations = M['1']
    edges = sorted(
        (sources[i], destinations[i]) for i in range(len(M))
    )
    # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this
    # explicitly
    print('Format conversion ... ')

    Gnx = nx.from_pandas_edgelist(M,
                                  source='0',
                                  target='1',
                                  edge_attr='weight',
                                  create_using=nx.Graph())
    # Networkx Jaccard Call
    print('Solving... ')
    t1 = time.time()
    # jaccard_coefficient returns a lazy iterator; materialise it inside the
    # timed region so the reported time covers the actual computation.
    preds = list(nx.jaccard_coefficient(Gnx, edges))
    t2 = time.time() - t1

    print('Time : ' + str(t2))
    src = [u for u, _, _ in preds]
    dst = [v for _, v, _ in preds]
    coeff = [p for _, _, p in preds]
    return src, dst, coeff
Beispiel #12
0
def networkx_call(M):
    """Compute the networkx Jaccard coefficient for both directions of M's edges.

    Each row of ``M`` contributes ``(source, target)`` and
    ``(target, source)``; duplicates are removed and the pairs are sorted
    before scoring.

    :param M: edge table with source column ``"0"`` and target column
        ``"1"``.
    :return: list of coefficients, one per distinct pair, in sorted order.
    """
    sources = M["0"]
    destinations = M["1"]
    edges = []
    for i in range(len(sources)):
        edges.append((sources[i], destinations[i]))
        edges.append((destinations[i], sources[i]))
    # dict.fromkeys removes duplicate pairs before sorting.
    edges = sorted(dict.fromkeys(edges))
    # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this
    # explicitly
    print("Format conversion ... ")

    # NetworkX graph
    Gnx = nx.from_pandas_edgelist(M,
                                  source="0",
                                  target="1",
                                  create_using=nx.Graph())
    # Networkx Jaccard Call
    print("Solving... ")
    t1 = time.time()
    # jaccard_coefficient returns a lazy iterator; materialise it inside the
    # timed region so the reported time covers the actual computation.
    preds = list(nx.jaccard_coefficient(Gnx, edges))
    t2 = time.time() - t1

    print("Time : " + str(t2))
    return [p for _, _, p in preds]
Beispiel #13
0
def networkx_call(M):
    """Compute the networkx Jaccard coefficient for the entries of M.

    :param M: sparse matrix offering ``tocsr()``/``tocoo()``; after the
        conversion its ``row``/``col`` arrays give the pairs to score.
    :return: ``(src, dst, coeff)`` -- three parallel lists, one entry per
        stored matrix entry.
    """
    M = M.tocsr()
    M = M.tocoo()
    edges = list(zip(M.row, M.col))
    # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this
    # explicitly
    print('Format conversion ... ')

    # Directed NetworkX graph
    G = nx.DiGraph(M)
    Gnx = G.to_undirected()

    # Networkx Jaccard Call
    print('Solving... ')
    t1 = time.time()
    # jaccard_coefficient returns a lazy iterator; materialise it inside the
    # timed region so the reported time covers the actual computation.
    preds = list(nx.jaccard_coefficient(Gnx, edges))
    t2 = time.time() - t1

    print('Time : '+str(t2))
    src = [u for u, _, _ in preds]
    dst = [v for _, v, _ in preds]
    coeff = [p for _, _, p in preds]
    return src, dst, coeff
Beispiel #14
0
def networkx_call(M, benchmark_callable=None):
    """Return Sorensen coefficients derived from networkx Jaccard scores.

    Each row of ``M`` contributes both ``(source, target)`` and
    ``(target, source)``; the de-duplicated, sorted pairs are scored with
    ``nx.jaccard_coefficient`` and each score ``p`` is converted to
    ``2p / (1 + p)``.

    :param M: edge table with columns ``"0"``, ``"1"`` and ``"weight"``.
    :param benchmark_callable: optional callable invoked as
        ``benchmark_callable(nx.jaccard_coefficient, Gnx, edges)`` in place
        of the direct call.
    :return: ``(src, dst, coeff)`` parallel lists.
    """
    sources = M["0"]
    destinations = M["1"]
    both_directions = []
    for i in range(len(M)):
        both_directions += [
            (sources[i], destinations[i]),
            (destinations[i], sources[i]),
        ]
    # dict.fromkeys drops duplicate pairs before sorting.
    edges = sorted(dict.fromkeys(both_directions))
    # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this
    # explicitly
    print("Format conversion ... ")

    Gnx = nx.from_pandas_edgelist(
        M, source="0", target="1", edge_attr="weight", create_using=nx.Graph()
    )

    # Networkx Jaccard Call
    print("Solving... ")
    if benchmark_callable is None:
        preds = nx.jaccard_coefficient(Gnx, edges)
    else:
        preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges)

    scored = list(preds)
    src = [u for u, _, _ in scored]
    dst = [v for _, v, _ in scored]
    # Conversion from Networkx Jaccard to Sorensen
    # No networkX equivalent
    coeff = [(2*p)/(1+p) for _, _, p in scored]
    return src, dst, coeff
Beispiel #15
0
def get_test_features():
    """Build a feature row for every pair in the module-level ``test_edges``.

    Each row holds the resource allocation index followed by the Jaccard
    coefficient of the pair, both computed on the module-level graph ``G``.
    If scoring a pair fails, a message with its 1-based position is printed
    and the row keeps whatever was collected before the failure.

    :return: list of feature lists, one per entry of ``test_edges``.
    """
    features = []
    print("Generating test data features......")
    for count, temp_data in enumerate(test_edges):
        if count % 100 == 0:
            print(count)
        feature = []
        try:
            for scorer in (nx.resource_allocation_index,
                           nx.jaccard_coefficient):
                for _, _, p in scorer(G, [temp_data]):
                    feature.append(p)
        except Exception:
            # Catch scoring errors only; a bare except would also swallow
            # KeyboardInterrupt and SystemExit.
            print("one error at: " + str(count + 1))

        features.append(feature)
    print("positive features: " + str(len(features)))
    return features
def new_connections_predictions():
    """Predict the probability of each unlabeled future connection.

    Works on the module-level ``future_connections`` frame (indexed by node
    pair) and graph ``G``.  Four feature columns are added to that frame;
    rows with a known ``'Future Connection'`` train a logistic regression on
    min-max scaled features, and rows where it is null are scored.

    :return: ``pd.Series`` of probabilities rounded to two decimals, indexed
        like the unlabeled rows and sorted in descending order.
    """
    df = future_connections

    # The three networkx scorers yield (u, v, score) for each index pair.
    networkx_features = (
        ('jaccard_coefficient', nx.jaccard_coefficient),
        ('resource_allocation_index', nx.resource_allocation_index),
        ('preferential_attachment', nx.preferential_attachment),
    )
    for column, scorer in networkx_features:
        df[column] = [score for _, _, score in scorer(G, df.index)]

    def _count_common(ind):
        return len(list(nx.common_neighbors(G, ind[0], ind[1])))

    df['common_neighbors'] = df.index.map(_count_common)
    print('.......we have extracted all the features......')

    unlabeled = pd.isnull(df['Future Connection'])
    df_train = df[~unlabeled]
    df_test = df[pd.isnull(df['Future Connection'])]
    features = [
        'jaccard_coefficient', 'resource_allocation_index',
        'preferential_attachment', 'common_neighbors'
    ]
    X_train = df_train[features]
    Y_train = df_train['Future Connection']
    X_test = df_test[features]

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = LogisticRegression(solver='liblinear', random_state=14)
    clf.fit(X_train_scaled, Y_train)
    predictions = np.round(clf.predict_proba(X_test_scaled)[:, 1], 2)

    results = pd.Series(data=predictions, index=X_test.index)
    return results.sort_values(ascending=False)


# print (new_connections_predictions())
def link_prediction_with_metrics(subgraph, tuples, df):
    """Print ROC-AUC and average precision for four link predictors.

    The Jaccard coefficient, Adamic-Adar index, preferential attachment and
    resource allocation index are evaluated in that order.  For each one the
    scored pairs are turned into predictions by ``create_test_data`` and
    compared against ``df['link']``.

    :param subgraph: networkx graph the predictors run on.
    :param tuples: node pairs to score.
    :param df: frame whose ``'link'`` column holds the true labels.
    """

    def _report(label, scorer):
        # Score, convert, then print both metrics in a single print call.
        y_test = create_test_data(list(scorer(subgraph, tuples)))
        roc = roc_auc_score(df['link'], y_test)
        avg_precision = average_precision_score(df['link'], y_test)
        print(
            f"ROC AUC Score with {label}: {roc}\n"
            f"Average Precision with {label}: {avg_precision}"
        )

    _report("Jaccard Coefficient", nx.jaccard_coefficient)
    _report("Adamic Adar Index", nx.adamic_adar_index)
    _report("Preferential Attachment", nx.preferential_attachment)
    _report("Resource Allocation Index", nx.resource_allocation_index)
 def jaccard_coefficient(graph, author_osn_id, labeled_author_osn_id):
     """Return the Jaccard coefficient of the two authors in ``graph``.

     Directed graphs are not scored; 0 is returned for them.  Otherwise the
     single-pair networkx iterator is reduced to a score by
     ``LinkPredictionStaticFunctions.get_score_from_iterator``.
     """
     if nx.is_directed(graph):
         return 0
     scores = nx.jaccard_coefficient(
         graph, [(author_osn_id, labeled_author_osn_id)])
     return LinkPredictionStaticFunctions.get_score_from_iterator(scores)
Beispiel #19
0
    def create_features(self, G_train, edge_bunch):
        """Build the feature matrix for a collection of node pairs.

        Each row holds, in order: degree of the first node, degree of the
        second node, their product, number of common neighbours, Jaccard
        coefficient, Adamic-Adar index, PageRank of the first node and
        PageRank of the second node.  A running count is printed every
        1,000,000 pairs.

        :param G_train: networkx graph the features are computed on.
        :param edge_bunch: iterable of ``(u, v)`` node pairs.
        :return: ``np.array`` with one row per pair.
        """
        # Prefer pagerank_scipy where it exists; fall back to nx.pagerank on
        # networkx releases that no longer ship it.
        pagerank_fn = getattr(nx, 'pagerank_scipy', nx.pagerank)
        page_rank = pagerank_fn(G_train)

        X = []
        for i, pair in enumerate(edge_bunch, 1):
            commmon_neighbors = len(
                list(nx.common_neighbors(G_train, pair[0], pair[1])))
            # The builtin next() works on Python 2 and 3; the .next() method
            # exists only on Python 2 generators.
            jaccard_coefficient = next(
                nx.jaccard_coefficient(G_train, [pair]))[2]
            adamic_adar = next(nx.adamic_adar_index(G_train, [pair]))[2]
            degree_0 = nx.degree(G_train, pair[0])
            degree_1 = nx.degree(G_train, pair[1])

            X.append([
                degree_0,
                degree_1,
                degree_0 * degree_1,
                commmon_neighbors,
                jaccard_coefficient,
                adamic_adar,
                page_rank[pair[0]],
                page_rank[pair[1]],
            ])

            if i % 1000000 == 0:
                print(i)

        return np.array(X)
Beispiel #20
0
def networkx_call(M, benchmark_callable=None):
    """Return Sorensen coefficients derived from networkx Jaccard scores.

    Both directions of every row of ``M`` are collected, de-duplicated and
    sorted, scored with ``nx.jaccard_coefficient``, and each score ``p`` is
    converted to ``2p / (1 + p)``.

    :param M: edge table with source column ``"0"`` and target column
        ``"1"``.
    :param benchmark_callable: optional callable invoked as
        ``benchmark_callable(nx.jaccard_coefficient, Gnx, edges)`` in place
        of the direct call.
    :return: list of converted coefficients in sorted pair order.
    """
    sources = M["0"]
    destinations = M["1"]
    both_directions = []
    for i in range(len(sources)):
        both_directions += [
            (sources[i], destinations[i]),
            (destinations[i], sources[i]),
        ]
    # dict.fromkeys drops duplicate pairs before sorting.
    edges = sorted(dict.fromkeys(both_directions))
    # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this
    # explicitly
    print("Format conversion ... ")

    # NetworkX graph
    Gnx = nx.from_pandas_edgelist(M,
                                  source="0",
                                  target="1",
                                  create_using=nx.Graph())
    # Networkx Jaccard Call
    print("Solving... ")
    if benchmark_callable is None:
        preds = nx.jaccard_coefficient(Gnx, edges)
    else:
        preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges)

    # FIXME: Use known correct values of WSorensen for few graphs,
    # hardcode it and compare to Cugraph WSorensen
    # to get a more robust test

    # Conversion from Networkx Jaccard to Sorensen
    return [(2 * p) / (1 + p) for _, _, p in preds]
Beispiel #21
0
def test_jaccard_two_hop_edge_vals(managed, pool, graph_file):
    """Check cugraph Jaccard against networkx on two-hop neighbour pairs.

    The graph (with edge values) is loaded into cugraph from an adjacency
    list and into networkx from the same matrix; every coefficient must
    agree within 1e-6.
    """
    gc.collect()

    rmm.reinitialize(managed_memory=managed,
                     pool_allocator=pool,
                     initial_pool_size=2 << 27)

    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file).tocsr()
    Gnx = nx.DiGraph(M).to_undirected()

    G = cugraph.Graph()
    row_offsets = cudf.Series(M.indptr)
    col_indices = cudf.Series(M.indices)
    values = cudf.Series(M.data)
    G.from_cudf_adjlist(row_offsets, col_indices, values)

    pairs = G.get_two_hop_neighbors()
    nx_pairs = [
        (pairs['first'][i], pairs['second'][i]) for i in range(len(pairs))
    ]
    nx_coeff = [p for _, _, p in nx.jaccard_coefficient(Gnx, nx_pairs)]

    df = cugraph.jaccard(G, pairs['first'], pairs['second'])
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        assert abs(nx_coeff[i] - df['jaccard_coeff'][i]) < 1.0e-6
Beispiel #22
0
def create_related_gene_network(gene_network_nx, threshold, genes):
    """Build the related-gene network (RGN) restricted to ``genes``.

    Existing edges of ``gene_network_nx`` whose endpoints are both in
    ``genes`` are copied with weight 1.0.  Pairs scored by
    ``nx.jaccard_coefficient`` whose coefficient is strictly greater than
    ``threshold`` and whose endpoints are both in ``genes`` are added with
    the coefficient as weight.

    :param gene_network_nx: source networkx graph.
    :param threshold: minimum (exclusive) coefficient for a predicted edge.
    :param genes: container of gene identifiers to keep.
    :return: the new ``nx.Graph``.
    """
    rgn = nx.Graph()

    # Known interactions between selected genes get full weight.
    for g1, g2 in gene_network_nx.edges:
        if not (g1 in genes and g2 in genes):
            continue
        rgn.add_edge(g1, g2, weight=1.0)

    # Predicted interactions: keep only strong ones between selected genes.
    for g1, g2, coefficient in nx.jaccard_coefficient(gene_network_nx):
        if not coefficient > threshold:
            continue
        if g1 in genes and g2 in genes:
            rgn.add_edge(g1, g2, weight=coefficient)

    # Plotting of rgn (circular layout with edge-weight labels) is
    # intentionally not performed here.
    return rgn
Beispiel #23
0
def test_jaccard_two_hop_edge_vals(managed, pool, graph_file):
    """Check cugraph Jaccard against networkx on two-hop neighbour pairs.

    Both libraries load the same weighted edge list; every coefficient must
    agree within 1e-6.
    """
    gc.collect()

    rmm.reinitialize(
        managed_memory=managed,
        pool_allocator=pool,
        initial_pool_size=2 << 27
    )

    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    Gnx = nx.from_pandas_edgelist(
        M, source='0', target='1', edge_attr='weight',
        create_using=nx.Graph()
    )
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')

    pairs = G.get_two_hop_neighbors()
    nx_pairs = [
        (pairs['first'][i], pairs['second'][i]) for i in range(len(pairs))
    ]
    nx_coeff = [p for _, _, p in nx.jaccard_coefficient(Gnx, nx_pairs)]

    # The frame is sorted but not re-indexed, and rows are read back through
    # ['jaccard_coeff'][i] below.
    df = cugraph.jaccard(G, pairs).sort_values(by=['source', 'destination'])
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        assert abs(nx_coeff[i] - df['jaccard_coeff'][i]) < 1.0e-6
Beispiel #24
0
def test_jaccard_two_hop(managed, pool, graph_file):
    """Check cugraph Jaccard against networkx on two-hop neighbour pairs.

    The graph is loaded from a matrix file into cugraph (adjacency list,
    no edge values) and into networkx; every coefficient must agree within
    1e-6.
    """
    gc.collect()

    # Restart RMM with the requested allocator configuration.
    rmm.finalize()
    rmm_cfg.use_managed_memory = managed
    rmm_cfg.use_pool_allocator = pool
    rmm.initialize()

    assert rmm.is_initialized()

    M = read_mtx_file(graph_file).tocsr()
    Gnx = nx.DiGraph(M).to_undirected()

    G = cugraph.Graph()
    row_offsets = cudf.Series(M.indptr)
    col_indices = cudf.Series(M.indices)
    G.add_adj_list(row_offsets, col_indices, None)

    pairs = G.get_two_hop_neighbors()
    nx_pairs = [
        (pairs['first'][i], pairs['second'][i]) for i in range(len(pairs))
    ]
    nx_coeff = [p for _, _, p in nx.jaccard_coefficient(Gnx, nx_pairs)]

    df = cugraph.jaccard(G, pairs['first'], pairs['second'])
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        assert abs(nx_coeff[i] - df['jaccard_coeff'][i]) < 1.0e-6
Beispiel #25
0
def compare_sorensen_two_hop(G, Gnx):
    """
    Compute both cugraph and nx sorensen after extracting the two hop neighbors
    from G and compare both results.

    The networkx side is derived from the Jaccard coefficient ``p`` as
    ``2p / (1 + p)``; every value must agree with cugraph within 1e-6.
    """
    two_hop = G.get_two_hop_neighbors()
    pairs = two_hop.sort_values(["first", "second"]).reset_index(drop=True)

    nx_pairs = list(pairs.to_records(index=False))
    # FIXME: Use known correct values of Sorensen for few graphs,
    # hardcode it and compare to Cugraph Sorensen to get a more robust test

    # Conversion from Networkx Jaccard to Sorensen
    # No networkX equivalent
    nx_coeff = [
        (2*p)/(1+p) for _, _, p in nx.jaccard_coefficient(Gnx, nx_pairs)
    ]

    df = cugraph.sorensen(G, pairs)
    df = df.sort_values(by=["source", "destination"]).reset_index(drop=True)
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        assert abs(nx_coeff[i] - df["sorensen_coeff"].iloc[i]) < 1.0e-6
Beispiel #26
0
def edgefeat(g, norm=False, fil='ricci'):
    """
    wrapper for edge_probability and ricciCurvature computation

    Node labels are first converted to integers so they can be used as
    matrix indices. An unrecognised ``fil`` leaves the result all zeros.

    :param g: graph (must be connected)
    :param norm: whether to divide the result by its largest absolute value
    :param fil:  edge_p/ricci/jaccard
    :return: gp, a dense numpy array of shape (n_node, n_node)
    :raises AssertionError: if g is not connected or the result is not
        symmetric
    """
    g = nx.convert_node_labels_to_integers(g)
    assert nx.is_connected(g)
    # adjacency_matrix is the canonical name; adj_matrix is only an alias
    # that some networkx releases provide.
    adj_m = nx.adjacency_matrix(g).todense()  # dense matrix
    gp = np.zeros((len(g), len(g)))
    try:
        if fil == 'edge_p':
            gp = np.array(smoother(adj_m, h=0.3))
            gp = np.multiply(adj_m, gp)
        elif fil == 'ricci':
            g = ricciCurvature(g, alpha=0.5, weight='weight')
            ricci_dict = nx.get_edge_attributes(g, 'ricciCurvature')
            for u, v in ricci_dict.keys():
                gp[u][v] = ricci_dict[(u, v)]
            gp += gp.T  # mirror the one-sided fill to make gp symmetric
        elif fil == 'jaccard':
            jac_list = nx.jaccard_coefficient(g, g.edges(
            ))  # important since jaccard can also be defined on non edge
            for u, v, jac in jac_list:
                gp[u][v] = jac
            gp += gp.T  # mirror the one-sided fill to make gp symmetric
    except AssertionError:
        print('Have not implemented fil %s. Treat as all zeros' % fil)
        gp = np.zeros((len(g), len(g)))
    assert (gp == gp.T).all()
    if norm:
        # The builtin max() cannot reduce a 2-D array (it compares whole
        # rows and raises ValueError); use the array-wide maximum instead.
        scale = float(np.abs(gp).max())
        if scale > 0:  # an all-zero matrix is already normalised
            gp = gp / scale
    return gp
Beispiel #27
0
def SimilarityMeasures(G):
    """Print several similarity measures for a fixed set of node pairs.

    In order: resource allocation index, the sorted common neighbours of
    nodes 1 and 2, Jaccard coefficient, Adamic-Adar index and preferential
    attachment.  Each section is followed by a line of asterisks.

    :param G: networkx graph containing the nodes 1..6 used below.
    """
    node_pairs = [(1, 2), (3, 4), (1, 4), (5, 6), (3, 5)]
    divider = '****************************'

    def _show(scorer):
        # One "(u, v) -> score" line per pair, then the divider.
        for u, v, p in scorer(G, node_pairs):
            print('(%d, %d) -> %.8f' % (u, v, p))
        print(divider)

    # resource_allocation_index
    _show(nx.resource_allocation_index)

    # Common neighbours
    print(sorted(nx.common_neighbors(G, 1, 2)))
    print(divider)

    # jaccard coefficient
    _show(nx.jaccard_coefficient)

    # AdamicAdar
    _show(nx.adamic_adar_index)

    # Preferential Attachment (PA)
    _show(nx.preferential_attachment)
Beispiel #28
0
def networkx_call(M, benchmark_callable=None):
    """Return networkx Jaccard coefficients for both directions of M's edges.

    Both directions of every row of ``M`` are collected, de-duplicated and
    sorted before being scored with ``nx.jaccard_coefficient``.

    :param M: edge table with source column ``"0"`` and target column
        ``"1"``.
    :param benchmark_callable: optional callable invoked as
        ``benchmark_callable(nx.jaccard_coefficient, Gnx, edges)`` in place
        of the direct call.
    :return: list of coefficients in sorted pair order.
    """
    sources = M["0"]
    destinations = M["1"]
    both_directions = []
    for i in range(len(sources)):
        both_directions += [
            (sources[i], destinations[i]),
            (destinations[i], sources[i]),
        ]
    # dict.fromkeys drops duplicate pairs before sorting.
    edges = sorted(dict.fromkeys(both_directions))
    # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this
    # explicitly
    print("Format conversion ... ")

    # NetworkX graph
    Gnx = nx.from_pandas_edgelist(M,
                                  source="0",
                                  target="1",
                                  create_using=nx.Graph())
    # Networkx Jaccard Call
    print("Solving... ")
    if benchmark_callable is None:
        preds = nx.jaccard_coefficient(Gnx, edges)
    else:
        preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges)

    return [p for _, _, p in preds]
def feature_calculate(g, data, column_names, save_to):
    """Append graph features to every row of ``data`` and write a CSV.

    For each row (whose first two entries are the node pair) four values are
    inserted just before the row's last element, in this order: shortest
    path length (number of nodes on the weighted shortest path, 99999 if
    none can be found), Jaccard coefficient, preferential attachment and
    resource allocation index.  A score that cannot be computed is recorded
    as -1.  A header row built from ``column_names`` is then inserted at the
    front of ``data`` and the result is written with ``ut.write_list_csv``.

    Both the rows and ``data`` itself are modified in place.

    :param g: networkx graph.
    :param data: list of mutable rows (lists).
    :param column_names: original column titles.
    :param save_to: destination passed to ``ut.write_list_csv``.
    """

    def _score(scorer, pair):
        # Score one pair at a time so that a failing pair yields -1 instead
        # of aborting a generator shared by all rows.
        try:
            return next(iter(scorer(g, [pair])))[2]
        except Exception:
            return -1

    total = len(data)
    for current, row in enumerate(data):
        pair = (row[0], row[1])
        thisjaccard = _score(nx.jaccard_coefficient, pair)
        thispreferential = _score(nx.preferential_attachment, pair)
        thisrai = _score(nx.resource_allocation_index, pair)

        if current % 1000 == 0:
            ut.log("calculating {}/{}...".format(current, total))

        try:
            path_length = len(nx.shortest_path(g, row[0], row[1], 'weight'))
        except Exception:
            # No path / unknown node: keep the sentinel length.
            path_length = 99999

        # insert(-1, ...) keeps the row's last element in last position.
        row.insert(-1, path_length)  # shortest path
        row.insert(-1, thisjaccard)  # jaccard
        row.insert(-1, thispreferential)  # preferential
        row.insert(-1, thisrai)  # rai

    original_columns_titles = list(column_names)
    for title in ("shortest_path_count", "jaccard", "preferential", "rai"):
        original_columns_titles.insert(-1, title)

    data.insert(0, original_columns_titles)

    ut.write_list_csv(save_to, data)
def jaccard_cancel_list(nodes, G, effort):
    """Return the leading share of low-Jaccard edges selected for removal.

    The module-level ``jaccard_list`` acts as a cache.  When it holds at
    most one entry it is (re)filled with every ``(node, successor)`` edge of
    ``G`` whose Jaccard coefficient, computed on an undirected copy of
    ``G``, is below 0.4.

    :param nodes: unused.
    :param G: directed networkx graph (must provide ``successors``).
    :param effort: percentage (0-100) of ``jaccard_list`` to return.
    :return: the first ``int(len(jaccard_list) * effort / 100) - 1`` entries
        of ``jaccard_list`` (never fewer than zero); an empty list when
        ``effort`` is 0.
    """
    global jaccard_list
    print("\n \t Calculating jaccard coefficents", end="")

    if len(jaccard_list) <= 1:
        # Build the undirected view once instead of once per edge.
        undirected = nx.Graph(G)
        for node in G:
            for successor in G.successors(node):
                jaccard = nx.jaccard_coefficient(undirected,
                                                 [(node, successor)])
                for _, _, jacc_coff in jaccard:
                    if jacc_coff < 0.4:
                        jaccard_list.append((node, successor))

    if effort == 0:
        return list()

    # Clamp at 0: a negative bound would make the slice return almost the
    # whole list instead of nothing.
    # NOTE(review): the "- 1" is kept from the original -- confirm it is
    # intended rather than an off-by-one.
    max_number_of_edges = max(
        0, int(len(jaccard_list) * (effort / 100)) - 1)
    return jaccard_list[0:max_number_of_edges]
Beispiel #31
0
def test_jaccard_two_hop_edge_vals(graph_file):
    """Compare cugraph Jaccard scores of two-hop pairs with NetworkX's scores.

    The same edge list is loaded into a NetworkX graph and a cugraph graph;
    the sorted two-hop neighbour pairs of the cugraph graph are scored by
    both libraries and the scores must agree to within 1e-6.
    """
    gc.collect()

    nx_edges = utils.read_csv_for_nx(graph_file)
    cu_edges = utils.read_csv_file(graph_file)

    Gnx = nx.from_pandas_edgelist(
        nx_edges,
        source="0",
        target="1",
        edge_attr="weight",
        create_using=nx.Graph(),
    )
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_edges, source="0", destination="1", edge_attr="2")

    pairs = G.get_two_hop_neighbors()
    pairs = pairs.sort_values(["first", "second"]).reset_index(drop=True)

    # Reference scores from NetworkX, in the same (sorted) pair order.
    nx_pairs = [
        (pairs["first"].iloc[k], pairs["second"].iloc[k])
        for k in range(len(pairs))
    ]
    nx_coeff = [score for _, _, score in nx.jaccard_coefficient(Gnx, nx_pairs)]

    df = cugraph.jaccard(G, pairs)
    df = df.sort_values(by=["source", "destination"]).reset_index(drop=True)

    assert len(nx_coeff) == len(df)
    for k, expected in enumerate(nx_coeff):
        diff = abs(expected - df["jaccard_coeff"].iloc[k])
        assert diff < 1.0e-6
def make_jacc_predG(dirG, testG):
    """Score the arcs of ``testG`` with the Jaccard coefficient of ``dirG``.

    The Jaccard coefficient of every edge of ``testG`` is computed on the
    undirected version of ``dirG`` and stored as the edge weight of an
    undirected helper graph, which is then handed to ``make_predG_from_jacc``.

    :param dirG: the original digraph.
    :param testG: the digraph to test against.
    :return: the result of ``make_predG_from_jacc`` -- per the original
        comment, a predicted digraph with arcs matching ``testG``, each
        carrying its predicted likelihood according to the Jaccard coefficient.
    """
    undG = dirG.to_undirected()
    undir_jaccs = nx.Graph()
    # ``edges()`` exists in every NetworkX release, whereas ``edges_iter()``
    # was removed in 2.0.
    undir_jaccs.add_weighted_edges_from(
        nx.jaccard_coefficient(undG, testG.edges()))
    return make_predG_from_jacc(undir_jaccs, dirG, testG)
def Jaccard_coef(features, G):
    """Return the Jaccard coefficient of each (From, To) pair in ``features``.

    :param features: table with ``'From'`` and ``'To'`` columns holding the
        two endpoints of each pair.
    :param G: graph on which the coefficients are computed.
    :return: list of coefficients, one per row, in row order.
    """
    # Pair the columns positionally so rows are visited in order whatever the
    # index labels are (``features['From'][i]`` is a label lookup and breaks
    # on a non-default index). The pairs are materialised as a list because
    # some NetworkX versions iterate the ebunch more than once.
    pairs = list(zip(features['From'], features['To']))
    return [score for _, _, score in nx.jaccard_coefficient(G, pairs)]
Beispiel #34
0
def sort_edges_by_jaccard_index(graph, edges):
    """Sort ``edges`` by decreasing Jaccard coefficient.

    Scores are ordered with the module's ``compare_with_ties`` comparison
    function. The share of edges that do not carry a distinct score
    ("degeneracy") is printed.

    :param graph: graph on which the coefficients are computed.
    :param edges: sized collection of ``(u, v)`` pairs.
    :return: ``(pairs, scores)`` -- the sorted ``(u, v)`` pairs and their
        coefficients, in matching order.
    """
    # Function-scope import: ``sorted`` lost its ``cmp=`` keyword in Python 3,
    # so the comparison function is adapted into a key instead.
    import functools

    tie_aware_key = functools.cmp_to_key(compare_with_ties)
    edges_sorted = sorted(nx.jaccard_coefficient(graph, edges),
                          key=lambda scored: tie_aware_key(scored[2]),
                          reverse=True)

    unique_j_edges = len({score for _, _, score in edges_sorted})
    total_j_edges = len(edges)
    if total_j_edges:
        # Skipped for empty input, where the ratio is undefined.
        print("Degeneracy=  %s"
              % (1.0 - float(unique_j_edges) / float(total_j_edges)))
    return ([(u, v) for u, v, _ in edges_sorted],
            [score for _, _, score in edges_sorted])
def calculate_jaccard_similarity(graph):
	"""Print the edge of ``graph`` that has the highest Jaccard coefficient.

	The edge is printed as the string ``"u,v"`` followed by its coefficient.
	Nothing is printed when the graph has no edges.
	"""
	hash_new = {}
	for u, v, p in nx.jaccard_coefficient(graph, graph.edges()):
		hash_new[str(u) + ',' + str(v)] = p
	if not hash_new:
		# max() of an empty mapping would raise ValueError.
		return
	# One pass over the mapping yields both the pair and its score;
	# items() works on Python 2 and 3, unlike iteritems().
	arr_nodes, arr_value = max(hash_new.items(), key=operator.itemgetter(1))
	print("nodes")
	print(arr_nodes)
	print("Jaccard Coefficient")
	print(arr_value)
def _scale_to_ten(values):
    """Rescale ``values`` linearly so that the largest one becomes 10."""
    peak = max(values)
    if peak == 0:
        # Nothing to scale against; avoids a ZeroDivisionError.
        return list(values)
    factor = 10.0 / peak
    return [value * factor for value in values]


def _save_loglog(series, label, distances, path):
    """Plot ``series`` (blue) against ``distances`` (red) on log-log axes and save it as PNG."""
    plt.loglog(series, color='b', label=label)
    plt.loglog(distances, color='r', label='distances')
    plt.savefig(path, format='png')
    plt.close()


def graph_stats(distance_couple, net):
    """Plot four node-pair similarity measures against the pairs' distances.

    For every item of ``distance_couple`` the number of common neighbours,
    the Jaccard coefficient, the Adamic-Adar index and the edge betweenness
    of the pair are computed on ``net``. Each series (and the distances) is
    rescaled so that its maximum is 10, then written as a log-log plot to
    ``node_similarity/stats_{cm,j,aa,eb}.png``.

    :param distance_couple: iterable of items where ``item[0]`` is a node pair
        ``(u, v)`` and ``item[1]`` is the pair's distance. Each pair must be
        an edge of ``net`` (in either orientation) for the betweenness lookup.
    :param net: NetworkX graph.
    :return: None. Nothing is plotted when ``distance_couple`` is empty.
    """
    distances = []
    shared_neighbors = []
    jaccard = []
    adamic = []
    edge_bet = []
    edge_betweeness = nx.edge_betweenness_centrality(net)
    for couple in distance_couple:
        pair = couple[0]
        u, v = pair[0], pair[1]
        distances.append(couple[1])
        shared_neighbors.append(len(list(nx.common_neighbors(net, u, v))))
        jaccard.append(list(nx.jaccard_coefficient(net, [(u, v)]))[0][2])
        adamic.append(list(nx.adamic_adar_index(net, [(u, v)]))[0][2])
        try:
            edge_bet.append(edge_betweeness[pair])
        except KeyError:
            # The centrality dict may key the edge in the opposite orientation.
            edge_bet.append(edge_betweeness[(v, u)])

    if not distances:
        # max() of an empty series would raise ValueError.
        return

    distances = _scale_to_ten(distances)
    series = (
        (_scale_to_ten(shared_neighbors), 'common_neighbors',
         'node_similarity/stats_cm.png'),
        (_scale_to_ten(jaccard), 'jaccard', 'node_similarity/stats_j.png'),
        (_scale_to_ten(adamic), 'adamic', 'node_similarity/stats_aa.png'),
        (_scale_to_ten(edge_bet), 'edge betwenness',
         'node_similarity/stats_eb.png'),
    )
    for values, label, path in series:
        _save_loglog(values, label, distances, path)
	def users_to_recommend(self, nb_reco_user=5):
		"""
		Compute the authors to recommend to the user based on link prediction and similarity.

		Every graph node other than the user and the authors already liked is
		scored with the Jaccard coefficient against the user; the best
		``nb_reco_user`` candidates are passed to ``rerank_authors``.

		:param nb_reco_user: number of users to recommend
		:return: list of the ``'author_name'`` values of the re-ranked candidates
		"""
		authors = set(self.graph.nodes())
		authors.remove(self.user.id)
		# difference_update tolerates liked authors that are duplicated or not
		# present in the graph, where remove() raised KeyError.
		authors.difference_update(self.authors_liked)
		ebunch = [(self.user.id, author) for author in authors]

		preds = nx.jaccard_coefficient(self.graph, ebunch)
		reco_prio = [{'author_name': a, 'prio': p} for _, a, p in preds]

		reco_prio = sorted(reco_prio, key=lambda k: k['prio'], reverse=True)[:nb_reco_user]
		reco_prio = self.rerank_authors(reco_prio)

		return [x['author_name'] for x in reco_prio]
import networkx as nx

# Print the pair of non-adjacent nodes with the highest Jaccard coefficient.
G = nx.read_graphml('networkx_graph.graphml')
undirected_G = G.to_undirected()
jaccard_similarity = nx.jaccard_coefficient(undirected_G)
# Each item is a (u, v, score) triple, so the score sits at index 2;
# index 3 does not exist and raised IndexError.
sorted_js = sorted(jaccard_similarity, key=lambda tup: tup[2], reverse=True)
if sorted_js:
    print("most similar nodes by jaccard similarity: " + str(sorted_js[0]))
Beispiel #39
0
import networkx as nx 
import matplotlib.pyplot as plt
import operator

# The second positional parameter of read_edgelist is ``comments``, not a file
# mode, so the former 'rb' argument silently became the comment marker.
G = nx.read_edgelist("anoncsv.csv", delimiter=',')

# Each centrality call returns a {node: score} dict; sort its items by score.
# Iterating the dict itself yields node ids only, so itemgetter(2) picked the
# third character of each id instead of the score.

# PageRank
c = nx.pagerank(G)
sorted_c = sorted(c.items(), key=operator.itemgetter(1))

# Eigenvector centrality
c = nx.eigenvector_centrality(G)
sorted_c = sorted(c.items(), key=operator.itemgetter(1))

# Degree centrality
c = nx.degree_centrality(G)
sorted_c = sorted(c.items(), key=operator.itemgetter(1))

# Jaccard similarity of every pair of non-adjacent nodes
j = nx.jaccard_coefficient(G)

for u, v, p in j:
	print("%s %s %s" % (u, v, p))
	print('\n')
## The following lines compute the rank correlation among the three centrality
## measures. The values are also stored in the rank_correlations.txt file.

# The ``with`` block closes the report even if a correlation call raises.
with open("C:\\Users\\tyagi\\Desktop\\Fall2015\\SMM\\Project 1\\Phase 1\\P3\\rank_correlations.txt",'w') as f:
    for first, second, xs, ys in (("PG", "EV", pg, ev),
                                  ("PG", "DC", pg, deg),
                                  ("EV", "DC", ev, deg)):
        tau, p_value = st.kendalltau(xs, ys)
        print("Rank correlation between %s and %s : (Tau = '%f', p_value = '%f')" % (first, second, tau, p_value))
        f.write("Rank correlation between %s and %s : Tau = " % (first, second)
                + str(tau) + ', p_value = ' + str(p_value) + '\n')

# Similarity using the Jaccard coefficient

print("Finding jaccard similarity :")

# jaccard_coefficient() returns an iterator over (u, v, p) triples: the two end
# nodes of an edge and their Jaccard coefficient. They are collected in the
# list jac_coef.
jac_coef = list(nx.jaccard_coefficient(h, h.edges()))
# Sort on the coefficient itself; a plain tuple sort orders by node id first.
jac_coef.sort(key=lambda triple: triple[2], reverse=True)
print("Top two pair of nodes in terms of jaccard coefficient are :")
# Slicing copes with graphs that have fewer than two edges.
for top_pair in jac_coef[:2]:
    print(top_pair)

import networkx as nx
import operator
# calculate jaccard coefficient
import csv
# Build the graph from the CSV edge list (first two columns of each row).
G = nx.Graph()
edge_list = []
# Text mode with newline='' is what the csv module expects on Python 3;
# the ``with`` block closes the file even if a row is malformed.
with open('anonymized_edge_list.csv', newline='') as fe:
    reader = csv.reader(fe)
    for x in reader:
        if len(x) < 2:
            # Blank or truncated row: no edge to add.
            continue
        print(x)
        # Keep exactly two endpoints so the list is a valid ebunch even when
        # the row carries extra columns.
        edge_list.append((x[0], x[1]))
        G.add_edge(x[0], x[1])

# Write one "u, v, coefficient" row per input edge.
with open('jaccard.csv', 'w', newline='') as fja:
    writer = csv.writer(fja)
    array = nx.jaccard_coefficient(G, edge_list)
    for c, v, p in array:
        writer.writerow([c, v, p])
Beispiel #42
0
def link_prediction(G, query_nodes, target_nodes, n_edges, start_dist, alg = "ra"):
    """Rank candidate query-to-target links with a NetworkX link-prediction index.

    Every (query, target) pair that is not already an edge of ``G`` is
    scored. The ``n_edges`` best-scored links are then added one by one to a
    copy of ``G``, and the absorbing random-walk centrality is recorded after
    each addition.

    Parameters
    ----------
    G : Networkx graph
        The input graph. It is not modified; links are added to a copy.
    query_nodes : list
        The set of nodes from which the random walker starts. They are also
        used to index rows/columns of the transition matrix (presumably
        integer node ids that match matrix positions -- confirm with callers).
    target_nodes : list
        The set of nodes from where the random walker ends.
    n_edges : integer
        The number of links to be added (capped at the number of candidates).
    start_dist : matrix-like
        The starting distribution over the query set.
    alg : string
        The link-prediction algorithm: "ra" (resource allocation), "pa"
        (preferential attachment), "jaccard" or "aa" (Adamic-Adar).

    Returns
    -------
    links : list
        All candidate links as ``(u, v, score)``, sorted by decreasing score
        (not only the ones that were added).
    ac_scores : list
        The centrality before any addition, followed by its value after each
        added link.
    """
    assert alg in ["ra", "pa", "jaccard", "aa"], "alg must be one of [\"ra\", \"pa\", \"jaccard\", \"aa\"]."

    predictors = {
        "ra": nx.resource_allocation_index,
        "jaccard": nx.jaccard_coefficient,
        "aa": nx.adamic_adar_index,
        "pa": nx.preferential_attachment,
    }

    H = G.copy()
    map_query_to_org = {node: idx for idx, node in enumerate(query_nodes)}
    P = csc_matrix(nx.google_matrix(H, alpha=1))
    P_abs = P[list(query_nodes),:][:,list(query_nodes)]
    F = compute_fundamental(P_abs)
    # Same expression as the per-link score below, so all entries of
    # ac_scores are comparable (the previous F.sum() collapsed F to a scalar).
    row_sums = start_dist.dot(F.sum(axis = 1))[0,0]

    eligible = [(u, v) for u, v in product(query_nodes, target_nodes)
                if not H.has_edge(u, v)]
    links_to_add = sorted(predictors[alg](H, eligible),
                          key=lambda link: link[2], reverse=True)

    ac_scores = [row_sums]
    # Slicing stops early when fewer than n_edges candidates exist; max()
    # keeps a negative n_edges from selecting links.
    for u, v, _ in links_to_add[:max(n_edges, 0)]:
        F = update_fundamental_mat(F, H, map_query_to_org, u)
        H.add_edge(u, v)
        ac_scores.append(start_dist.dot(F.sum(axis = 1))[0,0])
    return links_to_add, ac_scores
import networkx as nx
import heapq

# Read the comma-separated edge list into a directed graph.
with open('U_tuples.csv', 'r') as infile:
	G = nx.read_edgelist(infile, create_using=nx.DiGraph(), delimiter=',')

# Coefficients are computed lazily on the undirected version of the graph.
UG = G.to_undirected()
preds = nx.jaccard_coefficient(UG)

nd = dict()  # (u, v) -> Jaccard coefficient
count, check = 0, 10000  # pairs processed so far / count that triggers the next jac_check()

def jac_check():
	"""Write the best-scoring pair in ``nd`` to jaccard.txt if its coefficient is 1.

	Reads the module-level ``nd`` mapping of ``(u, v)`` to coefficient.
	Nothing is written when ``nd`` is empty or its maximum is not 1.
	"""
	ndmax = heapq.nlargest(1, nd, key=nd.get)
	if ndmax and nd[ndmax[0]] == 1:
		# Text mode: the payload is str, which a binary-mode handle rejects
		# on Python 3.
		with open('jaccard.txt', mode='w') as outfile:
			outfile.write("Jaccard Similarity Pair:	"+str(ndmax)+"\n")
			outfile.write("Jaccard Similarity Maximum:	"+str(nd[ndmax[0]])+"\n")

# Collect every coefficient, reporting progress for each pair and running
# jac_check() whenever ``count`` reaches ``check``.
for u,v,p in preds:
	nd[(u,v)] = p
	count+=1
	print('Jaccard:  %s' % count)
	if count == check:
		jac_check()
		# NOTE(review): the gap between checks grows with ``count`` (10000,
		# 30000, 70000, ...); confirm this is intended rather than a fixed
		# 10000-pair interval.
		check += 10000+count
print(nd)

#ndmax = []
#ndmax = heapq.nlargest(1,nd,nd.get)
#print ndmax