def adamic_adar_scores(g_train, train_test_split):
    if g_train.is_directed(): # Only works for undirected graphs
        g_train = g_train.to_undirected()

    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split # Unpack input

    start_time = time.time()
    
    aa_scores = {}

    # Calculate scores
    aa_matrix = np.zeros(adj_train.shape)
    for u, v, p in nx.adamic_adar_index(g_train, ebunch=get_ebunch(train_test_split)): # (u, v) = node indices, p = Adamic-Adar index
        aa_matrix[u][v] = p
        aa_matrix[v][u] = p # make sure it's symmetric
    aa_matrix = aa_matrix / aa_matrix.max() # Normalize matrix

    runtime = time.time() - start_time
    aa_roc, aa_ap = get_roc_score(test_edges, test_edges_false, aa_matrix)

    aa_scores['test_roc'] = aa_roc
    # aa_scores['test_roc_curve'] = aa_roc_curve
    aa_scores['test_ap'] = aa_ap
    aa_scores['runtime'] = runtime
    return aa_scores
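The get_ebunch helper above comes from the surrounding project; a minimal sketch of what it plausibly returns, given the train_test_split layout unpacked at the top of the function (hypothetical, for illustration only):

def get_ebunch(train_test_split):
    # Hypothetical helper: collect every validation/test pair (positive and
    # negative) as integer (u, v) tuples so adamic_adar_index can score them.
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split
    pairs = []
    for edge_list in (val_edges, val_edges_false, test_edges, test_edges_false):
        pairs.extend((int(u), int(v)) for u, v in edge_list)
    return pairs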
def getaa(i):
    t = time.time()
    l = list(nx.adamic_adar_index(g, E[i:i + step]))
    t = time.time() - t
    print('  finished edges [{: 7d}, {: 7d}) in {: 7.2f} s\n'
          .format(i, i + step, t), end='')
    sys.stdout.flush()
    return l
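getaa scores the global edge list E in [i, i + step) chunks, which makes it easy to fan the work out over a pool. A minimal driver sketch, assuming g, E and step are the globals used above:

from multiprocessing.pool import ThreadPool

def score_all_edges(workers=4):
    # Map each chunk start through getaa and flatten the (u, v, p) triples.
    with ThreadPool(workers) as pool:
        chunks = pool.map(getaa, range(0, len(E), step))
    return [triple for chunk in chunks for triple in chunk]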
def train_feature(fn_list, input_file='../../output/svm_feature_merge/training.txt(split)'):

    percentage_of_initial_adopter = 0.1

    g = load_graph('../../data/graph.txt')
    b = load_json('../../data/Business.txt')
    b = {b['business_id'][i]: ((b['latitude'][i], b['longitude'][i]), b['stars'][i]) for i in range(len(b['business_id']))}
    u_location = load_user_location('../../data/user_location.txt')
    t = load_idea(input_file)

    for fn in fn_list:
        fn = '../../output/svm_feature_merge/' + fn
        d = __load(fn)
        with open(fn+'(f0).txt', 'w') as f0:
            with open(fn+'(f1).txt', 'w') as f1:
                with open(fn+'(f2).txt', 'w') as f2:
                    with open(fn+'(f3).txt', 'w') as f3:
                        for k, v in d.items():
                            print(k)
                            user = [q for q, qq in v]
                            label = [qq for q, qq in v]

                            n = get_node_by_idea2(t, k)
                            print(len(n['date']))
                            index = sorted(range(len(n['date'])), key=lambda idx: n['date'][idx])
                            # level = [n['level'][i] for i in index]
                            node = [n['node'][i] for i in index]
                            initial_adopters = node[: int(percentage_of_initial_adopter*len(node))]

                            assert(len(set(user) & set(initial_adopters)) == 0)
                            # b_avg_stars = np.average([level[: 0.1*len(level)]])

                            for e, lb in zip(user,label):
                                features = []
                                # Distance with business
                                features.append(__distance(b[k][0], u_location[e]))

                                # Difference between user average stars and business average stars by adopters

                                # Average distance with initial adopters
                                features.append(np.average([__distance(u_location[a], u_location[e]) for a in initial_adopters]))

                                # Percentage of friends in initial adopters
                                features.append(float(sum([1 if g.has_edge(e, a) else 0 for a in initial_adopters]))/len(initial_adopters))

                                # Average Adamic-Adar index with the initial adopters
                                preds = nx.adamic_adar_index(g, [(e, a) for a in initial_adopters if g.has_node(a) and g.has_node(e)])
                                try:
                                    preds = [p for u, v, p in preds]
                                    preds = preds if preds else 0
                                except Exception:  # e.g. ZeroDivisionError when a common neighbor has degree 1
                                    preds = 0
                                features.append(np.average(preds))

                                f0.write('{0} {1} {2} {3}\n'.format(k, e, features[0], lb))
                                f1.write('{0} {1} {2} {3}\n'.format(k, e, features[1], lb))
                                f2.write('{0} {1} {2} {3}\n'.format(k, e, features[2], lb))
                                f3.write('{0} {1} {2} {3}\n'.format(k, e, features[3], lb))
Example 4
def predictLinksAdamicAdar(nodesAtHop, itemNodeIds, userNodeIds, directory, item):
    scores = {}
    GCombined = nx.read_edgelist(directory + 'Edge_List_Combined_' + item + '.txt')
    preds = nx.adamic_adar_index(GCombined)  # ebunch=None scores every non-adjacent pair

    for u, v, p in preds:
        if u not in scores:
            scores[u] = {}
        scores[u][v] = p
    
    with open(directory + 'AdamicAdar', 'wb') as outfile:
        pickle.dump(scores, outfile)
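A matching loader for the pickled score dictionary, assuming the same directory as above:

import pickle

def load_adamic_adar_scores(directory):
    # Returns the nested {u: {v: score}} dictionary written above.
    with open(directory + 'AdamicAdar', 'rb') as infile:
        return pickle.load(infile)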
def graph_stats(distance_couple, net):
    distances = []
    common_neighbors = []
    jaccard = []
    adamic = []
    edge_bet = []
    edge_betweenness = nx.edge_betweenness_centrality(net)
    for couple in distance_couple:
        distances.append(couple[1])
        common_neighbors.append(len(list(nx.common_neighbors(net, couple[0][0], couple[0][1]))))
        jaccard.append(list(nx.jaccard_coefficient(net, [(couple[0][0], couple[0][1])]))[0][2])
        adamic.append(list(nx.adamic_adar_index(net, [(couple[0][0], couple[0][1])]))[0][2])
        try:
            edge_bet.append(edge_betweenness[couple[0]])
        except KeyError:
            edge_bet.append(edge_betweenness[(couple[0][1], couple[0][0])])

    r_dist = 10.0/max(distances)
    r_n = 10.0/max(common_neighbors)
    r_j = 10.0/max(jaccard)
    r_a = 10.0/max(adamic)
    r_e = 10.0/max(edge_bet)

    distances = [j * r_dist for j in distances]
    common_neighbors = [j * r_n for j in common_neighbors]
    jaccard = [j * r_j for j in jaccard]
    adamic = [j * r_a for j in adamic]
    edge_bet = [j * r_e for j in edge_bet]

    plt.loglog(common_neighbors, color='b', label='common_neighbors')
    plt.loglog(distances, color='r', label='distances')
    plt.legend()  # labels only appear once a legend is drawn
    plt.savefig('node_similarity/stats_cm.png', format='png')
    plt.close()

    plt.loglog(jaccard, color='b', label='jaccard')
    plt.loglog(distances, color='r', label='distances')
    plt.legend()
    plt.savefig('node_similarity/stats_j.png', format='png')
    plt.close()

    plt.loglog(adamic, color='b', label='adamic')
    plt.loglog(distances, color='r', label='distances')
    plt.legend()
    plt.savefig('node_similarity/stats_aa.png', format='png')
    plt.close()

    plt.loglog(edge_bet, color='b', label='edge betweenness')
    plt.loglog(distances, color='r', label='distances')
    plt.legend()
    plt.savefig('node_similarity/stats_eb.png', format='png')
    plt.close()
def train_feature(input_file='../../data/training.txt(split)', output='../../output/svm_training.txt'):
    percentage_of_initial_adopter = 0.1

    g = load_graph('../../data/graph.txt')
    b = load_json('../../data/Business.txt')
    b = {b['business_id'][i]: ((b['latitude'][i], b['longitude'][i]), b['stars'][i]) for i in range(len(b['business_id']))}
    u_location = load_user_location('../../data/user_location.txt')

    t = load_idea(input_file)
    ideas = list(set(t['idea']))

    l = None
    for m in range(len(ideas)):
        print(m)
        n = get_node_by_idea2(t, ideas[m])
        index = sorted(range(len(n['date'])), key=lambda idx: n['date'][idx])
        # level = [n['level'][i] for i in index]
        node = [n['node'][i] for i in index]
        initial_adopters = node[: int(percentage_of_initial_adopter*len(node))]
        laters = node[int(percentage_of_initial_adopter*len(node)):]
        # b_avg_stars = np.average([level[: 0.1*len(level)]])

        for e in laters:
            features = []
            # Distance with business
            features.append(__distance(b[ideas[m]][0], u_location[e]))

            # Difference between user average stars and business average stars by adopters

            # Average distance with initial adopters
            features.append(np.average([__distance(u_location[a], u_location[e]) for a in initial_adopters]))

            # Percentage of friends in initial adopters
            features.append(float(sum([1 if g.has_edge(e, a) else 0 for a in initial_adopters]))/len(initial_adopters))

            # Average Adamic-Adar index with the initial adopters
            preds = nx.adamic_adar_index(g, [(e, a) for a in initial_adopters if g.has_node(a) and g.has_node(e)])
            try:
                preds = [p for u, v, p in preds]
                preds = preds if preds else 0
            except Exception:  # e.g. ZeroDivisionError when a common neighbor has degree 1
                preds = 0
            features.append(np.average(preds))

            features = np.array([features])
            l = features if l is None else np.concatenate((l, features))

    dump_svmlight_file(l, [1]*l.shape[0], output)
def test_feature(input='../../data/test_data/test_data_q1.txt', output='../../output/svm_testing_q1_'):
    g = load_graph('../../data/graph.txt')
    b = load_json('../../data/Business.txt')
    b = {b['business_id'][i]: ((b['latitude'][i], b['longitude'][i]), b['stars'][i]) for i in range(len(b['business_id']))}
    u_location = load_user_location('../../data/user_location.txt')
    t = load_idea('../../data/testing.txt')

    with open('../../data/testing_business.txt', 'r') as f:
        tb = f.read().strip().split()
    with open(input, 'r') as f:
        test = [l.strip().split() for l in f]


    for i, (business, initial_adopters) in enumerate(zip(tb, test)):
        print(i)
        l = None
        answers = []

        n = get_node_by_idea2(t, business)
        ans = set(n['node'])-set(initial_adopters)
        candidates = set(g.nodes())-set(initial_adopters)
        for e in candidates:
            features = []
            # Distance with business
            features.append(__distance(b[business][0], u_location[e]))

            # Difference between user average stars and business average stars by adopters

            # Average distance with initial adopters
            features.append(np.average([__distance(u_location[a], u_location[e]) for a in initial_adopters]))

            # Percentage of friends in initial adopters
            features.append(float(sum([1 if g.has_edge(e, a) else 0 for a in initial_adopters]))/len(initial_adopters))

            # Average Adamic-Adar index with the initial adopters
            preds = nx.adamic_adar_index(g, [(e, a) for a in initial_adopters if g.has_node(a) and g.has_node(e)])
            try:
                preds = [p for u, v, p in preds]
                preds = preds if preds else 0
            except Exception:  # e.g. ZeroDivisionError when a common neighbor has degree 1
                preds = 0
            features.append(np.average(preds))

            features = np.array([features])
            l = features if l is None else np.concatenate((l, features))
            answers.append(1 if e in ans else 0)

        dump_svmlight_file(l, answers, output+str(i)+'.txt')
def calculate_similarities(E1, tmp_string=None):
    features = []
    ytarget = []
    tmp_feat = {}
    tmp_y = {}
    temp_common_neighbors = []
    node_list = list(E1.nodes)
    for node1 in node_list:
        for node2 in node_list:
            connected = 0
            if tmp_string == "n":
                if nodes_connected(E1, node1, node2):
                    continue
                else:
                    temp = nx.jaccard_coefficient(E1, [(node1, node2)])
                    for u, v, p in temp:
                        tmp_feat[u, v] = [p]
                        tmp_y[u, v] = connected

                    temp = nx.adamic_adar_index(E1, [(node1, node2)])
                    try:
                        for u, v, p in temp:
                            tmp_feat[u, v].append(p)
                            tmp_y[u, v] = connected
                    except ZeroDivisionError:  # degree-1 common neighbor makes log(1) == 0
                        tmp_feat[u, v].append(0.0)
                        tmp_y[u, v] = connected

                    temp = nx.preferential_attachment(E1, [(node1, node2)])
                    for u, v, p in temp:
                        tmp_feat[u, v].append(p)
                        tmp_y[u, v] = connected
                    temp_common_neighbors = sorted(nx.common_neighbors(E1, node1, node2))
                    tmp_feat[node1, node2].append(len(temp_common_neighbors))

                    # Use exception handling: with a DiGraph there may be no path
                    # between two nodes, which raises an error.
                    try:
                        length = nx.shortest_path_length(E1, node1, node2)
                        tmp_feat[node1, node2].append(length)
                        tmp_y[node1, node2] = connected
                    except nx.NetworkXNoPath:
                        tmp_feat[node1, node2].append(0.0)
                        tmp_y[node1, node2] = connected

            else:
                if nodes_connected(E1, node1, node2):
                    connected = 1
                temp = nx.jaccard_coefficient(E1, [(node1, node2)])
                for u, v, p in temp:
                    tmp_feat[u, v] = [p]
                    tmp_y[u, v] = connected

                temp = nx.adamic_adar_index(E1, [(node1, node2)])
                try:
                    for u, v, p in temp:
                        tmp_feat[u, v].append(p)
                        tmp_y[u, v] = connected
                except ZeroDivisionError:  # degree-1 common neighbor makes log(1) == 0
                    tmp_feat[u, v].append(0.0)
                    tmp_y[u, v] = connected

                temp = nx.preferential_attachment(E1, [(node1, node2)])
                for u, v, p in temp:
                    tmp_feat[u, v].append(p)
                    tmp_y[u, v] = connected
                temp_common_neighbors = sorted(nx.common_neighbors(E1, node1, node2))
                tmp_feat[node1, node2].append(len(temp_common_neighbors))

                # Use exception handling: with a DiGraph there may be no path
                # between two nodes, which raises an error.
                try:
                    length = nx.shortest_path_length(E1, node1, node2)
                    tmp_feat[node1, node2].append(length)
                    tmp_y[node1, node2] = connected
                except nx.NetworkXNoPath:
                    tmp_feat[node1, node2].append(0.0)
                    tmp_y[node1, node2] = connected

    for key, value in tmp_feat.items():
        features.append(value)
    for key, value in tmp_y.items():
        ytarget.append(value)

    features_numpy = numpy.array(features)
    y_target_numpy = numpy.array(ytarget)

    return features_numpy, y_target_numpy
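The returned arrays plug straight into a scikit-learn classifier; a usage sketch, assuming E1 is the directed graph passed above and scikit-learn is available:

from sklearn.linear_model import LogisticRegression

X, y = calculate_similarities(E1)
clf = LogisticRegression(max_iter=1000).fit(X, y)
print(clf.score(X, y))  # training accuracy, as a quick sanity check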
Example 9
                    G, [(i, j)]))  # do >0
                # print(prediction2)
                # print(prediction2[0][2])
                # prediction2 = prediction2[0][2]
                # print('Model2 Pred:' + str(prediction2))

                ########## Jaccard Coefficient ##########
                prediction3 = sorted(nx.jaccard_coefficient(G,
                                                            [(i, j)]))  # do >0
                # print(prediction3)
                # print(prediction3[0][2])
                # prediction3 = prediction3[0][2]
                # print('Model3 Pred:' + str(prediction3))

                ########## Adamic Adar Index ##########
                prediction4 = sorted(nx.adamic_adar_index(G,
                                                          [(i, j)]))  # do >0
                # print(prediction4)
                # print(prediction4[0][2])
                # prediction4 = prediction4[0][2]
                # print('Model4 Pred:' + str(prediction4))

                ########## Preferential Attachment ##########
                prediction5 = sorted(nx.preferential_attachment(
                    G, [(i, j)]))  # do >0
                # print(prediction5)
                # print(prediction5[0][2])
                # prediction5 = prediction5[0][2]
                # print('Model5 Pred:' + str(prediction5))

                ########## Tried couple different ways to get the score up ##########
                ########## Really thought something like this would work well ##########
 def predict(self, node_pairs):
     predictions = adamic_adar_index(self.graph, node_pairs)
     return list(predictions)
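For context, a self-contained sketch of the kind of wrapper this method assumes: a class whose graph attribute holds a networkx graph (the class name here is illustrative):

import networkx as nx
from networkx import adamic_adar_index

class AdamicAdarPredictor:
    def __init__(self, graph):
        self.graph = graph

    def predict(self, node_pairs):
        # Materialize the lazy (u, v, p) generator into a list.
        return list(adamic_adar_index(self.graph, node_pairs))

G = nx.karate_club_graph()
print(AdamicAdarPredictor(G).predict([(0, 33)]))  # [(0, 33, <score>)]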
Example 11
def link_prediction(G, query_nodes, target_nodes, n_edges, start_dist, alg = "ra"):
    """Selects a random set of links between based on the scores calculated by 
    a standard link-prediction algorithm from networkx library
    Parameters
    ----------
    G : Networkx graph
        The graph from which the team will be selected.
    query : list 
        The set of nodes from which random walker starts.
    target : list
        The set of nodes from where the random walker ends.
    n_edges : integer
        the number of links to be added
    start_dist: list
        The starting distribution over the query set
    alg: string
        A string describing the link-prediction algorithm to be used
    Returns
    -------
    links : list
        The set of links that reduce the absorbing RW centrality
    ac_scores: list
        The set of scores of adding the links
    """
    assert alg in ["ra", "pa", "jaccard", "aa"], "alg must be one of [\"ra\", \"pa\", \"jaccard\", \"aa\"]."
          
    H = G.copy()
    query_set_size = len(query_nodes)
    map_query_to_org = dict(zip(query_nodes, range(query_set_size)))
    P = csc_matrix(nx.google_matrix(H, alpha=1))
    P_abs = P[list(query_nodes),:][:,list(query_nodes)]
    F = compute_fundamental(P_abs)
    row_sums = start_dist.dot(F.sum(axis=1))[0, 0]
    candidates = list(product(query_nodes, target_nodes))
    eligible = [candidates[i] for i in range(len(candidates))
                if not H.has_edge(candidates[i][0], candidates[i][1])]
    links_to_add = []
    if alg == 'ra':
        preds = nx.resource_allocation_index(H, eligible)
    elif alg == 'jaccard':
        preds = nx.jaccard_coefficient(H, eligible)
    elif alg == 'aa':
        preds = nx.adamic_adar_index(H, eligible)
    elif alg == 'pa':
        preds = nx.preferential_attachment(H, eligible)
        
    for u,v,p in preds:
        links_to_add.append((u,v,p))
    links_to_add.sort(key=lambda x: x[2], reverse = True)
    
    ac_scores = []
    ac_scores.append(row_sums)
    i = 0
    while i < n_edges:
        F_updated = update_fundamental_mat(F, H, map_query_to_org, links_to_add[i][0])
        H.add_edge(links_to_add[i][0], links_to_add[i][1])
        abs_cen = start_dist.dot(F_updated.sum(axis = 1))[0,0]
        F = F_updated            
        ac_scores.append(abs_cen)
        i += 1
    return links_to_add, ac_scores
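A hypothetical call, assuming a graph G and the compute_fundamental / update_fundamental_mat helpers from the original project are in scope; the node ids and uniform starting distribution are illustrative:

import numpy as np

links, ac_scores = link_prediction(G, query_nodes=[0, 1, 2], target_nodes=[7, 8],
                                   n_edges=2,
                                   start_dist=np.matrix([[1/3., 1/3., 1/3.]]),
                                   alg="aa")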
        print('Reading %s_topological_network.csv...' %
              (prog_languages[prog_lang_id]))
        t_network = []

        for row in data:
            dev_id_1 = int(row[0])
            dev_id_2 = int(row[1])

            t_network.append((dev_id_1, dev_id_2))
    csvfile.close()

    with open('../Files/topological_metrics.csv', 'a') as a:
        metrics_file = csv.writer(a, delimiter=',')

        print('Writing topological metrics for', prog_languages[prog_lang_id])

        for dev_pair in t_network:
            neighborhood_overlap = nx.jaccard_coefficient(G, [dev_pair])
            adamic_adar = nx.adamic_adar_index(G, [dev_pair])
            preferential_attachment = nx.preferential_attachment(G, [dev_pair])

            for u, v, p in neighborhood_overlap:
                NO = p
            for u, v, p in adamic_adar:
                AA = p
            for u, v, p in preferential_attachment:
                PA = p

            metrics_file.writerow(
                [prog_lang_id, dev_pair[0], dev_pair[1], NO, AA, PA])
Example 13
def calc_indexes(G, test_edges, aa=False):
    """
	Input - 
	G  - is the graph based upon which we calculate the indexes
	test_edges -  are the edges for which we calculate the index 
	aa - whether to calculate the adamic adder index or not 

	Calculates all the indixes of the graph for the test_edgessuch as 
	1) Common Neighbors
	2) Jaccard
	3) Preferential Attachment
	4) Adamic Adder
	"""

    #All indexes stored as a dictionary
    indexes = {}

    #Initializing
    jaccard_arr = []
    common_arr = []
    preferential_arr = []
    adamic_adar_arr = []

    aa1 = nx.adamic_adar_index(G, test_edges)
    jc = nx.jaccard_coefficient(G, test_edges)
    pa = nx.preferential_attachment(G, test_edges)

    for edge in test_edges:

        #Loading the nodes
        node1 = edge[0]
        node2 = edge[1]
        node_list = [node1, node2]
        #Neighbors of the nodes
        node1_neighbors = set(list(G.neighbors(node1)))
        node2_neighbors = set(list(G.neighbors(node2)))
        #Union of the neighbors
        union_neighbors = list(node1_neighbors.union(node2_neighbors))
        #Intersection of the neighbors
        intersection_neighbors = list(
            node1_neighbors.intersection(node2_neighbors))

        #Jaccard Index
        jaccard_index = 0

        if len(union_neighbors) != 0:
            jaccard_index = len(intersection_neighbors) / len(union_neighbors)

        #Common Neighbors
        common_neighbors = len(intersection_neighbors)
        #Preferential Attachment
        preferential_attachment = len(node1_neighbors) * len(node2_neighbors)

        #Appending the indexes into the arrays
        jaccard_arr.append(jaccard_index)
        common_arr.append(common_neighbors)
        preferential_arr.append(preferential_attachment)

        #if the Adamic-Adar index is to be calculated
        if aa:
            adamic_add = 0
            for neighbor in intersection_neighbors:
                nfneighbors_of_neighbors = len(list(G.neighbors(neighbor)))
                #Adamic-Adar is undefined for neighbors of degree 0 or 1 (log(1) == 0)
                if nfneighbors_of_neighbors != 1 and nfneighbors_of_neighbors != 0:
                    aa_score = 1 / np.log(nfneighbors_of_neighbors)
                    adamic_add += aa_score
            adamic_adar_arr.append(adamic_add)

    indexes['jaccard_index'] = jaccard_arr
    indexes['common_neighbors'] = common_arr
    indexes['preferential_attachment'] = preferential_arr
    indexes['adamic_adar'] = adamic_adar_arr

    # print(adamic_adar_arr[:10], aa1[:10])
    # print(jaccard_arr[:10], jc[:10])
    # print(preferential_arr[:10], pa[:10])
    #print(indexes['adamic_adar'])
    #print(adamic_adar_arr)
    #print(jaccard_arr)

    # for u, v, p in jc:
    # 	print(u, v, p)
    # 	ind = test_edges.index((u,v))
    # 	print(jaccard_arr[ind],ind)

    # for u, v, p in pa:
    # 	print(u, v, p)
    # 	ind = test_edges.index((u,v))
    # 	print(preferential_arr[ind],ind)

    # for u, v, p in aa1:
    # 	print(u, v, p)
    # 	ind = test_edges.index((u,v))
    # 	print(adamic_adar_arr[ind],ind)

    return indexes
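The hand-rolled scores above should agree with the networkx generators left unused in aa1, jc and pa; a quick consistency check on a toy graph (illustrative only):

import networkx as nx

G_check = nx.karate_club_graph()
pairs = [(0, 33), (5, 16)]
manual = calc_indexes(G_check, pairs, aa=True)['adamic_adar']
library = [p for _, _, p in nx.adamic_adar_index(G_check, pairs)]
print(manual, library)  # the lists should match up to floating-point error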
Example 14
    auth2.append(b)

length = len(auth1)
print(length)

for i in range(0, length):
    for j in range(0, length):
        lab = 0
        if (auth1[i] != auth2[j]):

            fet = ''
            d = 0
            f = 0

            pred1 = nx.adamic_adar_index(g, [(auth1[i], auth2[j])])
            pred2 = nx.jaccard_coefficient(g, [(auth1[i], auth2[j])])
            pred3 = nx.preferential_attachment(g, [(auth1[i], auth2[j])])

            for item in pred1:
                x, y, z = item
                if (z == 0.0):
                    d = 1
                fet = fet + str(z) + ' '
                #print x + ' '+ y +' ' + str(z)

            for item in pred2:
                x, y, z = item
                if (z == 0.0):
                    f = 1
                fet = fet + str(z) + ' '
Example 15
    def get_feature(source, target):

        features = {}

        def set_feature(name, val):
            if name not in features:
                features[name] = val

        def cosine_distance(node_list1, node_list2):
            id2index = dict([
                (id, i) for i, id in enumerate(set(node_list1 + node_list2))
            ])
            a = np.zeros((len(id2index), ))
            b = np.zeros((len(id2index), ))
            for key in node_list1:
                a[id2index[key]] = 1
            for key in node_list2:
                b[id2index[key]] = 1
            #return distance.cosine(a, b)

        try:
            source_succ = set(digraph.successors(source))
            source_pred = set(digraph.predecessors(source))
            target_succ = set(digraph.successors(target))
            target_pred = set(digraph.predecessors(target))
            set_feature('len_source_successors', len(source_succ))
            set_feature('len_target_successors', len(target_succ))
            set_feature('len_source_predecessors', len(source_pred))
            set_feature('len_target_predecessors', len(target_pred))
            common_succ = len(source_succ.intersection(target_succ))
            common_pred = len(source_pred.intersection(target_pred))
            set_feature('common_successor_number', common_succ)
            set_feature('common_predecessor_number', common_pred)
            succ_union = source_succ.union(target_succ)
            pred_union = source_pred.union(target_pred)
            set_feature(
                'jaccard_distance_between_successors',
                common_succ / len(succ_union) if len(succ_union) != 0 else 0)
            set_feature(
                'jaccard_distance_between_predecessors',
                common_pred / len(pred_union) if len(pred_union) != 0 else 0)

            #set_feature('successor_cosine', cosine_distance(data[source], data[target]))
            #set_feature('predecessor_cosine', cosine_distance(source_pred, target_pred))

            set_feature(
                'shortest_path',
                nx.shortest_path_length(digraph, source, target)
                if digraph.has_edge(source, target) else 0)
            pref_attch = nx.preferential_attachment(graph, [(source, target)])
            for u, v, p in pref_attch:
                set_feature('preference_attachment',
                            p)  # if graph.has_edge(source, target) else 0)
            aa_index = nx.adamic_adar_index(graph, [(source, target)])
            for u, v, p in aa_index:
                set_feature('adamic_adar_index',
                            p)  # if graph.has_edge(source, target) else 0)
            jcd_coe = nx.jaccard_coefficient(graph, [(source, target)])
            for u, v, p in jcd_coe:
                set_feature('jaccard_coefficient',
                            p)  # if graph.has_edge(source, target) else 0)
            reallo_index = nx.resource_allocation_index(
                graph, [(source, target)])
            for u, v, p in reallo_index:
                set_feature('resource_allocation_index',
                            p)  # if graph.has_edge(source, target) else 0)
            set_feature('cluster_source', nx.clustering(graph, source))
            set_feature('cluster_target', nx.clustering(graph, target))

            set_feature('source_pagerank', pagerank[source])
            set_feature('target_pagerank', pagerank[target])
            set_feature('source_authorities', auth[source])
            set_feature('target_authorities', auth[target])
            set_feature('source_hubs', hub[source])
            set_feature('target_hubs', hub[target])
            #set_feature('source_core_num', core[source])
            #set_feature('target_core_num', core[target])
        except Exception:
            # A missing node or failed metric leaves the features collected so far.
            pass
        return features
print("Number of edges deleted : %d" % edge_subset_G_size)
print("Number of edges remaining : %d" % (t - edge_subset_G_size))

#6 Create a train set of ~80 percent from G_test
edges_to_remove_from_G_test = 0.201  # fraction of edges to remove (~20%)
removed_edges_from_G_test = random.sample(
    G_test.edges(),
    int(edges_to_remove_from_G_test * G_test.number_of_edges()))
G_train = G_test.copy()
G_train.remove_edges_from(removed_edges_from_G_test)
edge_subset_G_test_size = len(list(removed_edges_from_G_test))
print("Number of edges deleted : %d" % edge_subset_G_test_size)
print("Number of edges remaining : %d" %
      (t - edge_subset_G_size - edge_subset_G_test_size))

#6 Transform G_train and G_test to undirected
G_train = G_train.to_undirected()
G_test = G_test.to_undirected()

#7 Calculate AA AUC
pred_aa_train = list(nx.adamic_adar_index(G_train))
pred_aa_test = list(nx.adamic_adar_index(G_test))
score_aa, label_aa = zip(*[(s, (u, v) in removed_edges_from_G)
                           for (u, v, s) in pred_aa_test])
auc_aa = roc_auc_score(label_aa, score_aa)

#8 Print AUC and prediction calculation time
t2 = datetime.now()
delta = t2 - t1
print(auc_aa, delta.seconds)
Example 17
def make_adamic_adar_index_predG(dirG, testG):
    undG = dirG.to_undirected()
    undir_AAs = nx.Graph() 
    undir_AAs.add_weighted_edges_from(nx.adamic_adar_index(undG, testG.edges_iter()))
    return make_predG_from_jacc(undir_AAs, dirG, testG)
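edges_iter() only exists in NetworkX 1.x; an equivalent sketch for NetworkX 2.x, where edges() is already a lazy view (make_predG_from_jacc is assumed from the original code):

def make_adamic_adar_index_predG_nx2(dirG, testG):
    undG = dirG.to_undirected()
    undir_AAs = nx.Graph()
    # edges() replaces edges_iter() in NetworkX >= 2.0
    undir_AAs.add_weighted_edges_from(nx.adamic_adar_index(undG, testG.edges()))
    return make_predG_from_jacc(undir_AAs, dirG, testG)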
random.shuffle(mytrain)
del test[0]
g = nx.Graph()
connect = []
for i in range(20000):
    t = train[i].split()
    for j in range(1, len(t)):  # start at 1 to avoid a self-loop on t[0]
        g.add_edge(t[0], t[j])

for i in range(19999):
    t1 = train[i].split()
    t2 = train[i + 1].split()
    if g.has_edge(t1[0], t2[0]):
        connect.append([
            len(list(nx.common_neighbors(g, t1[0], t2[0]))),
            list(nx.adamic_adar_index(g, [(t1[0], t2[0])]))[0][2],
            list(nx.preferential_attachment(g, [(t1[0], t2[0])]))[0][2],
            list(nx.jaccard_coefficient(g, [(t1[0], t2[0])]))[0][2],
            list(nx.resource_allocation_index(g, [(t1[0], t2[0])]))[0][2], 1
        ])
    else:
        connect.append([
            len(list(nx.common_neighbors(g, t1[0], t2[0]))),
            list(nx.adamic_adar_index(g, [(t1[0], t2[0])]))[0][2],
            list(nx.preferential_attachment(g, [(t1[0], t2[0])]))[0][2],
            list(nx.jaccard_coefficient(g, [(t1[0], t2[0])]))[0][2],
            list(nx.resource_allocation_index(g, [(t1[0], t2[0])]))[0][2], 0
        ])

for i in range(300000):
    t0, t1, t2 = mytrain[i].split()
Example 19
def adamic_adar(graph):
  with open("data/imdb_b__adamic_adar", "w") as output_file:
    for (i, (u, v, score)) in enumerate(nx.adamic_adar_index(graph, graph.edges_iter())):
      print(i)
      output_file.write("\t".join(map(str, [u, v, score])) + "\n")
Example 20
    
    #plt.plot(fpr, tpr, marker='.',label=leg)
    
    plt.plot(recall,precision,marker='.',label=leg)
    plt.legend()
    return roc_score, ap_score,precision,recall


# ## 3. Adamic-Adar

# In[7]:


# Compute Adamic-Adar indexes from g_train
aa_matrix = np.zeros(adj.shape)
for u, v, p in nx.adamic_adar_index(g_train): # (u, v) = node indices, p = Adamic-Adar index
    aa_matrix[u][v] = p
    aa_matrix[v][u] = p # make sure it's symmetric
    
# Normalize array
aa_matrix = aa_matrix / aa_matrix.max()


# In[8]:


# Calculate ROC AUC and Average Precision
k=0
aa_roc, aa_ap, precision, recall = get_roc_score1(test_edges, test_edges_false, aa_matrix)

print('Adamic-Adar Test ROC score:', aa_roc)
Example 21
# Drawing the Graph using matplotlib
nx.draw_networkx(UG, node_color=['red'], with_labels=True)
plt.show()

all_nodes = []
nodes = list(nx.nodes(UG))

# loop to append the pair of nodes to a list
for x in nodes:
    for y in nodes:
        if x != y and {x, y} not in all_nodes:
            all_nodes.append({x, y})

# Implementing Adamic-Adar
adamic_adar = nx.adamic_adar_index(UG, all_nodes) # the graph and the pair of nodes as parameters

# Print the values
print(" \nAdamic-Adar implementation \n")
max=0
for u, v, p in adamic_adar:
    if(p>max):
        max = p
        sim_u = u
        sim_v = v
    print ('{}, {} -> {:.5f}'.format(u,v,p))

print (f'\nThe most similar according Adamic-Adar: {sim_u}, {sim_v} -> {max:.5f}')


# According to these results, the highest Adamic-Adar similarity is between u2 and u3
Example 22
        linklist.append([int(data[0]), int(data[1])])
        nodepair_set[random.randint(0, n_folds - 1)].append([int(data[0]), int(data[1])])
        # new_line = data[0] + ' ' + data[1] + ' 1\n'
        # f_w.write(new_line)

    train_list = []
    for templist in nodepair_set[0:8]:
        train_list = train_list + templist
    test_list = nodepair_set[9]
    nodelist = create_vertex(linklist)
    train_adj = create_adjmatrix(train_list, nodelist)
    test_adj = create_adjmatrix(test_list, nodelist)
    # print(train_adj)
    sim_cn = np.dot(train_adj, train_adj)
    sim_bifan = np.dot(np.dot(train_adj, train_adj.T), train_adj)
    sim_AA = AA(train_adj)  # nx.adamic_adar_index expects a Graph, so use the matrix-based AA helper
    sim_RA = AA(train_adj)
    sim_IP = IP(train_adj, 0.8)
    # sim_jaccard = Jaccard(train_adj)
    # cn_score_1 = AUC.Calculation_AUC(train_adj, test_adj, sim_cn, len(nodelist))
    # cn_score_2 = evaluationMetric.cal_AUC(train_adj, test_adj, sim_cn, 10000)
    cn_score_3 = metric.auc_score(sim_cn, test_adj, train_adj,'cc')
    AA_score_3 = metric.auc_score(sim_AA, test_adj, train_adj, 'cc')
    RA_score_3 = metric.auc_score(sim_RA, test_adj, train_adj, 'cc')

    bifan_score_3 = metric.auc_score(sim_bifan, test_adj, train_adj,'cc')

    IP_score = metric.auc_score(sim_IP, test_adj, train_adj, 'cc')

    print(cn_score_3)
    print(AA_score_3)
Example 23
rr = []
# for i in corenodes:
#     if i not in dataset:
#         print(i)
# for test
# for i in corenodes:
#    preds = nx.adamic_adar_index(G,nonedges(G,i))
#    tenlargest = heapq.nlargest(100, preds, key = lambda x: x[2])
#    for j in tenlargest:
#        rr.append(j)
# print(len(rr)) #==21550
# result= heapq.nlargest(4000, rr, key = lambda x: x[2])

#for val
for i in val:
    preds = nx.adamic_adar_index(G, nonedges(G, i))
    tenlargest = heapq.nlargest(1000, preds, key=lambda x: x[2])
    for j in tenlargest:
        rr.append(j)
print(len(rr))  #==21550
result = heapq.nlargest(2000, rr, key=lambda x: x[2])

endtime = datetime.datetime.now()
print("time", (endtime - starttime).seconds)

count = 0
for i in result:
    if i[2] == 0:
        count += 1
print(count)
print(len(result))
Example 24
N = G.number_of_nodes()
nodelist = list(G.nodes())

print(nx.info(G))
print(nx.number_of_nodes(G))
print(nx.number_of_edges(G))
print(nx.is_directed(G))


def nonedges(G, u):  #a generator with (u,v) for every non neighbor v
    for v in nx.non_neighbors(G, u):
        yield (u, v)


for u in G.nodes():
    # NOTE: these link-prediction calls return lazy generators; nothing is
    # computed until they are iterated over.
    adar = nx.adamic_adar_index(G, nonedges(G, u))
    for v in nx.non_neighbors(G, u):
        com = nx.common_neighbors(G, u, v)
    jac = nx.jaccard_coefficient(G, nonedges(G, u))
    res = nx.resource_allocation_index(G, nonedges(G, u))
    pre = nx.preferential_attachment(G, nonedges(G, u))

allm = {m: {} for m in methods}
toponame = "datasetFourSquare" + ".csv"
with open(toponame, "w", newline="",
          encoding="utf-8") as f:  # binary mode for windows \r\n prob
    writer = csv.writer(f, delimiter=',')
    writer.writerow(['node1', 'node2'] + methods + ['label'])

    for p in total_edges:
        n1, n2 = p
# with open("adj_matrix.csv", "wb") as f:
#     writer = csv.writer(f)
#     writer.writerows(AJ_matrix)

AJ_matrix = nx.adjacency_matrix(G)

#print AJ_matrix(0,0)
print(AJ_matrix.shape)
# for ii in range(0, 1):
# 	tmp_missing_links = []
# 	for jj in range(0, len(node_sets)):
# 		if AJ_matrix[ii,jj] == 0:
# 			tmp_missing_links.append((int(final_nodes[ii]),int(final_nodes[jj])))
# 			print (int(final_nodes[ii]),int(final_nodes[jj]))
preds = nx.adamic_adar_index(G, ebunch=None)

print(max(need_to_check))
#initial_node = 1
missing_links = []
old_u = 1
tmp_missing_links_score = []
tmp_missing_links_u = []
tmp_missing_links_v = []
for u, v, score in preds:

    if int(u) in need_to_check[0:1000]:
        #print u, old_u
        if int(u) == old_u:
            #print executed
            tmp_missing_links_score.append(score)
    def set_edge_weight(self, edge_weight_method='weight'):

        if edge_weight_method == 'weight':
            return

        # Centrality based methods

        elif edge_weight_method == 'edge_betweenness_centrality':
            print("comptuing edge_betweenness_centrality..")
            C = nx.edge_betweenness_centrality(self.G, weight='weight')
            print("done!")

        elif edge_weight_method == 'edge_betweenness_centrality_subset':
            print("comptuing edge_betweenness_centrality_subset..")
            C = nx.edge_current_flow_betweenness_centrality(self.G,
                                                            weight='weight')
            print('done')

        elif edge_weight_method == 'edge_current_flow_betweenness_centrality_subset':
            print(
                "computing edge_current_flow_betweenness_centrality_subset..")
            C = nx.edge_current_flow_betweenness_centrality_subset(
                self.G, weight='weight')
            print('done')

        elif edge_weight_method == 'edge_load_centrality':
            print("comptuing edge_load_centrality..")
            C = nx.edge_load_centrality(self.G)
            print('done!')

        # Link Prediction based methods

        elif edge_weight_method == 'adamic_adar_index':
            print("comptuing adamic_adar_index ..")
            preds = nx.adamic_adar_index(self.G, self.G.edges())
            C = {}
            for u, v, p in preds:
                C[(u, v)] = p

        elif edge_weight_method == 'ra_index_soundarajan_hopcroft':
            print("comptuing ra_index_soundarajan_hopcroft ..")
            preds = nx.ra_index_soundarajan_hopcroft(self.G, self.G.edges())
            C = {}
            for u, v, p in preds:
                C[(u, v)] = p

        elif edge_weight_method == 'preferential_attachment':
            print("comptuing preferential_attachment ..")
            preds = nx.preferential_attachment(self.G, self.G.edges())
            C = {}
            for u, v, p in preds:
                C[(u, v)] = p

        #elif edge_weight_method=='cn_soundarajan_hopcroft':
        #    print("comptuing cn_soundarajan_hopcroft ..")
        #    preds=nx.cn_soundarajan_hopcroft(self.G,self.G.edges())
        #    C={}
        #    for u, v, p in preds:
        #        C[(u,v)]=p

        elif edge_weight_method == 'within_inter_cluster':
            print("comptuing within_inter_cluster ..")
            preds = nx.within_inter_cluster(self.G, self.G.edges())
            C = {}
            for u, v, p in preds:
                C[(u, v)] = p

        elif edge_weight_method == 'resource_allocation_index':
            print("comptuing resource allocation index ..")
            preds = nx.resource_allocation_index(self.G, self.G.edges())
            C = {}
            for u, v, p in preds:
                C[(u, v)] = p

        elif edge_weight_method == 'jaccard_coefficient':
            print("comptuing jaccard_coefficient..")
            preds = nx.jaccard_coefficient(self.G, self.G.edges())
            C = {}
            for u, v, p in preds:
                C[(u, v)] = p

            print('done!')

        for u, v, d in self.G.edges(data=True):
            if edge_weight_method is None:
                d['weight'] = 1
            else:

                d['weight'] = C[(u, v)]

        return 1
Example 27
 def get_edge_weight(self, i, j):
     aa_index = nx.adamic_adar_index(self._G, [(i, j)])
     return six.next(aa_index)[2]
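six.next is only Python-2/3 compatibility; on Python 3 the builtin next() does the same job. A standalone equivalent of the lookup above:

import networkx as nx

def adamic_adar_weight(G, i, j):
    # adamic_adar_index yields a single (u, v, p) triple for a one-pair ebunch.
    return next(iter(nx.adamic_adar_index(G, [(i, j)])))[2]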
nodes = nx.nodes(DG)
edges = nx.edges(DG)
non_edges = nx.non_edges(DG)
'''Compute HAA, HJC and HRA'''
HAA = []
HJC = []
HRA = []
SD = []
for e in test_pairs:
    if not DG.has_node(e[0]):
        DG.add_node(e[0])
        UDG.add_node(e[0])
    if not DG.has_node(e[1]):
        DG.add_node(e[1])
        UDG.add_node(e[1])
    AA = nx.adamic_adar_index(UDG, [e])
    JC = nx.jaccard_coefficient(UDG, [e])
    RA = nx.resource_allocation_index(UDG, [e])
    spec_diff = DG.in_degree(e[1]) - DG.in_degree(
        e[0])  # specificity_difference
    SD.append(spec_diff)
    try:
        for u, v, p in AA:
            HAA.append(p)
    except ZeroDivisionError:
        HAA.append(0)

    try:
        for u, v, p in JC:
            HJC.append(p)
 positive_predictions_proba_slpc_DegCent = []
 positive_predictions_proba_slpc_EigenCent = []
 positive_predictions_proba_slpc_ClosenessCent = []
 positive_predictions_proba_slpc_BetweenCent = []
 positive_predictions_proba_slpc_PageRank = []
 lenedg = len(pedges)
 cntr = 0
 for edge in pedges:
     cntr += 1
     print("\r   {}/{}".format(cntr, lenedg), end="")
     positive_predictions_proba_jcc.append(
         list(nx.jaccard_coefficient(G, [edge]))[0][2])
     positive_predictions_proba_ra.append(
         list(nx.resource_allocation_index(G, [edge]))[0][2])
     positive_predictions_proba_aa.append(
         list(nx.adamic_adar_index(G, [edge]))[0][2])
     positive_predictions_proba_pa.append(
         list(nx.preferential_attachment(G, [edge]))[0][2])
     positive_predictions_proba_cnsh.append(
         list(nx.cn_soundarajan_hopcroft(
             G, [edge]))[0][2])  # needs community information
     positive_predictions_proba_rash.append(
         list(nx.ra_index_soundarajan_hopcroft(
             G, [edge]))[0][2])  # needs community information
     positive_predictions_proba_wic.append(
         list(nx.within_inter_cluster(
             G, [edge]))[0][2])  # needs community information
     positive_predictions_proba_slp_DegCent.append(
         list(SLP_prediction(G, [edge], centrality="DegCent"))[0][2])
     positive_predictions_proba_slp_EigenCent.append(
         list(SLP_prediction(G, [edge], centrality="EigenCent"))[0][2])
Example 30
    return itertools.combinations(iterable, 2)


G = nx.read_edgelist("./data/drugbank_interactions.tsv",
                     delimiter="\t",
                     nodetype=str)

partition = community.best_partition(G)
nx.set_node_attributes(G, name='community', values=partition)

ap = list(all_pairs(G.nodes()))

cn = cn.cnbors(G, ap)
rai = nx.resource_allocation_index(G, ap)
jc = nx.jaccard_coefficient(G, ap)
aai = nx.adamic_adar_index(G, ap)
pa = nx.preferential_attachment(G, ap)
ccn = nx.cn_soundarajan_hopcroft(G, ap)
cra = nx.ra_index_soundarajan_hopcroft(G, ap)
wic = nx.within_inter_cluster(G, ap, community='community')

u, v, s1, s2, s3, s4, s5, s6, s7, s8, has_edge = ([] for i in range(11))
for m1, m2, m3, m4, m5, m6, m7, m8 in zip(cn, rai, jc, aai, pa, ccn, cra, wic):
    u.append(m1[0])
    v.append(m1[1])
    s1.append(m1[2])
    s2.append(m2[2])
    s3.append(m3[2])
    s4.append(m4[2])
    s5.append(m5[2])
    s6.append(m6[2])
def save_to_file_similarities(E1, j):
    node_list = list(E1.nodes)
    temp_short_path, temp_cn, temp_jc, temp_a, temp_pa = {}, {}, {}, {}, {}
    #calculation start
    for node1 in node_list:
        for node2 in node_list:
            temp = nx.jaccard_coefficient(E1, [(node1, node2)])
            for u, v, p in temp:
                temp_jc[u, v] = p

            temp = nx.adamic_adar_index(E1, [(node1, node2)])
            try:
                for u, v, p in temp:
                    temp_a[u, v] = p

            except ZeroDivisionError:  # degree-1 common neighbor makes log(1) == 0
                temp_a[u, v] = 0.0

            temp = nx.preferential_attachment(E1, [(node1, node2)])
            for u, v, p in temp:
                temp_pa[u, v] = p

            temp_common_neighbors = sorted(nx.common_neighbors(E1, node1, node2))
            temp_cn[node1, node2] = len(temp_common_neighbors)

            # Use exception handling: with a DiGraph there may be no path
            # between two nodes, which raises an error.
            try:
                length = nx.shortest_path_length(E1, node1, node2)
                temp_short_path[node1, node2] = length

            except nx.NetworkXNoPath:
                temp_short_path[node1, node2] = 0.0

    #calculation ends
    temp_cn = OrderedDict(sorted(temp_cn.items(), key=lambda kx: kx[1]))
    temp_short_path = OrderedDict(
        sorted(temp_short_path.items(), key=lambda kx: kx[1]))
    temp_jc = OrderedDict(sorted(temp_jc.items(), key=lambda kx: kx[1]))
    temp_a = OrderedDict(sorted(temp_a.items(), key=lambda kx: kx[1]))
    temp_pa = OrderedDict(sorted(temp_pa.items(), key=lambda kx: kx[1]))
    directory = 'Subgraphs/Similarities/' + 'T' + str(j - 1) + '-' + 'T' + str(
        j + 1) + '_E_' + 'T' + str(j - 1) + '-' + 'T' + str(j)
    if not os.path.exists(directory):
        os.makedirs(directory)

    temp_path = directory + '/' + str(E1.graph['name'])

    save_as_csv(temp_path + '_shortest_path', temp_short_path,
                ['Node', 'Length of Shortest Path'])
    save_as_csv(temp_path + '_jaccard_coef', temp_jc,
                ['(Node1, Node2)', 'Jaccard Coefficient'])
    save_as_csv(temp_path + '_pref_attach.txt', temp_pa,
                ['(Node1, Node2)', 'Preferential Attachment'])
    save_as_csv(temp_path + '_adamic_index.txt', temp_a,
                ['(Node1, Node2)', 'Adamic Index'])
    save_as_csv(temp_path + '_common_neigh.txt', temp_cn,
                ['(Node1, Node2)', '# of Common Neighbors'])
Example 32
def aa(G, i, j):
    return sorted(nx.adamic_adar_index(G, [(i, j)]))[0][2]  # single (u, v, p) triple; [2] is the score
Example 33
def get_adamic(filepath):
    D, pr, pr_df = get_pagerank(filepath, save_file=False)
    H = D.to_undirected()
    adm = nx.adamic_adar_index(H, ebunch=pr_df["relation"])
    # adm_output=[(item[0],item[1],item[2]) for item in adm ]
    pass
    parts_idx = np.where(parts == i)[0]
    parts_graph = train_G.subgraph(parts_idx)
    # find top-1% nodes
    landmarks = []
    for node, degree in sorted(parts_graph.degree().items(), key=lambda item: item[1], reverse=True):
        landmarks.append(node)
        if len(landmarks) > parts_graph.number_of_nodes() * 0.01:
            landmarks_in_train_G.extend(landmarks)
            break
    # save partial graph with original label
    mapping = {n: parts_graph.node[n]['ori_label'] for n in parts_graph.nodes()}
    parts_graph = nx.relabel_nodes(parts_graph, mapping, copy=True)
    for (n, data) in parts_graph.nodes(data=True):
        data.pop('ori_label', None)
    with open(Path('clustering', dataset, '{}.gpickle'.format(i)), 'wb') as f:
        nx.write_gpickle(parts_graph, f)

combinations = list(itertools.combinations(landmarks_in_train_G, 2))
combinations = set(tuple(sorted(item)) for item in combinations)

landmark_graph = nx.Graph()
for u, v, p in nx.adamic_adar_index(train_G, combinations):
    ori_u = train_G.node[u]['ori_label']
    ori_v = train_G.node[v]['ori_label']
    edge_w = 2./(1 + np.exp(-p/10)) - 1.
    landmark_graph.add_edge(ori_u, ori_v, attr_dict={'weight': edge_w})
    landmark_graph.add_edge(ori_v, ori_u, attr_dict={'weight': edge_w})

with open(Path('clustering', dataset, 'landmark.gpickle'), 'wb') as f:
    nx.write_gpickle(landmark_graph, f)
Example 35
 def adamicAdar(edges: np.array, output_file: str):
     # Initialize graph.
     graph = nx.read_edgelist("out_graph.txt", nodetype=int, create_using=nx.Graph())
     preds = nx.adamic_adar_index(graph, edges)
     RecommendationPolicies.writeNpToFile(output_file, preds)
Example 36
def get_features(L, flag):
    X = [[] for i in range(len(L))]

    #=====================Social features(user-to-user graph)======================

    #g0.Adamic-Adar score
    if flag['g0'] is True:
        print("get feature g0")
        preds = nx.adamic_adar_index(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1

    #g1.jaccard coefficient
    if flag['g1'] is True:
        print("get feature g1")
        preds = nx.jaccard_coefficient(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1
    #g2.resource_allocation
    if flag['g2'] is True:
        print("get feature g2")
        preds = nx.resource_allocation_index(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1

    #g3.preferential_attachment
    if flag['g3'] is True:
        print("get feature g3")
        preds = nx.preferential_attachment(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1

    #g4.shortest path length
    if flag['g4'] is True:
        print("get feature g4")
        cnt = 0
        for (u, v) in L:
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                if nx.has_path(G, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(G, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
                G.add_edge(u, v)
            else:
                if nx.has_path(G, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(G, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
            cnt += 1

    #g5.common neighbors
    if flag['g5'] is True:
        print("get feature g5")
        cnt = 0
        for (u, v) in L:
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                T = [w for w in nx.common_neighbors(G, u, v)]
                G.add_edge(u, v)
            else:
                T = [w for w in nx.common_neighbors(G, u, v)]
            X[cnt].append(len(T))
            cnt += 1

    #g6.Approximate katz for social graph
    if flag['g6'] is True:
        print("get feature g6")
        cnt = 0
        for (u, v) in L:
            p = 0
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                for x in G.neighbors(u):
                    for y in G.neighbors(v):
                        if x == y or G.has_edge(x, y):
                            p += 1
                G.add_edge(u, v)
            else:
                for x in G.neighbors(u):
                    for y in G.neighbors(v):
                        if x == y or G.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1

    #=========================checkin features=========================================
    #c0.follower number
    if flag['c0'] is True:
        print("get feature c0")
        cnt = 0
        for (u, v) in L:
            X[cnt].append(U[u]['follow_cnt'] * U[v]['follow_cnt'])  # fu*fv
            cnt += 1

    #c1.same time same location
    if flag['c1'] is True:
        print("get feature c1")
        cnt = 0
        for (u, v) in L:
            p = calculate_CCC(G, u, v)
            X[cnt].append(p)
            cnt += 1

    #c2.same time same distinct spot
    if flag['c2'] is True:
        print("get deature c2")
        cnt = 0
        for (u, v) in L:
            p = 0
            dis_same_spot = []
            for k in C[u]:
                if k[1] not in dis_same_spot and k in C[v]:
                    dis_same_spot.append(k[1])
                    p += 1
            X[cnt].append(p)
            cnt += 1

    #c3.same distinct spot (not necessarily same time)
    if flag['c3'] is True:
        cnt = 0
        print("get feature c3")
        for (u, v) in L:
            p = 0
            dis_same_spot = []
            for k in C[u]:
                if k[1] not in dis_same_spot:
                    for m in C[v]:
                        if k[1] == m[1]:
                            dis_same_spot.append(k[1])
                            p += 1
                            break
            X[cnt].append(p)
            cnt += 1

    #c4.min Entropy
    if flag['c4'] is True:
        print("get feature c4")
        cnt = 0
        for (u, v) in L:
            p = 0
            E_list = []
            for k in C[u]:
                if k in C[v]:
                    spot = k[1]
                    if spot in S and S[spot]['entropy'] > 0:
                        E_list.append(S[spot]['entropy'])
            if len(E_list) > 0:
                p = min(E_list)
            X[cnt].append(p)
            cnt += 1

    #c5. distance of mean_LL
    if flag['c5'] is True:
        cnt = 0
        print("get feature c5")
        for (u, v) in L:
            dist = np.sqrt((U[u]['mean_LL'][0] - U[v]['mean_LL'][0])**2 +
                           (U[u]['mean_LL'][1] - U[v]['mean_LL'][1])**2)
            X[cnt].append(dist)
            cnt += 1

    #c6.weighted same location
    if flag['c6'] is True:
        print("get feature c6")
        cnt = 0
        for (u, v) in L:
            p = 0
            for k in C[u]:
                if k in C[v]:
                    spot = k[1]
                    #if spot in S and S[spot]['entropy'] > 0:
                    #p += 1/S[spot]['entropy']
                    if spot in S:
                        dist = np.sqrt(
                            (S[spot]['LL'][0] - U[u]['mean_LL'][0])**2 +
                            (S[spot]['LL'][1] - U[u]['mean_LL'][1])**2)
                        p += dist
                        dist = np.sqrt(
                            (S[spot]['LL'][0] - U[v]['mean_LL'][0])**2 +
                            (S[spot]['LL'][1] - U[v]['mean_LL'][1])**2)
                        p += dist
            X[cnt].append(p)
            cnt += 1

    #c7.PP
    if flag['c7'] is True:
        print("get feature c7")
        cnt = 0
        for (u, v) in L:
            p = len(C[u]) * len(C[v])
            X[cnt].append(p)
            cnt += 1

    #c8.Total Common Friend Closeness (TCFC)
    if flag['c8'] is True:
        print("get feature c8")
        cnt = 0
        for (u, v) in L:
            p = 0
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                for w in nx.common_neighbors(G, u, v):
                    T1 = [x for x in nx.common_neighbors(G, u, w)]
                    T2 = [x for x in nx.common_neighbors(G, v, w)]
                    p += len(T1) * len(T2)
                G.add_edge(u, v)
            else:
                for w in nx.common_neighbors(G, u, v):
                    T1 = [x for x in nx.common_neighbors(G, u, w)]
                    T2 = [x for x in nx.common_neighbors(G, v, w)]
                    p += len(T1) * len(T2)
            X[cnt].append(p)
            cnt += 1

    #c9.Total Common friend Checkin Count (TCFCC)
    if flag['c9'] is True:
        print("get feature c9")
        cnt = 0
        for (u, v) in L:
            p = 0
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                for w in nx.common_neighbors(G, u, v):
                    p += calculate_CCC(G, u, w) * calculate_CCC(G, v, w)
                G.add_edge(u, v)
            else:
                for w in nx.common_neighbors(G, u, v):
                    p += calculate_CCC(G, u, w) * calculate_CCC(G, v, w)
            X[cnt].append(p)
            cnt += 1

    #c10. Common Category Checkin Counts Product (CCCP)
    if flag['c10'] is True:
        print("get feature c10")
        cnt = 0
        for (u, v) in L:
            p = 0
            for cat in U[u]['cate']:
                if cat in U[v]['cate']:
                    p += U[u]['cate'][cat] * U[v]['cate'][cat]
            X[cnt].append(p)
            cnt += 1

    #c11. Common Category Checkin Counts Product Ratio(CCCPR)
    if flag['c11'] is True:
        print("get feature c11")
        cnt = 0
        for (u, v) in L:
            p = 0
            u_cate_total = sum(U[u]['cate'][cat]**2 for cat in U[u]['cate'])
            v_cate_total = sum(U[v]['cate'][cat]**2 for cat in U[v]['cate'])
            for cat in U[u]['cate']:
                if cat in U[v]['cate']:
                    p += (U[u]['cate'][cat] * U[v]['cate'][cat] /
                          np.sqrt(u_cate_total * v_cate_total))
            X[cnt].append(p)
            cnt += 1

    #c12.trip route length all
    if flag['c12'] is True:
        print("get feature c12")
        cnt = 0
        for (u, v) in L:
            tripDayLen1 = list()
            tripDayLen2 = list()
            tripDay = "starting"
            tripLen = 0.0
            lastSpot = [0.0, 0.0]
            for k in C[u]:
                if not (lastSpot[0] == 0.0 and lastSpot[1] == 0.0):
                    if k[1] in S:
                        tripLen += np.sqrt((lastSpot[0] -
                                            S[k[1]]['LL'][0])**2 +
                                           (lastSpot[1] - S[k[1]]['LL'][1])**2)
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
                else:
                    if k[1] in S:
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
            tripDay = "starting"
            tripLen2 = 0.0
            lastSpot = [0.0, 0.0]
            for k in C[v]:
                if not (lastSpot[0] == 0.0 and lastSpot[1] == 0.0):
                    if k[1] in S:
                        tripLen2 += np.sqrt(
                            (lastSpot[0] - S[k[1]]['LL'][0])**2 +
                            (lastSpot[1] - S[k[1]]['LL'][1])**2)
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
                else:
                    if k[1] in S:
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
            X[cnt].append(tripLen + tripLen2)
            cnt += 1
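    # (added note: distances here are straight-line in raw lat/long degrees;
    #  a haversine distance would be more accurate for physical trip lengths,
    #  but the feature only needs relative magnitudes)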

    #========================= Heterogeneous graph features =========================

    #h0. Approximate Katz for bipartite graph
    if flag['h0'] is True:
        print("get feature h0")
        cnt = 0
        for (u, v) in L:
            p = 0
            for x in B.neighbors(u):
                for y in B.neighbors(v):
                    if x == y or B.has_edge(x, y):
                        p += 1
            X[cnt].append(p)
            cnt += 1

    #h1. Approximate Katz on HB
    if flag['h1'] is True:
        print("get feature h1")
        cnt = 0
        for (u, v) in L:
            p = 0
            had_edge = HB.has_edge(u, v)
            if had_edge:
                HB.remove_edge(u, v)  # score the pair without the edge itself
            for x in HB.neighbors(u):
                for y in HB.neighbors(v):
                    if x == y or HB.has_edge(x, y):
                        p += 1
            if had_edge:
                HB.add_edge(u, v)  # restore the graph
            X[cnt].append(p)
            cnt += 1

    #h2. Approximate Katz on H
    if flag['h2'] is True:
        print("get feature h2")
        cnt = 0
        for (u, v) in L:
            p = 0
            had_edge = H.has_edge(u, v)
            if had_edge:
                H.remove_edge(u, v)  # score the pair without the edge itself
            for x in H.neighbors(u):
                for y in H.neighbors(v):
                    if x == y or H.has_edge(x, y):
                        p += 1
            if had_edge:
                H.add_edge(u, v)  # restore the graph
            X[cnt].append(p)
            cnt += 1
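    # (added note: for h0-h2 the score counts length-2 paths (x == y) and
    #  length-3 paths (x and y adjacent) between u and v, i.e. roughly a Katz
    #  index truncated at path length 3 with all weights equal to 1)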

    #h3. Shortest path length on B
    if flag['h3'] is True:
        print("get feature h3")
        cnt = 0
        for (u, v) in L:
            if nx.has_path(B, u, v):
                # scale down by a large constant; unreachable pairs get 1
                X[cnt].append(
                    nx.shortest_path_length(B, source=u, target=v) / 50000)
            else:
                X[cnt].append(1)
            cnt += 1

    #h4. Clustering coefficient on H
    if flag['h4'] is True:
        print("get feature h4")
        cnt = 0
        for (u, v) in L:
            had_edge = H.has_edge(u, v)
            if had_edge:
                H.remove_edge(u, v)  # score the pair without the edge itself
            p = nx.clustering(H, u) * nx.clustering(H, v)
            if had_edge:
                H.add_edge(u, v)  # restore the graph
            X[cnt].append(p)
            cnt += 1

    #h5. Number of (user's loc friends)'s loc friends
    if flag['h5'] is True:
        print("get feature h5")
        cnt = 0
        for (u, v) in L:
            counter1 = 0
            for neighbor in H.neighbors(u):
                if not neighbor.isnumeric():
                    for neighbor2 in H.neighbors(neighbor):
                        if not neighbor2.isnumeric():
                            counter1 += 1
            counter2 = 0
            for neighbor in H.neighbors(v):
                if not neighbor.isnumeric():
                    for neighbor2 in H.neighbors(neighbor):
                        if not neighbor2.isnumeric():
                            counter2 += 1

            #print(str(counter1)+" "+str(counter2)+"\n")
            X[cnt].append(counter1 * counter2)
            cnt += 1
    return X
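# NOTE (added): several features above (c8, c9, h1, h2, h4) repeat the same
# "temporarily remove the candidate edge, score, restore" pattern. The context
# manager below is a sketch of how that could be factored out; it is not part
# of the original script.
import contextlib

@contextlib.contextmanager
def edge_removed(graph, u, v):
    """Temporarily remove edge (u, v) if present; restore it on exit."""
    had_edge = graph.has_edge(u, v)
    if had_edge:
        graph.remove_edge(u, v)
    try:
        yield graph
    finally:
        if had_edge:
            graph.add_edge(u, v)

# e.g. the body of c8 would then shrink to:
#     with edge_removed(G, u, v):
#         for w in nx.common_neighbors(G, u, v):
#             ...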
jaccard = np.zeros(n)
adar = np.zeros(n)
preferential_attachment = np.zeros(n)
resource_allocation_index = np.zeros(n)
common_neighbors = np.zeros(n)

# computing features for training set
for i in tqdm(range(len(id1))):
    # if this pair is a positive example, temporarily drop its edge so the
    # features do not leak the label; restore it afterwards so later pairs
    # are scored on the intact graph
    removed = False
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        G.remove_edge(id1[i], id2[i])
        removed = True

    pair = [(id1[i], id2[i])]
    jaccard[i] = next(nx.jaccard_coefficient(G, pair))[2]
    adar[i] = next(nx.adamic_adar_index(G, pair))[2]
    preferential_attachment[i] = next(nx.preferential_attachment(G, pair))[2]
    resource_allocation_index[i] = next(nx.resource_allocation_index(G, pair))[2]
    common_neighbors[i] = sum(1 for _ in nx.common_neighbors(G, id1[i], id2[i]))

    if removed:
        G.add_edge(id1[i], id2[i])
def adamic_adar(graph, output_file_name):
    # score the graph's existing edges with the Adamic-Adar index and write
    # one line per edge (formatting delegated to outputFormat, defined elsewhere)
    with open(output_file_name + "_adamic_adar", 'w') as output_file:
        for (u, v, score) in nx.adamic_adar_index(graph, graph.edges()):
            output_file.write(outputFormat(u, v, score))
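# NOTE (added): minimal usage sketch for adamic_adar. outputFormat is defined
# elsewhere in the original project; the stand-in below is hypothetical and
# only illustrates one plausible "u v score" line format.
def outputFormat(u, v, score):
    return "{} {} {:.6f}\n".format(u, v, score)

adamic_adar(nx.karate_club_graph(), "karate")  # writes "karate_adamic_adar"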
Example no. 39
# Common Neighbors
CN = [(e[0], e[1], len(list(nx.common_neighbors(M, e[0], e[1]))))
      for e in nx.non_edges(M)]
CN.sort(key=operator.itemgetter(2), reverse=True)

# Jaccard coef
jaccard = list(nx.jaccard_coefficient(M))
jaccard.sort(key=operator.itemgetter(2), reverse=True)

# Resource Allocation index
RA = list(nx.resource_allocation_index(M))
RA.sort(key=operator.itemgetter(2), reverse=True)

# Adamic-Adar index
AA = list(nx.adamic_adar_index(M))
AA.sort(key=operator.itemgetter(2), reverse=True)

# Preferential Attachement
PA = list(nx.preferential_attachment(M))
PA.sort(key=operator.itemgetter(2), reverse=True)

# Community Common Neighbors !!! requires graph to have node attribute: 'community' !!!
#CCN = list(nx.cn_soundarajan_hopcroft(M))
#CCN.sort(key=operator.itemgetter(2), reverse = True)

# Community Resource Allocation !!! requires graph to have node attribute: 'community' !!!
#CRA = list(nx.ra_index_soundarajan_hopcroft(M))
#CRA.sort(key=operator.itemgetter(2), reverse = True)
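# NOTE (added): a sketch of how the 'community' node attribute required by the
# two predictors above could be attached; greedy modularity communities are one
# reasonable choice, and integer labels are all the predictors need.
from networkx.algorithms.community import greedy_modularity_communities

for label, nodes in enumerate(greedy_modularity_communities(M)):
    for node in nodes:
        M.nodes[node]['community'] = label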

# ###################### Prediction on Future Edge Linkage ####################
Example no. 40
nx.draw_networkx(graph_1)

# In[18]:

plt.rcParams["figure.figsize"] = (20, 15)
nx.draw_networkx(graph_2,
                 node_size=np.array(list(nx.pagerank(graph_2, .5).values())) *
                 10**5)

# ## Link prediction
# Evaluate the similarity, in the sense of the Adamic/Adar index, between all pairs of non-adjacent nodes of the undirected graph `graph_1`

# In[19]:
print('\n[+] Exercise 8')

print(top_k_triplets(nx.adamic_adar_index(graph_1), 5))
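# NOTE (added): top_k_triplets is not shown in this excerpt; a plausible
# implementation, assuming it returns the k highest-scoring (u, v, score)
# triplets, could be:
import heapq

def top_k_triplets(triplets, k):
    # keep the k triplets with the largest score (third element)
    return heapq.nlargest(k, triplets, key=lambda t: t[2])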

# Implement the `generic_common_neighbors` function used by `generic_adamic_adar`.

# In[20]:


def generic_common_neighbors(g, u, v):
    """
    Intersection of u's neighbors and v's neighbors.

    :param g: networkx graph
    :param u: node
    :param v: node
    :return: list of common neighbors
    """
    common = set(g.neighbors(u)).intersection(g.neighbors(v))
    return list(common)
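# NOTE (added): generic_adamic_adar itself is not shown in this excerpt; a
# minimal sketch consistent with the helper above and with the standard
# definition (sum of 1/log(deg(w)) over common neighbors w) could be:
def generic_adamic_adar(g, u, v):
    # every common neighbor has degree >= 2, so the log is always positive
    return sum(1 / np.log(g.degree(w))
               for w in generic_common_neighbors(g, u, v))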
Example no. 41
def gen_topol_feats(A_orig, A, edge_s):
    """ 
    This function generates the topological features for matrix A (A_tr or A_ho) over edge samples edge_s (edge_tr or edge_ho).

    Input and Parameters:
    -------
    A: the training or holdout adjacency matrix that the topological features are going to be computed over
    A_orig: the original adjacency matrix
    edge_s: the sample set of training or holdout edges that the topological features are going to be computed over

    Returns:
    -------
    df_feat: data frame of features

    Examples:
    -------
    >>> gen_topol_feats(A_orig, A_tr, edge_tr)
    >>> gen_topol_feats(A_orig, A_ho, edge_ho)
    """

    _, edges = adj_to_nodes_edges(A)
    nodes = [int(iii) for iii in range(A.shape[0])]
    N = len(nodes)
    if len(edges.shape) == 1:
        edges = [(int(iii), int(jjj)) for iii, jjj in [edges]]
    else:
        edges = [(int(iii), int(jjj)) for iii, jjj in edges]

    # define graph
    G = nx.Graph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)

    # average degree (AD)
    ave_deg_net = np.sum(A) / A.shape[0]
    # spread of the degree distribution (VD); note the formula is the sample
    # standard deviation of the degrees, not the variance
    var_deg_net = np.sqrt(
        np.sum(np.square(np.sum(A, axis=0) - ave_deg_net)) / (A.shape[0] - 1))
    # average (local) clustering coefficient (ACC)
    ave_clust_net = nx.average_clustering(G)

    # samples chosen - features
    edge_pairs_f_i = edge_s[:, 0]
    edge_pairs_f_j = edge_s[:, 1]

    # local number of triangles for i and j (LNT_i, LNT_j)
    numtriang_nodes_obj = nx.triangles(G)
    numtriang_nodes = []
    for nn in range(len(nodes)):
        numtriang_nodes.append(numtriang_nodes_obj[nn])

    numtriang1_edges = []
    numtriang2_edges = []
    for ee in range(len(edge_s)):
        numtriang1_edges.append(numtriang_nodes[edge_s[ee][0]])
        numtriang2_edges.append(numtriang_nodes[edge_s[ee][1]])

    # Page rank values for i and j (PR_i, PR_j)
    page_rank_nodes_obj = nx.pagerank(G)
    page_rank_nodes = []
    for nn in range(len(nodes)):
        page_rank_nodes.append(page_rank_nodes_obj[nn])

    page_rank1_edges = []
    page_rank2_edges = []
    for ee in range(len(edge_s)):
        page_rank1_edges.append(page_rank_nodes[edge_s[ee][0]])
        page_rank2_edges.append(page_rank_nodes[edge_s[ee][1]])

    # j-th entry of the personalized page rank of node i (PPR)
    # (this runs a full PageRank once per node, so it is by far the most
    # expensive feature computed here)
    page_rank_pers_nodes = []
    hot_vec = {}
    for nn in range(len(nodes)):
        hot_vec[nn] = 0
    for nn in range(len(nodes)):
        hot_vec_copy = hot_vec.copy()
        hot_vec_copy[nn] = 1
        page_rank_pers_nodes.append(
            nx.pagerank(G, personalization=hot_vec_copy))

    page_rank_pers_edges = []
    for ee in range(len(edge_s)):
        page_rank_pers_edges.append(
            page_rank_pers_nodes[edge_s[ee][0]][edge_s[ee][1]])

    # local clustering coefficients for i and j (LCC_i, LCC_j)
    clust_nodes_obj = nx.clustering(G)
    clust_nodes = []
    for nn in range(len(nodes)):
        clust_nodes.append(clust_nodes_obj[nn])

    clust1_edges = []
    clust2_edges = []
    for ee in range(len(edge_s)):
        clust1_edges.append(clust_nodes[edge_s[ee][0]])
        clust2_edges.append(clust_nodes[edge_s[ee][1]])

    # average neighbor degrees for i and j (AND_i, AND_j)
    ave_neigh_deg_nodes_obj = nx.average_neighbor_degree(G)
    ave_neigh_deg_nodes = []
    for nn in range(len(nodes)):
        ave_neigh_deg_nodes.append(ave_neigh_deg_nodes_obj[nn])

    ave_neigh_deg1_edges = []
    ave_neigh_deg2_edges = []
    for ee in range(len(edge_s)):
        ave_neigh_deg1_edges.append(ave_neigh_deg_nodes[edge_s[ee][0]])
        ave_neigh_deg2_edges.append(ave_neigh_deg_nodes[edge_s[ee][1]])

    # degree centralities for i and j (DC_i, DC_j)
    deg_cent_nodes_obj = nx.degree_centrality(G)
    deg_cent_nodes = []
    for nn in range(len(nodes)):
        deg_cent_nodes.append(deg_cent_nodes_obj[nn])

    deg_cent1_edges = []
    deg_cent2_edges = []
    for ee in range(len(edge_s)):
        deg_cent1_edges.append(deg_cent_nodes[edge_s[ee][0]])
        deg_cent2_edges.append(deg_cent_nodes[edge_s[ee][1]])

    # eigenvector centralities for i and j (EC_i, EC_j)
    # retry with a 10x looser tolerance until the power iteration converges
    toler = 1e-6
    while True:
        try:
            eig_cent_nodes_obj = nx.eigenvector_centrality(G, tol=toler)
            break
        except nx.PowerIterationFailedConvergence:
            toler = toler * 1e1

    eig_cent_nodes = []
    for nn in range(len(nodes)):
        eig_cent_nodes.append(eig_cent_nodes_obj[nn])

    eig_cent1_edges = []
    eig_cent2_edges = []
    for ee in range(len(edge_s)):
        eig_cent1_edges.append(eig_cent_nodes[edge_s[ee][0]])
        eig_cent2_edges.append(eig_cent_nodes[edge_s[ee][1]])

    # Katz centralities for i and j (KC_i, KC_j)
    ktz_cent_nodes_obj = nx.katz_centrality_numpy(G)
    ktz_cent_nodes = []
    for nn in range(len(nodes)):
        ktz_cent_nodes.append(ktz_cent_nodes_obj[nn])

    ktz_cent1_edges = []
    ktz_cent2_edges = []
    for ee in range(len(edge_s)):
        ktz_cent1_edges.append(ktz_cent_nodes[edge_s[ee][0]])
        ktz_cent2_edges.append(ktz_cent_nodes[edge_s[ee][1]])

    # Jaccard’s coefficient of neighbor sets of i, j (JC)
    jacc_coeff_obj = nx.jaccard_coefficient(G, edge_s)
    jacc_coeff_edges = []
    for uu, vv, jj in jacc_coeff_obj:
        jacc_coeff_edges.append([uu, vv, jj])
    df_jacc_coeff = pd.DataFrame(jacc_coeff_edges,
                                 columns=['i', 'j', 'jacc_coeff'])
    df_jacc_coeff['ind'] = df_jacc_coeff.index

    # resource allocation index of i, j (RA)
    res_alloc_ind_obj = nx.resource_allocation_index(G, edge_s)
    res_alloc_ind_edges = []
    for uu, vv, jj in res_alloc_ind_obj:
        res_alloc_ind_edges.append([uu, vv, jj])
    df_res_alloc_ind = pd.DataFrame(res_alloc_ind_edges,
                                    columns=['i', 'j', 'res_alloc_ind'])
    df_res_alloc_ind['ind'] = df_res_alloc_ind.index

    # Adamic/Adar index of i, j (AA)
    adam_adar_obj = nx.adamic_adar_index(G, edge_s)
    adam_adar_edges = []
    for uu, vv, jj in adam_adar_obj:
        adam_adar_edges.append([uu, vv, jj])
    df_adam_adar = pd.DataFrame(adam_adar_edges,
                                columns=['i', 'j', 'adam_adar'])
    df_adam_adar['ind'] = df_adam_adar.index

    df_merge = pd.merge(df_jacc_coeff,
                        df_res_alloc_ind,
                        on=['ind', 'i', 'j'],
                        sort=False)
    df_merge = pd.merge(df_merge,
                        df_adam_adar,
                        on=['ind', 'i', 'j'],
                        sort=False)

    # preferential attachment (degree product) of i, j (PA)
    pref_attach_obj = nx.preferential_attachment(G, edge_s)
    pref_attach_edges = []
    for uu, vv, jj in pref_attach_obj:
        pref_attach_edges.append([uu, vv, jj])
    df_pref_attach = pd.DataFrame(pref_attach_edges,
                                  columns=['i', 'j', 'pref_attach'])
    df_pref_attach['ind'] = df_pref_attach.index

    # global features:
    # similarity of connections in the graph with respect to the node degree
    # degree assortativity (DA)
    deg_ass_net = nx.degree_assortativity_coefficient(G)
    # transitivity: fraction of all possible triangles present in G
    # network transitivity (clustering coefficient) (NT)
    transit_net = nx.transitivity(G)
    # network diameter (ND)
    try:
        diam_net = nx.diameter(G)
    except nx.NetworkXError:
        # disconnected graphs have infinite diameter
        diam_net = np.inf

    # replicate the global scalar features once per sampled edge so every
    # column below has length len(edge_s)
    ave_deg_net = [ave_deg_net for ii in range(len(edge_s))]
    var_deg_net = [var_deg_net for ii in range(len(edge_s))]
    ave_clust_net = [ave_clust_net for ii in range(len(edge_s))]
    deg_ass_net = [deg_ass_net for ii in range(len(edge_s))]
    transit_net = [transit_net for ii in range(len(edge_s))]
    diam_net = [diam_net for ii in range(len(edge_s))]
    com_ne = []
    for ee in range(len(edge_s)):
        com_ne.append(
            len(sorted(nx.common_neighbors(G, edge_s[ee][0], edge_s[ee][1]))))

    # closeness centralities for i and j (CC_i, CC_j)
    closn_cent_nodes_obj = nx.closeness_centrality(G)
    closn_cent_nodes = []
    for nn in range(len(nodes)):
        closn_cent_nodes.append(closn_cent_nodes_obj[nn])

    closn_cent1_edges = []
    closn_cent2_edges = []
    for ee in range(len(edge_s)):
        closn_cent1_edges.append(closn_cent_nodes[edge_s[ee][0]])
        closn_cent2_edges.append(closn_cent_nodes[edge_s[ee][1]])

    # shortest path between i, j (SP)
    short_Mat = dict(nx.shortest_path_length(G))
    short_path_edges = []
    for ee in range(len(edge_s)):
        if edge_s[ee][1] in short_Mat[edge_s[ee][0]].keys():
            short_path_edges.append(short_Mat[edge_s[ee][0]][edge_s[ee][1]])
        else:
            short_path_edges.append(np.inf)

    # load centralities for i and j (LC_i, LC_j)
    load_cent_nodes_obj = nx.load_centrality(G, normalized=True)
    load_cent_nodes = []
    for nn in range(len(nodes)):
        load_cent_nodes.append(load_cent_nodes_obj[nn])

    load_cent1_edges = []
    load_cent2_edges = []
    for ee in range(len(edge_s)):
        load_cent1_edges.append(load_cent_nodes[edge_s[ee][0]])
        load_cent2_edges.append(load_cent_nodes[edge_s[ee][1]])

    # shortest-path betweenness centralities for i and j (SPBC_i, SPBC_j)
    betw_cent_nodes_obj = nx.betweenness_centrality(G, normalized=True)
    betw_cent_nodes = []
    for nn in range(len(nodes)):
        betw_cent_nodes.append(betw_cent_nodes_obj[nn])

    betw_cent1_edges = []
    betw_cent2_edges = []
    for ee in range(len(edge_s)):
        betw_cent1_edges.append(betw_cent_nodes[edge_s[ee][0]])
        betw_cent2_edges.append(betw_cent_nodes[edge_s[ee][1]])

    neigh_ = {}
    for nn in range(len(nodes)):
        neigh_[nn] = np.where(A[nn, :])[0]

    # NOTE: this rebinds df_pref_attach (the DataFrame built earlier) to a
    # plain list of degree products; the feature dictionary below uses the list
    df_pref_attach = []
    for ee in range(len(edge_s)):
        df_pref_attach.append(
            len(neigh_[edge_s[ee][0]]) * len(neigh_[edge_s[ee][1]]))

    U, sig, V = np.linalg.svd(A, full_matrices=False)
    S = np.diag(sig)
    Atilda = np.dot(U, np.dot(S, V))
    Atilda = np.array(Atilda)

    f_mean = lambda x: np.mean(x) if len(x) > 0 else 0
    # entry i, j in low rank approximation (LRA) via singular value decomposition (SVD)
    svd_edges = []
    # dot product of columns i and j in LRA via SVD for each pair of nodes i, j
    svd_edges_dot = []
    # average of entries i and j’s neighbors in low rank approximation
    svd_edges_mean = []
    for ee in range(len(edge_s)):
        svd_edges.append(Atilda[edge_s[ee][0], edge_s[ee][1]])
        svd_edges_dot.append(
            np.inner(Atilda[edge_s[ee][0], :], Atilda[:, edge_s[ee][1]]))
        svd_edges_mean.append(
            f_mean(Atilda[edge_s[ee][0], neigh_[edge_s[ee][1]]]))

    # Leicht-Holme-Newman index of neighbor sets of i, j (LHN)
    f_LHN = lambda num, den: 0 if (num == 0 and den == 0) else float(num) / den
    LHN_edges = [
        f_LHN(num, den)
        for num, den in zip(np.array(com_ne), np.array(df_pref_attach))
    ]

    U, sig, V = np.linalg.svd(A)
    S = linalg.diagsvd(sig, A.shape[0], A.shape[1])
    S_trunc = S.copy()
    S_trunc[S_trunc < sig[int(np.ceil(np.sqrt(A.shape[0])))]] = 0
    Atilda = np.dot(np.dot(U, S_trunc), V)
    Atilda = np.array(Atilda)

    f_mean = lambda x: np.mean(x) if len(x) > 0 else 0
    # an approximation of LRA (LRA-approx)
    svd_edges_approx = []
    # an approximation of dLRA (dLRA-approx)
    svd_edges_dot_approx = []
    # an approximation of mLRA (mLRA-approx)
    svd_edges_mean_approx = []
    for ee in range(len(edge_s)):
        svd_edges_approx.append(Atilda[edge_s[ee][0], edge_s[ee][1]])
        svd_edges_dot_approx.append(
            np.inner(Atilda[edge_s[ee][0], :], Atilda[:, edge_s[ee][1]]))
        svd_edges_mean_approx.append(
            f_mean(Atilda[edge_s[ee][0], neigh_[edge_s[ee][1]]]))

    # number of nodes (N)
    num_nodes = A_orig.shape[0]
    # number of observed edges (OE)
    num_edges = int(np.sum(A) / 2)

    # construct a dictionary of the features
    d = {'i':edge_pairs_f_i, 'j':edge_pairs_f_j, 'com_ne':com_ne, 'ave_deg_net':ave_deg_net, \
         'var_deg_net':var_deg_net, 'ave_clust_net':ave_clust_net, 'num_triangles_1':numtriang1_edges, 'num_triangles_2':numtriang2_edges, \
         'page_rank_pers_edges':page_rank_pers_edges, 'pag_rank1':page_rank1_edges, 'pag_rank2':page_rank2_edges, 'clust_coeff1':clust1_edges, 'clust_coeff2':clust2_edges, 'ave_neigh_deg1':ave_neigh_deg1_edges, 'ave_neigh_deg2':ave_neigh_deg2_edges,\
         'eig_cent1':eig_cent1_edges, 'eig_cent2':eig_cent2_edges, 'deg_cent1':deg_cent1_edges, 'deg_cent2':deg_cent2_edges, 'clos_cent1':closn_cent1_edges, 'clos_cent2':closn_cent2_edges, 'betw_cent1':betw_cent1_edges, 'betw_cent2':betw_cent2_edges, \
         'load_cent1':load_cent1_edges, 'load_cent2':load_cent2_edges, 'ktz_cent1':ktz_cent1_edges, 'ktz_cent2':ktz_cent2_edges, 'pref_attach':df_pref_attach, 'LHN':LHN_edges, 'svd_edges':svd_edges,'svd_edges_dot':svd_edges_dot,'svd_edges_mean':svd_edges_mean,\
         'svd_edges_approx':svd_edges_approx,'svd_edges_dot_approx':svd_edges_dot_approx,'svd_edges_mean_approx':svd_edges_mean_approx, 'short_path':short_path_edges, 'deg_assort':deg_ass_net, 'transit_net':transit_net, 'diam_net':diam_net, \
         'num_nodes':num_nodes, 'num_edges':num_edges}

    # construct a dataframe of the features
    df_feat = pd.DataFrame(data=d)
    df_feat['ind'] = df_feat.index
    df_feat = pd.merge(df_feat, df_merge, on=['ind', 'i', 'j'], sort=False)
    return df_feat
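# NOTE (added): a self-contained sketch of the low-rank-approximation (LRA)
# features computed above, on a toy adjacency matrix; all names here are local
# to the sketch.
import numpy as np

A_toy = np.array([[0, 1, 1, 0],
                  [1, 0, 1, 0],
                  [1, 1, 0, 1],
                  [0, 0, 1, 0]], dtype=float)
U_, sig_, V_ = np.linalg.svd(A_toy, full_matrices=False)
A_lra = U_ @ np.diag(sig_) @ V_            # reconstruction from the SVD
i, j = 0, 3
print(A_lra[i, j])                         # LRA entry for the pair (i, j)
print(np.inner(A_lra[i, :], A_lra[:, j]))  # dLRA: dot-product feature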
Example no. 42
import functools

def sort_edges_by_adamic_adar_index(graph, edges):
    # Python 3: sorted() no longer takes cmp=; wrap the tie-aware comparator
    # with functools.cmp_to_key and compare on the score (third element)
    edges_sorted = sorted(nx.adamic_adar_index(graph, edges),
                          key=functools.cmp_to_key(
                              lambda a, b: compare_with_ties(a[2], b[2])),
                          reverse=True)
    return ([(row[0], row[1]) for row in edges_sorted],
            [row[2] for row in edges_sorted])
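# NOTE (added): usage sketch with a hypothetical comparator; the original
# compare_with_ties is not shown in this excerpt. Here near-equal scores are
# treated as ties (returning 0 leaves their relative order to the stable sort).
import networkx as nx

def compare_with_ties(a, b):
    if abs(a - b) < 1e-12:
        return 0
    return -1 if a < b else 1

g = nx.karate_club_graph()
pairs, scores = sort_edges_by_adamic_adar_index(g, list(nx.non_edges(g)))
print(pairs[:5], scores[:5])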