Example 1
def link_pred(G, seeds, fname):
	print(str(fname) + '.txt')
	expansion = []
	for node in G.node:
		G.node[node]['community'] = 1

	for seed in seeds:
		if seed in G.node.keys(): 
			G.node[seed]['community'] = 0

	count = 0
	cutoff = 10
	subgraph = G.subgraph(seeds)

	#seeds = list(set(subgraph.node) - set(seeds))


	preds = nx.cn_soundarajan_hopcroft(G, subgraph.edges())

	for u, v, p in preds:
		if p > cutoff:
			cutoff = p 

	print('cutoff:{0}'.format(cutoff))
	nodes = []

	for seed in seeds:
		nodes += G.neighbors(seed)

	node = set()
	candidate_edges = list(set(G.subgraph(nodes).edges()) - set(subgraph.edges()))
	preds = nx.cn_soundarajan_hopcroft(G, candidate_edges)
	for u, v, p in preds:
		if p < cutoff * 8:
			continue
		expansion.append((u, v))
		node.add(u)
		node.add(v)

	#nx.write_edgelist(subgraph, "result/origingraph/{0}.edgelist".format(fname))
	origin_graph = open("result/origingraph/{0}.edgelist".format(fname), 'w')
	for u, v in subgraph.edges():
		origin_graph.write("{0}\t{1}\n".format(u, v))
	origin_graph.close()

	node_file = open('result/node/{0}.txt'.format(fname), 'w')
	for item in node:
		node_file.write('{0}\n'.format(item))
	node_file.close()

	edge_file = open('result/expanedge/{0}.txt'.format(fname), 'w')
	for u, v in expansion:
		edge_file.write('{0}\t{1}\n'.format(u, v))
	edge_file.close()
	
	print(len(node))
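A hedged usage sketch for link_pred above; the graph, seed list, and output name are illustrative, and the result/ subdirectories must exist. Note that these snippets use the pre-2.4 NetworkX G.node API.

import os
import networkx as nx

# Hypothetical output layout matching the paths hard-coded in link_pred.
for sub in ('origingraph', 'node', 'expanedge'):
    os.makedirs('result/' + sub, exist_ok=True)

G = nx.karate_club_graph()   # stand-in graph; any undirected graph works
seeds = [0, 1, 2, 3]         # hypothetical seed nodes
link_pred(G, seeds, 'demo')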
Example 2
def new_connections_predictions():
    pref_attach = list(nx.preferential_attachment(G))
    df = pd.DataFrame(index=[(x[0], x[1]) for x in pref_attach])
    df['pref_attach'] = [x[2] for x in pref_attach]

    common_neigh = [(e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1]))))
                    for e in nx.non_edges(G)]
    df1 = pd.DataFrame(index=[(x[0], x[1]) for x in common_neigh])
    df1['common_neigh'] = [x[2] for x in common_neigh]
    df = df.join(df1, how='outer')
    df['common_neigh'] = df['common_neigh'].fillna(value=0)
    del df1

    community_common_neigh = list(
        nx.cn_soundarajan_hopcroft(G, community='Department'))
    df1 = pd.DataFrame(index=[(x[0], x[1]) for x in community_common_neigh])
    df1['community_common_neigh'] = [x[2] for x in community_common_neigh]
    df = df.join(df1, how='outer')
    df['community_common_neigh'] = df['community_common_neigh'].fillna(value=0)
    del df1

    community_res_alloc = list(
        nx.ra_index_soundarajan_hopcroft(G, community='Department'))
    df1 = pd.DataFrame(index=[(x[0], x[1]) for x in community_res_alloc])
    df1['community_res_alloc'] = [x[2] for x in community_res_alloc]
    df = df.join(df1, how='outer')
    df['community_res_alloc'] = df['community_res_alloc'].fillna(value=0)
    del df1

    df['res_alloc'] = [x[2] for x in list(nx.resource_allocation_index(G))]
    df['jaccard_coeff'] = [x[2] for x in list(nx.jaccard_coefficient(G))]

    features = [
        'jaccard_coeff', 'res_alloc', 'pref_attach', 'common_neigh',
        'community_common_neigh', 'community_res_alloc'
    ]

    df = future_connections.join(df, how='outer')
    df_train = df[~pd.isnull(df['Future Connection'])]
    df_test = df[pd.isnull(df['Future Connection'])]

    X_train = df_train[features]
    X_test = df_test[features]
    y_train = df_train['Future Connection']

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = RandomForestClassifier(n_estimators=100,
                                 n_jobs=-1,
                                 max_depth=10,
                                 random_state=0).fit(X_train_scaled, y_train)
    test_proba = clf.predict_proba(X_test_scaled)[:, 1]
    predictions = pd.Series(test_proba, X_test.index)
    # target = future_connections[pd.isnull(future_connections['Future Connection'])]
    # target['proba'] = [predictions[x] for x in target.index]
    return predictions
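Most of the new_connections_predictions variants in these examples assume shared notebook globals rather than parameters. A minimal setup sketch; the CSV name and converters follow Example 13, while the graph file name is a guess, so treat the details as assumptions:

import networkx as nx
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# G: undirected graph whose nodes carry a 'Department' attribute.
G = nx.read_gpickle('email_prediction.txt')  # hypothetical pickled graph

# future_connections: indexed by node pairs; 'Future Connection' holds
# 1.0/0.0 for known outcomes and NaN for the pairs to predict.
future_connections = pd.read_csv('Future_Connections.csv',
                                 index_col=0, converters={0: eval})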
Example 3
def new_connections_predictions():

    # Your Code Here
    for n in G.nodes():
        G.node[n]['community'] = G.node[n]['Department']
    #df = pd.DataFrame(index=[(x[0], x[1]) for x in list(nx.preferential_attachment(G))])
    future_connections['common_neighbors'] = [
        len(list(nx.common_neighbors(G, x[0], x[1])))
        for x in future_connections.index
    ]
    future_connections['jaccard_coefficient'] = [
        list(nx.jaccard_coefficient(G, [x]))[0][2]
        for x in future_connections.index
    ]
    future_connections['resource_allocation_index'] = [
        list(nx.resource_allocation_index(G, [x]))[0][2]
        for x in future_connections.index
    ]
    future_connections['adamic_adar_index'] = [
        list(nx.adamic_adar_index(G, [x]))[0][2]
        for x in future_connections.index
    ]
    future_connections['preferential_attachment'] = [
        list(nx.preferential_attachment(G, [x]))[0][2]
        for x in future_connections.index
    ]
    future_connections['cn_soundarajan_hopcroft'] = [
        list(nx.cn_soundarajan_hopcroft(G, [x]))[0][2]
        for x in future_connections.index
    ]
    #future_connections['ra_soundarajan_hopcroft'] = [list(nx.ra_index_soundarajan_hopcroft(G, [x]))[0][2] for x in future_connections.index]
    future_connections['cn_soundarajan_hopcroft'] = future_connections[
        'cn_soundarajan_hopcroft'].fillna(value=0)
    #future_connections['ra_soundarajan_hopcroft'] = future_connections['ra_soundarajan_hopcroft'].fillna(value=0)
    #future_connections.join(df,how='outer')

    features = [
        'jaccard_coefficient', 'resource_allocation_index',
        'adamic_adar_index', 'preferential_attachment',
        'cn_soundarajan_hopcroft'
    ]

    X_train = future_connections.loc[
        ~pd.isnull(future_connections['Future Connection']), features]
    y_train = future_connections.loc[
        ~pd.isnull(future_connections['Future Connection']),
        'Future Connection']
    X_test = future_connections.loc[(
        pd.isnull(future_connections['Future Connection'])), features]

    classifier = MLPClassifier(hidden_layer_sizes=[10, 5],
                               solver='lbfgs',
                               alpha=10)
    classifier.fit(X_train, y_train)

    y_predicted = classifier.predict_proba(X_test)[:, 1]

    return pd.Series(y_predicted, X_test.index)  # Your Answer Here
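Scoring one pair per call, as above, walks the graph once per row; each nx link-prediction function also accepts the whole iterable of pairs, so a single pass per metric is usually faster. A sketch of the batched variant for one of the columns above:

jc = nx.jaccard_coefficient(G, future_connections.index)
future_connections['jaccard_coefficient'] = [p for u, v, p in jc]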
Example 4
def new_connections_predictions():

    # Your Code Here
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split
    for node in G.nodes():
        G.node[node]['community'] = G.node[node]['Department']
    preferential_attachment = list(nx.preferential_attachment(G))
    df = pd.DataFrame(index=[(x[0], x[1]) for x in preferential_attachment])
    df['preferential_attachment'] = [x[2] for x in preferential_attachment]
    cn_soundarajan_hopcroft = list(nx.cn_soundarajan_hopcroft(G))
    df_cn_soundarajan_hopcroft = pd.DataFrame(
        index=[(x[0], x[1]) for x in cn_soundarajan_hopcroft])
    df_cn_soundarajan_hopcroft['cn_soundarajan_hopcroft'] = [
        x[2] for x in cn_soundarajan_hopcroft
    ]
    df = df.join(df_cn_soundarajan_hopcroft, how='outer')
    df['cn_soundarajan_hopcroft'] = df['cn_soundarajan_hopcroft'].fillna(
        value=0)
    df['resource_allocation_index'] = [
        x[2] for x in list(nx.resource_allocation_index(G))
    ]
    df['jaccard_coefficient'] = [x[2] for x in list(nx.jaccard_coefficient(G))]
    df = future_connections.join(df, how='outer')
    df_train = df[~pd.isnull(df['Future Connection'])]
    df_test = df[pd.isnull(df['Future Connection'])]
    features = [
        'cn_soundarajan_hopcroft', 'preferential_attachment',
        'resource_allocation_index', 'jaccard_coefficient'
    ]
    df_test = df_test[features]
    X_train, X_test, y_train, y_test = train_test_split(
        df_train[features],
        df_train['Future Connection'],
        random_state=0,
        test_size=0.5)
    clf_RF = RandomForestClassifier(max_features=3,
                                    random_state=0,
                                    max_depth=3,
                                    min_samples_leaf=3,
                                    criterion='entropy')
    clf_RF.fit(X_train, y_train)
    clf_GDBT = GradientBoostingClassifier(learning_rate=0.01,
                                          max_depth=8,
                                          random_state=0,
                                          n_estimators=30)
    clf_GDBT.fit(X_train, y_train)
    roc_score_forest = roc_auc_score(y_test,
                                     clf_RF.predict_proba(X_test)[:, 1])
    roc_score = roc_auc_score(y_test, clf_GDBT.predict_proba(X_test)[:, 1])
    print(roc_score_forest)
    print(roc_score)
    #test_proba = clf_RF.predict_proba(X_test)[:, 1]
    preds = pd.Series(data=clf_GDBT.predict_proba(df_test)[:, 1],
                      index=df_test.index)

    return preds  # Your Answer Here
Example 5
def new_connections_predictions():

    pref_atch = list(nx.preferential_attachment(G))
    new_df = pd.DataFrame(index=[(x[0], x[1]) for x in pref_atch])
    new_df["PrefrentialAttachment"] = [x[2] for x in pref_atch]

    cn_soundarajan_hopcroft = list(
        nx.cn_soundarajan_hopcroft(G, community="Department"))
    df_cn_soundarajan_hopcroft = pd.DataFrame(
        index=[(x[0], x[1]) for x in cn_soundarajan_hopcroft])
    df_cn_soundarajan_hopcroft['CommunityCommonNeighbor'] = [
        x[2] for x in cn_soundarajan_hopcroft
    ]
    new_df = new_df.join(df_cn_soundarajan_hopcroft, how='outer')
    new_df['CommunityCommonNeighbor'] = new_df[
        'CommunityCommonNeighbor'].fillna(value=0)

    res_alo = list(nx.resource_allocation_index(G))
    df_res_alo = pd.DataFrame(index=[(x[0], x[1]) for x in res_alo])
    df_res_alo["ResourceAllocationIndex"] = [x[2] for x in res_alo]
    new_df = new_df.join(df_res_alo, how='outer')
    new_df['ResourceAllocationIndex'] = new_df[
        'ResourceAllocationIndex'].fillna(value=0)

    jac_coef = list(nx.jaccard_coefficient(G))
    df_jac_coef = pd.DataFrame(index=[(x[0], x[1]) for x in jac_coef])
    df_jac_coef["JaccardCoeffiecient"] = [x[2] for x in jac_coef]
    new_df = new_df.join(df_jac_coef, how='outer')
    new_df['JaccardCoeffiecient'] = new_df['JaccardCoeffiecient'].fillna(
        value=0)

    new_df = new_df.join(future_connections, how='outer')
    train_df = new_df[~new_df["Future Connection"].isnull()]
    test_df = new_df[new_df["Future Connection"].isnull()]

    features = [
        "PreferentialAttachment", "CommunityCommonNeighbor",
        "ResourceAllocationIndex", "JaccardCoefficient"
    ]
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df["Future Connection"]

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = MLPClassifier(hidden_layer_sizes=[10, 5],
                        alpha=5,
                        random_state=0,
                        solver='lbfgs',
                        verbose=0)
    clf.fit(X_train_scaled, y_train)
    rslt = clf.predict_proba(X_test_scaled)[:, 1]
    final_rslt = pd.Series(rslt, index=X_test.index)

    return final_rslt
Example 6
def soundarajan_hopcroft(G, X):
    soundarajan = []
    for i in range(X.shape[0]):
        try:
            coef = [[u, v, p] for u, v, p in nx.cn_soundarajan_hopcroft(
                G, [(X[i][0], X[i][1])])][0]
            soundarajan.append(coef[2])
        except Exception:  # typically a node missing its 'community' attribute
            soundarajan.append(0)
    return soundarajan
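A hedged usage sketch for the helper above; the toy graph, community labels, and pair array are illustrative:

import numpy as np
import networkx as nx

G = nx.path_graph(4)                # edges: 0-1, 1-2, 2-3
for n in G.nodes():
    G.node[n]['community'] = n % 2  # toy community labels
X = np.array([[0, 2], [1, 3]])      # candidate pairs, one per row
print(soundarajan_hopcroft(G, X))   # [1, 1]: one common neighbor each, no community bonus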
Example 7
def calc_distribution(G, newG):
    global newG_E
    newG_E = sample_edges(newG)

    Ja = list(nx.jaccard_coefficient(G, newG_E))
    pickle.dump(Ja, open(Ja_file, 'wb'))
    Ja = list(nx.jaccard_coefficient(G, sample_missing_edges(newG, G)))
    pickle.dump(Ja, open(to_negative(Ja_file), 'wb'))

    calc_PA()

    CN = list(nx.cn_soundarajan_hopcroft(G, newG_E))  # long time
    pickle.dump(CN, open(CN_file, 'wb'))
    CN = list(nx.cn_soundarajan_hopcroft(G, sample_missing_edges(newG, G)))
    pickle.dump(CN, open(to_negative(CN_file), 'wb'))

    AA = list(nx.adamic_adar_index(G, newG_E))
    pickle.dump(AA, open(AA_file, 'wb'))
    AA = list(nx.adamic_adar_index(G, sample_missing_edges(newG, G)))
    pickle.dump(AA, open(to_negative(AA_file), 'wb'))
Example 8
def generate_positive_features():
    features = []
    count = 0
    print("Generating positive features......")
    for sample in positive_samples:
        if (count % 100 == 0):
            print(count)
        count += 1
        feature = []
        try:
            preds = nx.resource_allocation_index(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.jaccard_coefficient(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.adamic_adar_index(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.preferential_attachment(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.cn_soundarajan_hopcroft(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.ra_index_soundarajan_hopcroft(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.within_inter_cluster(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            feature.append(1)  # label=1

        except Exception:
            print("one error at: " + str(count))
            continue  # skip the partially built feature vector
        features.append(feature)
    print("positive features: " + str(len(features)))
    return features
Example 9
def new_connections_predictions():
    for node in G.nodes():
        G.node[node]['community'] = G.node[node]['Department']
    preferential_attachment = list(nx.preferential_attachment(G))
    df = pd.DataFrame(index=[(x[0], x[1]) for x in preferential_attachment])
    df['preferential_attachment'] = [x[2] for x in preferential_attachment]
    cn_soundarajan_hopcroft = list(nx.cn_soundarajan_hopcroft(G))
    df_cn_soundarajan_hopcroft = pd.DataFrame(
        index=[(x[0], x[1]) for x in cn_soundarajan_hopcroft])
    df_cn_soundarajan_hopcroft['cn_soundarajan_hopcroft'] = [
        x[2] for x in cn_soundarajan_hopcroft
    ]
    df = df.join(df_cn_soundarajan_hopcroft, how='outer')
    df['cn_soundarajan_hopcroft'] = df['cn_soundarajan_hopcroft'].fillna(
        value=0)
    df['resource_allocation_index'] = [
        x[2] for x in list(nx.resource_allocation_index(G))
    ]
    df['jaccard_coefficient'] = [x[2] for x in list(nx.jaccard_coefficient(G))]
    df = future_connections.join(df, how='outer')
    df_train = df[~pd.isnull(df['Future Connection'])]
    df_test = df[pd.isnull(df['Future Connection'])]
    features = [
        'cn_soundarajan_hopcroft', 'preferential_attachment',
        'resource_allocation_index', 'jaccard_coefficient'
    ]
    X_train = df_train[features]
    Y_train = df_train['Future Connection']
    X_test = df_test[features]
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    clf = MLPClassifier(hidden_layer_sizes=[10, 5],
                        alpha=5,
                        random_state=0,
                        solver='lbfgs',
                        verbose=0)
    clf.fit(X_train_scaled, Y_train)
    test_proba = clf.predict_proba(X_test_scaled)[:, 1]
    predictions = pd.Series(test_proba, X_test.index)
    target = future_connections[pd.isnull(
        future_connections['Future Connection'])].copy()
    target['prob'] = [predictions[x] for x in target.index]
    return target['prob']
Example 10
def new_connections_predictions():
    for node in G.nodes():
        G.node[node]["community"] = G.node[node]["Department"]
        
    preferential_attachment = list(nx.preferential_attachment(G))
    df_preferential_attachment = pd.DataFrame(index=[(x[0], x[1]) for x in preferential_attachment])
    df_preferential_attachment["preferential_attachment"] = [x[2] for x in preferential_attachment]
    
    cn_soundarajan_hopcroft = list(nx.cn_soundarajan_hopcroft(G))
    df_cn_soundarajan_hopcroft = pd.DataFrame(index=[(x[0], x[1]) for x in cn_soundarajan_hopcroft])
    df_cn_soundarajan_hopcroft["cn_soundarajan_hopcroft"] = [x[2] for x in cn_soundarajan_hopcroft]
    
    df = df_preferential_attachment.join(df_cn_soundarajan_hopcroft, how="outer")
    
    df["cn_soundarajan_hopcroft"] = df["cn_soundarajan_hopcroft"].fillna(value=0)
    df["resource_allocation_index"] = [x[2] for x in list(nx.resource_allocation_index(G))]
    df["jaccard_coefficient"] = [x[2] for x in list(nx.jaccard_coefficient(G))]
    
    df = future_connections.join(df, how="outer")
    
    df["Future Connection"] = df["Future Connection"].fillna(-1)
    future_connections["Future Connection"] = future_connections["Future Connection"].fillna(-1)
    
    features = ["cn_soundarajan_hopcroft", "preferential_attachment", "resource_allocation_index", "jaccard_coefficient"]
    X_train = df[df["Future Connection"]!=-1][features]
    y_train = df[df["Future Connection"]!=-1]["Future Connection"]
    X_test = df[df["Future Connection"]==-1][features]        
    
    scaler = MinMaxScaler()    
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = MLPClassifier(alpha=5, random_state=0, solver="lbfgs").fit(X_train_scaled, y_train)
        
    predictions = clf.predict_proba(X_test_scaled)[:, 1]
    predictions_formatted = pd.Series(predictions, X_test.index)

    result = future_connections[future_connections["Future Connection"] == -1].copy()
    result["probability"] = [predictions_formatted[x] for x in result.index]
    return result["probability"]
Example 11
 def cn_soundarajan_hopcroft(self):
     return list(
         nx.cn_soundarajan_hopcroft(self.graph,
                                    [(self.node_1, self.node_2)]))[0][2]
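The method above evidently belongs to a pair-scoring wrapper class; a minimal sketch of the assumed surrounding class (the class name and constructor are guesses):

import networkx as nx

class PairScorer:
    # Hypothetical host class: a graph plus the two endpoints to score.
    # Graph nodes must carry a 'community' attribute.
    def __init__(self, graph, node_1, node_2):
        self.graph = graph
        self.node_1 = node_1
        self.node_2 = node_2

    def cn_soundarajan_hopcroft(self):
        return list(
            nx.cn_soundarajan_hopcroft(self.graph,
                                       [(self.node_1, self.node_2)]))[0][2]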
Example 12
import networkx as nx
import operator

G = nx.from_edgelist([('A', 'C'), ('A', 'E'), ('A', 'D'), ('B', 'D'), ('C', 'G'),
                      ('D', 'G'), ('D', 'H'), ('D', 'E'), ('E', 'H'), ('H', 'F')])

res_alloc = list(nx.resource_allocation_index(G))
print(sorted(res_alloc, key=operator.itemgetter(2), reverse=True))

pref_attach = list(nx.preferential_attachment(G))
print(sorted(pref_attach, key=operator.itemgetter(2), reverse=True))

G.node['A']['community'] = 0
G.node['B']['community'] = 0
G.node['C']['community'] = 0
G.node['D']['community'] = 0
G.node['G']['community'] = 0
G.node['F']['community'] = 1
G.node['E']['community'] = 1
G.node['H']['community'] = 1

community_common_neigh = list(nx.cn_soundarajan_hopcroft(G))
print(sorted(community_common_neigh, key=operator.itemgetter(2), reverse=True))
community_res_alloc = list(nx.ra_index_soundarajan_hopcroft(G))
print(sorted(community_res_alloc, key=operator.itemgetter(2), reverse=True))

Example 13
def new_connections_predictions():
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score

    future_connections = pd.read_csv(path
                                     + 'Future_Connections.csv',
                                     index_col=0,
                                     converters={0:eval})

    def communities(row):
        """
        Check to whether are in the same department or notself.
        Vectorized for rows, use with pd.DataFrame.apply(x, axis=1)
        """
        nodes = row.name
        a = nodes[0]
        b = nodes[1]
        comm_a = G.node[a]['Department']
        comm_b = G.node[b]['Department']
        if comm_a == comm_b:
            return 1
        else:
            return 0

    future_connections['same_comm'] = future_connections.apply(communities,
                                                               axis=1)
    # For Soundarajan-Hopcroft algorithms.
    for node in G.nodes():
        G.node[node]['community'] = G.node[node]['Department']

    pa = list(nx.preferential_attachment(G))
    pa_df = pd.DataFrame(index=[(i[0], i[1]) for i in pa],
                         data={'pref_att':[i[2] for i in pa]})

    cn = [(e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1]))))
          for e in nx.non_edges(G)]
    cn_df = pd.DataFrame(index=[(i[0], i[1]) for i in cn],
                         data={'comm_neigh':[i[2] for i in cn]})

    cnsh = list(nx.cn_soundarajan_hopcroft(G))
    cnsh_df = pd.DataFrame(index=[(i[0], i[1]) for i in cnsh],
                         data={'sh_comm_neigh':[i[2] for i in cnsh]})

    ra = list(nx.resource_allocation_index(G))
    ra_df = pd.DataFrame(index=[(i[0], i[1]) for i in ra],
                         data={'reso_alloc':[i[2] for i in ra]})

    rash = list(nx.ra_index_soundarajan_hopcroft(G))
    rash_df = pd.DataFrame(index=[(i[0], i[1]) for i in rash],
                         data={'sh_reso_alloc':[i[2] for i in rash]})

    jc = [i for i in nx.jaccard_coefficient(G)]
    jc_df = pd.DataFrame(index=[(i[0], i[1]) for i in jc],
                         data={'jacc_coeff':[i[2] for i in jc]})

    for df in [pa_df, cn_df, cnsh_df, ra_df, rash_df, jc_df]:
        future_connections = future_connections.merge(df, how='left',
                                                      left_index=True,
                                                      right_index=True)

    keep = future_connections[~future_connections['Future Connection'].isnull()]
    hold = future_connections[future_connections['Future Connection'].isnull()]

    X_keep = keep.drop('Future Connection', axis=1)
    y_keep = keep['Future Connection']
    X_hold = hold.drop('Future Connection', axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X_keep, y_keep,
                                                        random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)

    # Check on ROC_AUC performance.
    roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

    probs = clf.predict_proba(X_hold)[:, 1]
    answer = pd.Series(index=X_hold.index,
                       data=probs)
    return answer
Example 14
G = nx.read_edgelist("./data/drugbank_interactions.tsv",
                     delimiter="\t",
                     nodetype=str)

partition = community.best_partition(G)
nx.set_node_attributes(G, name='community', values=partition)

ap = list(all_pairs(G.nodes()))

cn = cn.cnbors(G, ap)
rai = nx.resource_allocation_index(G, ap)
jc = nx.jaccard_coefficient(G, ap)
aai = nx.adamic_adar_index(G, ap)
pa = nx.preferential_attachment(G, ap)
ccn = nx.cn_soundarajan_hopcroft(G, ap)
cra = nx.ra_index_soundarajan_hopcroft(G, ap)
wic = nx.within_inter_cluster(G, ap, community='community')

u, v, s1, s2, s3, s4, s5, s6, s7, s8, has_edge = ([] for i in range(11))
for m1, m2, m3, m4, m5, m6, m7, m8 in zip(cn, rai, jc, aai, pa, ccn, cra, wic):
    u.append(m1[0])
    v.append(m1[1])
    s1.append(m1[2])
    s2.append(m2[2])
    s3.append(m3[2])
    s4.append(m4[2])
    s5.append(m5[2])
    s6.append(m6[2])
    s7.append(m7[2])
    s8.append(m8[2])
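all_pairs and cn.cnbors in Example 14 come from modules not shown (`community` is the python-louvain package); plausible stand-ins under that assumption:

from itertools import combinations
import networkx as nx

def all_pairs(nodes):
    # Every unordered node pair, matching how `ap` is consumed above.
    return combinations(list(nodes), 2)

def cnbors(G, pairs):
    # Yield (u, v, number-of-common-neighbors), mirroring the nx generators.
    return ((u, v, len(list(nx.common_neighbors(G, u, v)))) for u, v in pairs)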
Example 15
    def compute_variable(self,
                         variable_name,
                         train: bool,
                         load=True,
                         path_to_file=None,
                         save=True):

        assert variable_name in self.handled_variables, "Variable %s is not handled. Handled variables are : %s" % (
            variable_name, str(self.handled_variables))

        if load and train:
            if path_to_file is None and os.path.isfile(
                    "variables/%s.npy" % variable_name):
                print("Loading STANDARD %s file!" % variable_name)
                result = np.load("variables/%s.npy" % variable_name)
                return result[:self.nb_training_samples]
            elif path_to_file is not None and os.path.isfile(path_to_file):
                print("Loading CUSTOM %s file!" % variable_name)
                result = np.load(path_to_file)
                return result[:self.nb_training_samples]
            print("Did not find saved %s in `variables` folder." %
                  variable_name)

        if load and not train:
            if path_to_file is None and os.path.isfile(
                    "variables/TEST_%s.npy" % variable_name):
                print("Loading STANDARD TEST_%s file!" % variable_name)
                result = np.load("variables/TEST_%s.npy" % variable_name)
                return result[:self.nb_testing_samples]
            elif path_to_file is not None and os.path.isfile(path_to_file):
                print("Loading CUSTOM %s file!" % variable_name)
                result = np.load(path_to_file)
                return result[:self.nb_testing_samples]
            print("Did not find saved TEST_%s in `variables` folder." %
                  variable_name)

        print("Starting computation of %s..." % variable_name)
        t1 = time()
        gd = self.graph_structure.graph_dicts  # graph dictionaries
        if train:
            nb_of_samples = self.nb_training_samples
        else:
            nb_of_samples = self.nb_testing_samples
        result = np.zeros(shape=nb_of_samples)
        for i in range(nb_of_samples):
            if train:
                t = self.train_array[i]
            else:
                t = self.test_array[i]
            if variable_name == "publication_2":
                result[i] = np.log(
                    len(
                        set(self.node_information.loc[t[0], "publication_2"])
                        & set(self.node_information.loc[t[1],
                                                        "publication_2"])) + 1)
            elif variable_name == "adam_coeff":
                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = \
                            next(nx.algorithms.link_prediction.adamic_adar_index(self.graph_structure.g,
                                                                                 [(t[0], t[1])]))[2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = \
                            next(nx.algorithms.link_prediction.adamic_adar_index(self.graph_structure.g,
                                                                                 [(t[0], t[1])]))[2]
                else:
                    result[i] = \
                        next(nx.algorithms.link_prediction.adamic_adar_index(self.graph_structure.g, [(t[0], t[1])]))[2]
            elif variable_name == "overlapping_words_in_title":
                result[i] = compute_intersection(
                    self.node_information.loc[t[0], "title"],
                    self.node_information.loc[t[1], "title"], self.stemmer,
                    self.stpwds)
            elif variable_name == "number_of_common_authors":
                result[i] = nbr_common_authors(
                    self.node_information.loc[t[0], "author"],
                    self.node_information.loc[t[1], "author"])

            elif variable_name == "difference_of_years":
                result[i] = abs(self.node_information.loc[t[0], 'year'] -
                                self.node_information.loc[t[1], 'year'])

            elif variable_name == "affinity_between_authors":
                result[i] = compute_affinity_between_authors(
                    self.node_information.loc[t[0], 'author'],
                    self.node_information.loc[t[1],
                                              'author'], self.authors_dict)
            elif variable_name == "identical_journal":
                result[i] = np.int(self.node_information.loc[t[0], 'journal']
                                   == self.node_information.loc[t[1],
                                                                'journal'])

            elif variable_name == "l2_distance":
                result[i] = np.linalg.norm(
                    self.node_information.loc[t[0], 'wv'] -
                    self.node_information.loc[t[1], 'wv'])

            elif variable_name == "cosine_distance_tfid":
                v1 = self.node_information.loc[t[0], "wv_tfid"]
                v2 = self.node_information.loc[t[1], "wv_tfid"]
                try:
                    b1 = np.isnan(v1)
                except TypeError:
                    b1 = False
                try:
                    b2 = np.isnan(v2)
                except TypeError:
                    b2 = False
                if not b1 and not b2:
                    result[i] = cosine_similarity(v1, v2)
                else:
                    result[i] = 0

            elif variable_name == "l2_distance_between_titles":
                dst = np.linalg.norm(
                    self.node_information.loc[t[0], 'title_wv'] -
                    self.node_information.loc[t[1], 'title_wv'])
                if np.isnan(dst):
                    result[i] = 0
                else:
                    result[i] = dst

            # elif variable_name == "cosine_distance_between_titles":
            #     result[i] = cosine_distances(
            #         np.nan_to_num(self.node_information.loc[t[0], 'title_wv']).reshape(-1, 1) - (self.node_information.loc[t[1], 'title_wv']).reshape(-1, 1)
            #     )[0][0]

            elif variable_name == "common_neighbors":
                result[i] = len(
                    sorted(
                        nx.common_neighbors(self.graph_structure.g, t[0],
                                            t[1])))

            elif variable_name == "clustering_coeff":
                result[i] = gd["clustering_coeff"][
                    t[0]] * gd["clustering_coeff"][t[1]]

            elif variable_name == "betweenness":
                result[i] = gd["betweenness"][t[0]] * gd["betweenness"][t[1]]

            elif variable_name == "closeness":
                result[i] = gd["closeness"][t[0]] * gd["closeness"][t[1]]

            elif variable_name == "degree":
                result[i] = gd["degree"][t[0]] * gd["degree"][t[1]]

            elif variable_name == "eigenvector":
                result[i] = gd["eigenvector"][t[0]] * gd["eigenvector"][t[1]]

            elif variable_name == "jaccard_coeff":
                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = next(
                            nx.jaccard_coefficient(self.graph_structure.g,
                                                   [(t[0], t[1])]))[2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = next(
                            nx.jaccard_coefficient(self.graph_structure.g,
                                                   [(t[0], t[1])]))[2]
                else:
                    result[i] = next(
                        nx.jaccard_coefficient(self.graph_structure.g,
                                               [(t[0], t[1])]))[2]
            elif variable_name == "shortest_path":
                if train:
                    if t[2] == 1:
                        assert self.graph_structure.g.has_edge(
                            t[0], t[1]
                        ), "There's a problem with the structure of the graph for id %i and %i" % (
                            t[0], t[1])
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        try:
                            result[
                                i] = 1 / nx.algorithms.shortest_paths.generic.shortest_path_length(
                                    self.graph_structure.g, t[0], t[1])
                        except nx.NetworkXNoPath:
                            result[i] = 0
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        try:
                            result[
                                i] = 1 / nx.algorithms.shortest_paths.generic.shortest_path_length(
                                    self.graph_structure.g, t[0], t[1])
                        except nx.NetworkXNoPath:
                            result[i] = 0
                else:
                    try:
                        result[
                            i] = 1 / nx.algorithms.shortest_paths.generic.shortest_path_length(
                                self.graph_structure.g, t[0], t[1])
                    except nx.NetworkXNoPath:
                        result[i] = 0

            elif variable_name == "pagerank":
                result[i] = gd["pagerank"][t[0]] * gd["pagerank"][t[1]]

            elif variable_name == "community":
                if self.graph_structure.partition[
                        t[0]] == self.graph_structure.partition[t[1]]:
                    result[i] = 1
                else:
                    result[i] = 0

            elif variable_name == "lp_resource_allocation_index":
                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = sorted(
                            nx.resource_allocation_index(
                                self.graph_structure.g, [(t[0], t[1])]))[0][2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = sorted(
                            nx.resource_allocation_index(
                                self.graph_structure.g, [(t[0], t[1])]))[0][2]
                else:
                    result[i] = sorted(
                        nx.resource_allocation_index(self.graph_structure.g,
                                                     [(t[0], t[1])]))[0][2]

            elif variable_name == "lp_preferential_attachment":
                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = sorted(
                            nx.preferential_attachment(self.graph_structure.g,
                                                       [(t[0], t[1])]))[0][2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = sorted(
                            nx.preferential_attachment(self.graph_structure.g,
                                                       [(t[0], t[1])]))[0][2]
                else:
                    result[i] = sorted(
                        nx.preferential_attachment(self.graph_structure.g,
                                                   [(t[0], t[1])]))[0][2]
            elif variable_name == "lp_cn_soundarajan":
                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = sorted(
                            nx.cn_soundarajan_hopcroft(self.graph_structure.g,
                                                       [(t[0], t[1])]))[0][2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = sorted(
                            nx.cn_soundarajan_hopcroft(self.graph_structure.g,
                                                       [(t[0], t[1])]))[0][2]
                else:
                    result[i] = sorted(
                        nx.cn_soundarajan_hopcroft(self.graph_structure.g,
                                                   [(t[0], t[1])]))[0][2]
            elif variable_name == "lp_ra_index_soundarajan":
                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = sorted(
                            nx.ra_index_soundarajan_hopcroft(
                                self.graph_structure.g, [(t[0], t[1])]))[0][2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = sorted(
                            nx.ra_index_soundarajan_hopcroft(
                                self.graph_structure.g, [(t[0], t[1])]))[0][2]
                else:
                    result[i] = sorted(
                        nx.ra_index_soundarajan_hopcroft(
                            self.graph_structure.g, [(t[0], t[1])]))[0][2]

            elif variable_name == "lp_within_inter_cluster":

                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = sorted(
                            nx.within_inter_cluster(self.graph_structure.g,
                                                    [(t[0], t[1])]))[0][2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = sorted(
                            nx.within_inter_cluster(self.graph_structure.g,
                                                    [(t[0], t[1])]))[0][2]
                else:
                    result[i] = sorted(
                        nx.within_inter_cluster(self.graph_structure.g,
                                                [(t[0], t[1])]))[0][2]

        print("Did %s column in %5.1fs" % (variable_name, time() - t1))
        if save and train:
            print("Saved variable %s in `variables` directory." %
                  variable_name)
            np.save("variables/" + variable_name, result)
        if save and not train:
            np.save("variables/TEST_" + variable_name, result)
            print("Saved variable TEST_%s in `variables` directory." %
                  variable_name)
        if np.isnan(result).any():
            print("Careful, you have nan values !")
            result[np.isnan(result)] = 0
        return result
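A hedged call sketch for compute_variable above; the feature-builder instance `fb` is an assumption, since only the method itself appears in the source:

# First call computes and saves variables/jaccard_coeff.npy;
# later calls with load=True reuse the saved array.
jaccard_train = fb.compute_variable("jaccard_coeff", train=True)
jaccard_test = fb.compute_variable("jaccard_coeff", train=False)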
Example 16
 positive_predictions_proba_slp_PageRank = []
 lenedg = len(pedges)
 cntr = 0
 for edge in pedges:
     cntr += 1
     print("\r   {}/{}".format(cntr, lenedg), end="")
     positive_predictions_proba_jcc.append(
         list(nx.jaccard_coefficient(G, [edge]))[0][2])
     positive_predictions_proba_ra.append(
         list(nx.resource_allocation_index(G, [edge]))[0][2])
     positive_predictions_proba_aa.append(
         list(nx.adamic_adar_index(G, [edge]))[0][2])
     positive_predictions_proba_pa.append(
         list(nx.preferential_attachment(G, [edge]))[0][2])
     positive_predictions_proba_cnsh.append(
         list(nx.cn_soundarajan_hopcroft(
             G, [edge]))[0][2])  # needs community information
     positive_predictions_proba_rash.append(
         list(nx.ra_index_soundarajan_hopcroft(
             G, [edge]))[0][2])  # needs community information
     positive_predictions_proba_wic.append(
         list(nx.within_inter_cluster(
             G, [edge]))[0][2])  # needs community information
     positive_predictions_proba_slp_DegCent.append(
         list(SLP_prediction(G, [edge], centrality="DegCent"))[0][2])
     positive_predictions_proba_slp_EigenCent.append(
         list(SLP_prediction(G, [edge], centrality="EigenCent"))[0][2])
     positive_predictions_proba_slp_ClosenessCent.append(
         list(SLP_prediction(G, [edge], centrality="ClosenessCent"))[0][2])
     positive_predictions_proba_slp_BetweenCent.append(
         list(SLP_prediction(G, [edge], centrality="BetweenCent"))[0][2])
     positive_predictions_proba_slp_PageRank.append(
Example 17
 def cn_soundarajan_hopcroft(uG, ni, nj, rand_node):
     a, b = nx.cn_soundarajan_hopcroft(uG, [(ni, nj), (ni, rand_node)])
     return a[2], b[2]
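A hedged call sketch for the two-pair helper above; it scores a real candidate pair alongside a random-node pair (everything here is illustrative):

import random
import networkx as nx

uG = nx.path_graph(5)
for n in uG.nodes():
    uG.node[n]['community'] = 0     # toy single-community labels
rand_node = random.choice(list(uG.nodes()))
true_score, rand_score = cn_soundarajan_hopcroft(uG, 0, 2, rand_node)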
Example 18
nx.draw_networkx_labels(G,
                        pos,
                        labels={u: u
                                for t in candidate_edges for u in t},
                        font_size=13,
                        font_weight='bold',
                        font_color='yellow')

plt.axis('off')
plt.tight_layout()
plt.show()

# Create a data frame to store various centrality measures.
df = pd.DataFrame(index=candidate_edges)

# Add generic and community-aware edge features for potential machine-learning classification.
df['pref-att'] = list(
    map(operator.itemgetter(2), nx.preferential_attachment(G,
                                                           candidate_edges)))
df['jaccard-c'] = list(
    map(operator.itemgetter(2), nx.jaccard_coefficient(G, candidate_edges)))
df['aa-idx'] = list(
    map(operator.itemgetter(2), nx.adamic_adar_index(G, candidate_edges)))
df['ccn'] = list(
    map(operator.itemgetter(2),
        nx.cn_soundarajan_hopcroft(G, candidate_edges, 'club')))
df['cra'] = list(
    map(operator.itemgetter(2),
        nx.ra_index_soundarajan_hopcroft(G, candidate_edges, 'club')))

print(df)
Example 19
# Resource Allocation (sum of the fractions of a "resource" the end node receives from common neighbors, based on their degrees)
ra = list(nx.resource_allocation_index(g))
# Adamic-Adar Index (Resource Allocation with the log of the degrees)
aa = list(nx.adamic_adar_index(g))
# Preferential Attachment (product of the nodes' degrees)
pa = list(nx.preferential_attachment(g))
# Community Common Neighbors (with a bonus for neighbors in the same community)
g.nodes[0]['community'] = 0
g.nodes[1]['community'] = 1
g.nodes[2]['community'] = 0
g.nodes[3]['community'] = 1
g.nodes[4]['community'] = 1
g.nodes[5]['community'] = 0
g.nodes[6]['community'] = 1
g.nodes[7]['community'] = 1
g.nodes[8]['community'] = 0
g.nodes[9]['community'] = 0
ccn = list(nx.cn_soundarajan_hopcroft(g))
# Community Resource Allocation (only counts common neighbors in the same community)
g.nodes[0]['community'] = 0
g.nodes[1]['community'] = 1
g.nodes[2]['community'] = 0
g.nodes[3]['community'] = 1
g.nodes[4]['community'] = 1
g.nodes[5]['community'] = 0
g.nodes[6]['community'] = 1
g.nodes[7]['community'] = 1
g.nodes[8]['community'] = 0
g.nodes[9]['community'] = 0
cra = list(nx.ra_index_soundarajan_hopcroft(g))
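The per-node assignments above can be written in one call; a sketch using nx.set_node_attributes with the same labels:

import networkx as nx

communities = {0: 0, 1: 1, 2: 0, 3: 1, 4: 1, 5: 0, 6: 1, 7: 1, 8: 0, 9: 0}
nx.set_node_attributes(g, name='community', values=communities)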
Example 20
# Community Common Neighbors
# Number of common neighbors, with a bonus of 1 for each common neighbor in the same community
# f(u) = 1 if u is in the same community else 0
# score = number of common neighbors + sum(f(u)) over common neighbors u
G = nx.newman_watts_strogatz_graph(9, 5, 0.1)
G.nodes()
G.node[0]['community'] = 0
G.node[1]['community'] = 0
G.node[2]['community'] = 0
G.node[3]['community'] = 0
G.node[4]['community'] = 1
G.node[5]['community'] = 1
G.node[6]['community'] = 1
G.node[7]['community'] = 1
G.node[8]['community'] = 1
L = list(nx.cn_soundarajan_hopcroft(G))
L.sort(key=operator.itemgetter(2), reverse=True); L

# Measure 7:
# Community Resource Allocation
# Similar to the resource allocation index, but only considering common neighbors in the same community
# f(u) = 1 if u is in the same community else 0
# score = sum(f(u) / degree(u)) over common neighbors u
L = list(nx.ra_index_soundarajan_hopcroft(G))
L.sort(key=operator.itemgetter(2), reverse=True); L

# Summary
# • Link prediction problem: given a network, predict which edges will be formed in the future.
# • 5 basic measures:
#   – Number of Common Neighbors
#   – Jaccard Coefficient
#   – Resource Allocation Index
Example 21
def new_connections_predictions():
    import operator
    # Import preprocessing, selection and metrics
    from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV
    from sklearn.metrics import roc_auc_score
    from sklearn.dummy import DummyClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC, LinearSVC

    # Your Code Here
    df_fc_test_mask = pd.isnull(future_connections.loc[:, 'Future Connection'])
    df = pd.DataFrame()

    # Measure 1: Common Neighbors (intersection)
    # The number of common neighbors of nodes X and Y
    #     future_connections['common_neigh']
    L = [(e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1]))))
         for e in nx.non_edges(G)]
    df['pair'] = [(x, y) for x, y, z in L]
    df['common_nb'] = [z for x, y, z in L]
    #     L.sort(key=operator.itemgetter(2), reverse=True)
    #     print(L)

    # Measure 2: Jaccard Coefficient (intersection over union)
    # Number of common neighbors normalized by the total number of neighbors
    # common_neighbors/total_neighbors
    #     future_connections['jaccard']
    df['jaccard'] = pd.Series([z for x, y, z in nx.jaccard_coefficient(G)])
    #     L.sort(key=operator.itemgetter(2), reverse=True)
    #     print(L)

    # Measure 3: Resource Allocation
    # Fraction of a "resource" that a node can send to another through their common neighbors
    # sum(1/degree_common_neighbor)
    df['resource'] = pd.Series([z for x, y, z in nx.resource_allocation_index(G)])
    #     L.sort(key=operator.itemgetter(2), reverse=True)
    #     print(L)

    # Measure 4:
    # Adamic Adar Index
    # Similar to resource allocation index, but with log in the denominator
    # sum(1/log(degree_common_neighbor))
    future_connections['adamic_adar'] = pd.Series([z for x, y, z in nx.adamic_adar_index(G)])
    #     L.sort(key=operator.itemgetter(2), reverse=True)
    #     print(L)

    # Measure 5:
    # Preferential Attachment
    # In the preferential attachment model, nodes with high degree get more neighbors
    # degree_source * degree_target
    future_connections['pref_att'] = pd.Series([z for x, y, z in nx.preferential_attachment(G)])
    #     print(L)

    # Measure 6:
    # Community Common Neighbors
    # Number of common neighbors, with a bonus of 1 for each common neighbor in the same community
    # f(u) = 1 if u is in the same community else 0
    # score = number of common neighbors + sum(f(u)) over common neighbors u
    for node, dept in nx.get_node_attributes(G, 'Department').items():
        G.node[node]['community'] = dept
    future_connections['com_common_nb'] = pd.Series([z for x, y, z in nx.cn_soundarajan_hopcroft(G)])
    #     L.sort(key=operator.itemgetter(2), reverse=True)
    #     print(L)

    # Measure 7:
    # Community Resource Allocation
    # Similar to the resource allocation index, but only considering common neighbors in the same community
    # f(u) = 1 if u is in the same community else 0
    # score = sum(f(u) / degree(u)) over common neighbors u
    future_connections['com_resource'] = pd.Series([z for x, y, z in nx.ra_index_soundarajan_hopcroft(G)])
    #     L.sort(key=operator.itemgetter(2), reverse=True)
    #     print(L)

    print(df.head())

    #     #
    #     df_fc_train = future_connections.loc[~df_fc_test_mask, :]
    #     df_fc_test = future_connections.loc[df_fc_test_mask, :]
    #     y_train = df_fc_train.loc[:, 'Future Connection']
    #     y_test = df_fc_test.loc[:, 'Future Connection']
    #     X_train = df_fc_train.index
    #     X_test = df_fc_test.index

    #     def auc_scores(model, *args, k=5, threshold=0.50):
    #         """CV scores"""
    #         X, y = args
    #         predictions = cross_val_predict(model, X, y, cv=k, n_jobs=-1)
    #         print('AUC - Test predict  {:.2%}'.format(roc_auc_score(y, predictions)))

    #     classifiers = [
    # #         GaussianNB(),
    # #         DecisionTreeClassifier(random_state=0),
    # #         DecisionTreeClassifier(max_depth=3, random_state=0),
    # #         DecisionTreeClassifier(max_depth=4, random_state=0),
    # #         DecisionTreeClassifier(max_depth=5, random_state=0),
    # #         DecisionTreeClassifier(max_depth=6, random_state=0),
    #         GradientBoostingClassifier(random_state=0),
    # #         GradientBoostingClassifier(learning_rate=0.08, random_state=0),
    # #         GradientBoostingClassifier(learning_rate=0.12, random_state=0),
    # #         GradientBoostingClassifier(learning_rate=0.1, max_depth=3, random_state=0),
    # #         GradientBoostingClassifier(learning_rate=0.1, max_depth=4, random_state=0),
    # #         RandomForestClassifier(n_estimators=100, random_state=0),
    # #         AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=0),
    # #         KNeighborsClassifier(),
    # #         KNeighborsClassifier(n_neighbors=4),
    # #         LinearSVC(random_state=0)
    #         ]

    #     for model in classifiers:
    # #         print('-'*80)
    # #         print(model)

    #         # Training scores
    # #         clf_train = model.fit(X_train, y_train)
    # #         pred_train = clf_train.predict(X_train)
    # #         print('AUC - Train pred    {:.2%}'.format(roc_auc_score(y_train, pred_train)))

    #         # CV scores
    #         clf = model.fit(X_train, y_train)
    # #         auc_scores(clf, X_train, y_train)

    #     # Predict
    #     predicted = clf.predict(X_test)
    #     pred_series = pd.Series(predicted)
    #     assert type(pred_series) == pd.Series, 'wtf: ' + str(type(pred_series))

    return pred_series  # NOTE: pred_series is only defined inside the commented-out block above
Example 22

#%%
G = nx.complete_graph(5)
preds = nx.resource_allocation_index(G, [(0, 1), (2, 3)])
for u, v, p in preds:
    print('(%d, %d) -> %.8f' % (u, v, p))

#%%
import networkx as nx
G = nx.path_graph(3)
nx.draw_networkx(G)
G.node[0]['community'] = 0
G.node[1]['community'] = 0
G.node[2]['community'] = 0
preds = nx.cn_soundarajan_hopcroft(G, [(0, 2)])
for u, v, p in preds:
    print('(%d, %d) -> %d' % (u, v, p))

#%%
import networkx as nx
G = nx.Graph()
G.add_edges_from([(0, 1), (0, 2), (1, 3), (2, 3)])
G.node[0]['community'] = 0
G.node[1]['community'] = 0
G.node[2]['community'] = 1
G.node[3]['community'] = 0
nx.draw_networkx(G)
preds = nx.ra_index_soundarajan_hopcroft(G, [(0, 3)])
for u, v, p in preds:
    print('(%d, %d) -> %.8f' % (u, v, p))
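Example 22 covers three of the community-aware measures; within_inter_cluster, used in other examples here, fits the same cell pattern (toy graph reused from the cell above; delta left at its default):

#%%
import networkx as nx
G = nx.Graph()
G.add_edges_from([(0, 1), (0, 2), (1, 3), (2, 3)])
G.node[0]['community'] = 0
G.node[1]['community'] = 0
G.node[2]['community'] = 1
G.node[3]['community'] = 0
preds = nx.within_inter_cluster(G, [(0, 3)])
for u, v, p in preds:
    # one within-community common neighbor (node 1) vs one inter-community (node 2)
    print('(%d, %d) -> %.8f' % (u, v, p))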
Example 23
import networkx as nx
import operator

G = nx.Graph()

G.add_edges_from([('A', 'C'), ('A', 'D'), ('A', 'E'), ('B', 'D'), ('G', 'C'),
                  ('D', 'E'), ('D', 'G'), ('D', 'H'), ('E', 'H'), ('F', 'H')])

G.node['A']['community'] = 0
G.node['B']['community'] = 0
G.node['C']['community'] = 0
G.node['D']['community'] = 0
G.node['E']['community'] = 1
G.node['F']['community'] = 1
G.node['G']['community'] = 0
G.node['H']['community'] = 1

print(list(nx.cn_soundarajan_hopcroft(G)))
def get_community_common(G):
    cc = list(nx.cn_soundarajan_hopcroft(G))
    cc.sort(key=operator.itemgetter(2), reverse=True)
    return cc
Example 25
#         (107, 348)    0.35
#         (542, 751)    0.40
#         (20, 426)     0.55
#         (50, 989)     0.35
#                   ...
#         (939, 940)    0.15
#         (555, 905)    0.35
#         (75, 101)     0.65
#         Length: 122112, dtype: float64

# In[117]:

resource_allocation = list(nx.resource_allocation_index(G))
adamic_adar_index = list(nx.adamic_adar_index(G))
preferential_attachment = list(nx.preferential_attachment(G))
cn_soundarajan_hopcroft = list(nx.cn_soundarajan_hopcroft(G))

# In[ ]:


def train_test_data():
    train = future_connections[
        ~pd.isnull(future_connections['Future Connection'])]
    test = future_connections[
        pd.isnull(future_connections['Future Connection'])]

    pa = pd.DataFrame(data=preferential_attachment)
    pa.index = [pa[0], pa[1]]
    pa = pa.drop(columns=[0, 1])
    #     for index, row in train.iterrows():
    #         print("{} {} ".format(index, pa.loc[index[0], index[1]][2]))
Example 26
def get_features(L, flag):
    X = [[] for i in range(len(L))]

    #=====================Social features (user-to-user graph)======================

    #g0.adamic adar score
    if flag['g0'] is True:
        print("get feature g0")
        preds = nx.adamic_adar_index(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1

    #g1.jaccard coefficient
    if flag['g1'] is True:
        print("get feature g1")
        preds = nx.jaccard_coefficient(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1
    #g2.resource_allocation
    if flag['g2'] is True:
        print("get feature g2")
        preds = nx.resource_allocation_index(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1

    #g3.preferential_attachment
    if flag['g3'] is True:
        print("get feature g3")
        preds = nx.preferential_attachment(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1

    #g4.shortest path length
    if flag['g4'] is True:
        print("get feature g4")
        cnt = 0
        for (u, v) in L:
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                if nx.has_path(G, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(G, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
                G.add_edge(u, v)
            else:
                if nx.has_path(G, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(G, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
            cnt += 1

    #g5.common neighbors
    if flag['g5'] is True:
        print("get feature g5")
        cnt = 0
        for (u, v) in L:
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                T = [w for w in nx.common_neighbors(G, u, v)]
                G.add_edge(u, v)
            else:
                T = [w for w in nx.common_neighbors(G, u, v)]
            X[cnt].append(len(T))
            cnt += 1

    #g6.Approximate Katz for social graph
    if flag['g6'] is True:
        print("get feature g6")
        cnt = 0
        for (u, v) in L:
            p = 0
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                for x in G.neighbors(u):
                    for y in G.neighbors(v):
                        if x == y or G.has_edge(x, y):
                            p += 1
                G.add_edge(u, v)
            else:
                for x in G.neighbors(u):
                    for y in G.neighbors(v):
                        if x == y or G.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1

    if flag['g7'] is True:
        print("get feature g7")
        cnt = 0
        with open("best_part_G.txt", "r") as f:
            for line in f:
                v, c = line.split()
                c = int(c)
                G.node[v]['community'] = c
        iters = nx.cn_soundarajan_hopcroft(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1

    if flag['g8'] is True:
        print("get feature g8")
        cnt = 0
        with open("best_part_G.txt", "r") as f:
            for line in f:
                if line == "":
                    continue
                v, c = line.split()
                c = int(c)
                G.node[v]['community'] = c
        iters = nx.ra_index_soundarajan_hopcroft(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1

    if flag['g9'] is True:
        print("get feature g9")
        cnt = 0
        with open("best_part_G.txt", "r") as f:
            for line in f:
                v, c = line.split()
                c = int(c)
                G.node[v]['community'] = c
        iters = nx.within_inter_cluster(G, L, delta=0.5)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1

    if flag['g10'] is True:
        print("get feature g10")
        cnt = 0
        with open("dendo_G.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.cn_soundarajan_hopcroft(G, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    G.node[v]['community'] = c
        iters = nx.cn_soundarajan_hopcroft(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds

    if flag['g11'] is True:
        print("get feature g11")
        cnt = 0
        with open("dendo_G.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.ra_index_soundarajan_hopcroft(G, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    G.node[v]['community'] = c
        iters = nx.ra_index_soundarajan_hopcroft(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds

    if flag['g12'] is True:
        print("get feature g12")
        cnt = 0
        with open("dendo_G.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.within_inter_cluster(G, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    G.node[v]['community'] = c
        iters = nx.within_inter_cluster(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds
    #=========================checkin features=========================================
    #c0.follower number
    if flag['c0'] is True:
        print("get feature c0")
        cnt = 0
        for (u, v) in L:
            X[cnt].append(U[u]['follow_cnt'] * U[v]['follow_cnt'])  # fu*fv
            cnt += 1

    #c1.same time same location
    if flag['c1'] is True:
        print("get feature c1")
        cnt = 0
        for (u, v) in L:
            p = calculate_CCC(G, u, v)
            X[cnt].append(p)
            cnt += 1

    #c2.same time same distinct spot
    if flag['c2'] is True:
        print("get deature c2")
        cnt = 0
        for (u, v) in L:
            p = 0
            dis_same_spot = []
            for k in C[u]:
                if k[1] not in dis_same_spot and k in C[v]:
                    dis_same_spot.append(k[1])
                    p += 1
            X[cnt].append(p)
            cnt += 1

    #c3.same distinct spot (not necessarily same time)
    if flag['c3'] is True:
        cnt = 0
        print("get feature c3")
        for (u, v) in L:
            p = 0
            dis_same_spot = []
            for k in C[u]:
                if k[1] not in dis_same_spot:
                    for m in C[v]:
                        if k[1] == m[1]:
                            dis_same_spot.append(k[1])
                            p += 1
                            break
            X[cnt].append(p)
            cnt += 1

    #c4.min Entropy
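    #   (S[spot]['entropy'] is assumed to be the precomputed entropy of the
    #   check-in distribution at that spot; taking the minimum over co-visited
    #   spots rewards pairs that meet at low-entropy, i.e. less public, places.)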
    if flag['c4'] is True:
        print("get feature c4")
        cnt = 0
        for (u, v) in L:
            p = 0
            E_list = []
            for k in C[u]:
                if k in C[v]:
                    spot = k[1]
                    if spot in S and S[spot]['entropy'] > 0:
                        E_list.append(S[spot]['entropy'])
            if len(E_list) > 0:
                p = min(E_list)
            X[cnt].append(p)
            cnt += 1

    #c5. distance of mean_LL
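    #   (Plain Euclidean distance between the users' mean lat/lon coordinates;
    #   a planar approximation rather than great-circle distance.)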
    if flag['c5'] is True:
        cnt = 0
        print("get feature c5")
        for (u, v) in L:
            dist = np.sqrt((U[u]['mean_LL'][0] - U[v]['mean_LL'][0])**2 +
                           (U[u]['mean_LL'][1] - U[v]['mean_LL'][1])**2)
            X[cnt].append(dist)
            cnt += 1

    #c6.weighted same location
    if flag['c6'] is True:
        print("get feature c6")
        cnt = 0
        for (u, v) in L:
            p = 0
            for k in C[u]:
                if k in C[v]:
                    spot = k[1]
                    #if spot in S and S[spot]['entropy'] > 0:
                    #p += 1/S[spot]['entropy']
                    if spot in S:
                        dist = np.sqrt(
                            (S[spot]['LL'][0] - U[u]['mean_LL'][0])**2 +
                            (S[spot]['LL'][1] - U[u]['mean_LL'][1])**2)
                        p += dist
                        dist = np.sqrt(
                            (S[spot]['LL'][0] - U[v]['mean_LL'][0])**2 +
                            (S[spot]['LL'][1] - U[v]['mean_LL'][1])**2)
                        p += dist
            X[cnt].append(p)
            cnt += 1

    #c7. PP: product of the two users' check-in counts
    if flag['c7'] is True:
        print("get feature c7")
        cnt = 0
        for (u, v) in L:
            p = len(C[u]) * len(C[v])
            X[cnt].append(p)
            cnt += 1

    #c8.Total Common Friend Closeness (TCFC)
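    #   (For each common friend w, the strengths of the u-w and v-w ties are
    #   approximated by their common-neighbor counts and multiplied; c9 below
    #   is the same construction with check-in co-occurrence via calculate_CCC.)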
    if flag['c8'] is True:
        print("get feature c8")
        cnt = 0
        for (u, v) in L:
            p = 0
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                for w in nx.common_neighbors(G, u, v):
                    T1 = [x for x in nx.common_neighbors(G, u, w)]
                    T2 = [x for x in nx.common_neighbors(G, v, w)]
                    p += len(T1) * len(T2)
                G.add_edge(u, v)
            else:
                for w in nx.common_neighbors(G, u, v):
                    T1 = [x for x in nx.common_neighbors(G, u, w)]
                    T2 = [x for x in nx.common_neighbors(G, v, w)]
                    p += len(T1) * len(T2)
            X[cnt].append(p)
            cnt += 1

    #c9.Total Common friend Checkin Count (TCFCC)
    if flag['c9'] is True:
        print("get feature c9")
        cnt = 0
        for (u, v) in L:
            p = 0
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                for w in nx.common_neighbors(G, u, v):
                    p += calculate_CCC(G, u, w) * calculate_CCC(G, v, w)
                G.add_edge(u, v)
            else:
                for w in nx.common_neighbors(G, u, v):
                    p += calculate_CCC(G, u, w) * calculate_CCC(G, v, w)
            X[cnt].append(p)
            cnt += 1

    #c10. Common Category Checkin Counts Product (CCCP)
    if flag['c10'] is True:
        print("get feature c10")
        cnt = 0
        for (u, v) in L:
            p = 0
            for cat in U[u]['cate']:
                if cat in U[v]['cate']:
                    p += U[u]['cate'][cat] * U[v]['cate'][cat]
            X[cnt].append(p)
            cnt += 1

    #c11. Common Category Checkin Counts Product Ratio(CCCPR)
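    #   (Equivalent to the cosine similarity between the two users' category
    #   check-in count vectors.)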
    if flag['c11'] is True:
        print("get feature c11")
        cnt = 0
        for (u, v) in L:
            p = 0
            u_cate_total = sum(U[u]['cate'][cat]**2 for cat in U[u]['cate'])
            v_cate_total = sum(U[v]['cate'][cat]**2 for cat in U[v]['cate'])
            for cat in U[u]['cate']:
                if cat in U[v]['cate']:
                    p += (U[u]['cate'][cat] * U[v]['cate'][cat] /
                          np.sqrt(u_cate_total * v_cate_total))
            X[cnt].append(p)
            cnt += 1

    #c12. trip route length all
    if flag['c12'] is True:
        print("get feature c12")
        cnt = 0
        for (u, v) in L:
            tripLen = 0.0
            lastSpot = [0.0, 0.0]
            for k in C[u]:
                if not (lastSpot[0] == 0.0 and lastSpot[1] == 0.0):
                    if k[1] in S:
                        tripLen += np.sqrt((lastSpot[0] -
                                            S[k[1]]['LL'][0])**2 +
                                           (lastSpot[1] - S[k[1]]['LL'][1])**2)
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
                else:
                    if k[1] in S:
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
            tripDay = "starting"
            tripLen2 = 0.0
            lastSpot = [0.0, 0.0]
            for k in C[v]:
                if not (lastSpot[0] == 0.0 and lastSpot[1] == 0.0):
                    if k[1] in S:
                        tripLen2 += np.sqrt(
                            (lastSpot[0] - S[k[1]]['LL'][0])**2 +
                            (lastSpot[1] - S[k[1]]['LL'][1])**2)
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
                else:
                    if k[1] in S:
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
            X[cnt].append(tripLen + tripLen2)
            cnt += 1

    #=========================Heter Graph features=====================================

    #h0.Approximate katz for bipartite graph
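    #   (Assuming B is a user-location bipartite graph, the x == y case counts
    #   locations that both u and v have checked into.)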
    if flag['h0'] is True:
        print("get feature h0")
        cnt = 0
        for (u, v) in L:
            p = 0
            for x in B.neighbors(u):
                for y in B.neighbors(v):
                    if x == y or B.has_edge(x, y):
                        p += 1
            X[cnt].append(p)
            cnt += 1

    #h1.Approximate katz on HB
    if flag['h1'] is True:
        print("get feature h1")
        cnt = 0
        for (u, v) in L:
            p = 0
            if HB.has_edge(u, v):
                HB.remove_edge(u, v)
                for x in HB.neighbors(u):
                    for y in HB.neighbors(v):
                        if x == y or HB.has_edge(x, y):
                            p += 1
                HB.add_edge(u, v)
            else:
                for x in HB.neighbors(u):
                    for y in HB.neighbors(v):
                        if x == y or HB.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1

    #h2.Approximate katz on H
    if flag['h2'] is True:
        print("get feature h2")
        cnt = 0
        for (u, v) in L:
            p = 0
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                for x in H.neighbors(u):
                    for y in H.neighbors(v):
                        if x == y or H.has_edge(x, y):
                            p += 1
                H.add_edge(u, v)
            else:
                for x in H.neighbors(u):
                    for y in H.neighbors(v):
                        if x == y or H.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1

    #h3.shortest path length on B
    if flag['h3'] is True:
        print("get feature h3")
        cnt = 0
        for (u, v) in L:
            if nx.has_path(B, u, v):
                X[cnt].append(
                    nx.shortest_path_length(B, source=u, target=v) / 50000)
            else:
                X[cnt].append(1)
            cnt += 1

    #h4. clustering coefficient on H
    if flag['h4'] is True:
        print("get feature h4")
        cnt = 0
        for (u, v) in L:
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                p = nx.clustering(H, u) * nx.clustering(H, v)
                H.add_edge(u, v)
            else:
                p = nx.clustering(H, u) * nx.clustering(H, v)
            X[cnt].append(p)
            cnt += 1

    #h5. number of each user's location-friends' location-friends
    if flag['h5'] is True:
        print("get feature h5")
        cnt = 0
        for (u, v) in L:
            counter1 = 0
            for neighbor in H.neighbors(u):
                if not neighbor.isnumeric():
                    for neighbor2 in H.neighbors(neighbor):
                        if not neighbor2.isnumeric():
                            counter1 += 1
            counter2 = 0
            for neighbor in H.neighbors(v):
                if not neighbor.isnumeric():
                    for neighbor2 in H.neighbors(neighbor):
                        if not neighbor2.isnumeric():
                            counter2 += 1

            #print(str(counter1)+" "+str(counter2)+"\n")
            X[cnt].append(counter1 * counter2)
            cnt += 1

    #h6. location friends' degree sum
    if flag['h6'] is True:
        print("get feature h6")
        cnt = 0
        for (u, v) in L:
            counter1 = 0
            for locationNeighbor in H.neighbors(u):
                if not locationNeighbor.isnumeric():
                    #print(str(locationNeighbor)+"\n")
                    if locationNeighbor in LG:
                        counter1 += LG.degree(locationNeighbor)

            counter2 = 0
            for locationNeighbor in H.neighbors(v):
                if not locationNeighbor.isnumeric():
                    if locationNeighbor in LG:
                        counter2 += LG.degree(locationNeighbor)
            X[cnt].append(counter1 * counter2)
            cnt += 1

    #h7. Approximate katz on H restricted to location neighbors
    if flag['h7'] is True:
        print("get feature h7")
        cnt = 0
        for (u, v) in L:
            counter = 0
            for node in H.neighbors(u):
                if not node.isnumeric():
                    for node2 in H.neighbors(v):
                        if not node2.isnumeric():
                            if node == node2 or H.has_edge(node, node2):
                                counter += 1
            X[cnt].append(counter)
            cnt += 1

    #h8. adamic adar score on H
    if flag['h8'] is True:
        print("get feature h8")
        preds = nx.adamic_adar_index(H, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1
    #h9. resource_allocation on H
    if flag['h9'] is True:
        print("get feature h9")
        preds = nx.resource_allocation_index(H, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1

    #h10. shortest path length on H
    if flag['h10'] is True:
        print("get feature h10")
        cnt = 0
        for (u, v) in L:
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                if nx.has_path(H, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(H, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
                H.add_edge(u, v)
            else:
                if nx.has_path(H, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(H, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
            cnt += 1
    #h11. common neighbors on H
    if flag['h11'] is True:
        print("get feature h11")
        cnt = 0
        for (u, v) in L:
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                T = [w for w in nx.common_neighbors(H, u, v)]
                H.add_edge(u, v)
            else:
                T = [w for w in nx.common_neighbors(H, u, v)]
            X[cnt].append(len(T))
            cnt += 1

    #h12. Approximate katz on H (same computation as h2)
    if flag['h12'] is True:
        print("get feature h12")
        cnt = 0
        for (u, v) in L:
            p = 0
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                for x in H.neighbors(u):
                    for y in H.neighbors(v):
                        if x == y or H.has_edge(x, y):
                            p += 1
                H.add_edge(u, v)
            else:
                for x in H.neighbors(u):
                    for y in H.neighbors(v):
                        if x == y or H.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1

    if flag['h13'] is True:
        print("get feature h13")
        cnt = 0
        with open("best_part_HB.txt", "r") as f:
            for line in f:
                v, c = line.split()
                c = int(c)
                HB.node[v]['community'] = c
        iters = nx.cn_soundarajan_hopcroft(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1

    if flag['h14'] is True:
        print("get feature h14")
        cnt = 0
        with open("best_part_HB.txt", "r") as f:
            for line in f:
                if line == "":
                    continue
                v, c = line.split()
                c = int(c)
                HB.node[v]['community'] = c
        iters = nx.ra_index_soundarajan_hopcroft(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1

    if flag['h15'] is True:
        print("get feature h15")
        cnt = 0
        with open("best_part_HB.txt", "r") as f:
            for line in f:
                v, c = line.split()
                c = int(c)
                HB.node[v]['community'] = c
        iters = nx.within_inter_cluster(HB, L, delta=0.5)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1

    if flag['h16'] is True:
        print("get feature h16")
        cnt = 0
        with open("dendo_HB.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.cn_soundarajan_hopcroft(HB, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    HB.node[v]['community'] = c
        iters = nx.cn_soundarajan_hopcroft(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds

    if flag['h17'] is True:
        print("get feature h17")
        cnt = 0
        with open("dendo_HB.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.ra_index_soundarajan_hopcroft(HB, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    HB.node[v]['community'] = c
        iters = nx.ra_index_soundarajan_hopcroft(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds

    if flag['h18'] is True:
        print("get feature h18")
        cnt = 0
        with open("dendo_HB.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.within_inter_cluster(HB, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    HB.node[v]['community'] = c
        iters = nx.within_inter_cluster(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds

    return X
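
A minimal standalone sketch of the prerequisite that the community-aware blocks
above (g7-g9, h13-h15) rely on: the Soundarajan-Hopcroft and within/inter
cluster scores raise an exception unless every node carries the 'community'
attribute. The toy graph and labels here are illustrative only, and the
G.nodes[n] accessor assumes networkx >= 2.0 (the code above uses the older
G.node[n] form).

import networkx as nx

# toy graph: a triangle {0, 1, 2} plus a pendant edge 2-3
G = nx.Graph([(0, 1), (0, 2), (1, 2), (2, 3)])

# every node needs a community label before scoring
for n in G.nodes():
    G.nodes[n]['community'] = 0 if n < 2 else 1

# score the candidate pair (0, 3): common-neighbor count plus a bonus for
# common neighbors that share both endpoints' community
for u, v, p in nx.cn_soundarajan_hopcroft(G, [(0, 3)]):
    print('{0}-{1}: {2}'.format(u, v, p))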
Example n. 27
0
            if (i == train_edge_num):
                print 'G.number_of_nodes():', G.number_of_nodes()
                print 'G.number_of_edges():', G.number_of_edges()
                print 'Core size:', len(Core)
                print 'Begin predict..'
                st = time.time()
                predict = get_predict(G, Core)
                print 'Predict finished, time: ', time.time() - st
        else:  # predict
            if (u not in Core) or (v not in Core):
                continue
            edgen_new += 1
            newG.add_edge(u, v)
            # if i==edgen: break
        i += 1
    CN = nx.cn_soundarajan_hopcroft(G, newG.edges())  # long time
    CN = list(CN)  # materialize: generators cannot be pickled
    pickle.dump(CN, open(CN_file, 'w'))
    CN = list(nx.cn_soundarajan_hopcroft(G, sample_missing_edges(newG, G)))
    pickle.dump(CN, open(to_negative(CN_file), 'w'))
    AA = list(nx.adamic_adar_index(G, newG.edges()))
    pickle.dump(AA, open(AA_file, 'w'))
    AA = list(nx.adamic_adar_index(G, sample_missing_edges(newG, G)))
    pickle.dump(AA, open(to_negative(AA_file), 'w'))

    prediction = sorted(predict, key=lambda x: x[-1], reverse=True)
    edgen_new = newG.number_of_edges()

    print 'newG.number_of_nodes():', newG.number_of_nodes()
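
A minimal sketch (graph, pairs, and file name illustrative) of why the list()
calls above matter: the networkx link-prediction functions return lazy
generators, and pickle cannot serialize a generator object, so the scores must
be materialized before dumping.

import pickle
import networkx as nx

G = nx.karate_club_graph()
pairs = [(0, 33), (15, 20)]

# materialize the generator; pickling the raw generator raises TypeError
aa = list(nx.adamic_adar_index(G, pairs))

with open('aa_scores.pkl', 'wb') as f:
    pickle.dump(aa, f)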