def preferential_attachment_scores(g_train, train_test_split):
    if g_train.is_directed(): # Only defined for undirected graphs
        g_train = g_train.to_undirected()

    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split # Unpack input

    start_time = time.time()
    pa_scores = {}

    # Calculate scores
    pa_matrix = np.zeros(adj_train.shape)
    for u, v, p in nx.preferential_attachment(g_train, ebunch=get_ebunch(train_test_split)): # (u, v) = node indices, p = preferential attachment score (deg(u) * deg(v))
        pa_matrix[u][v] = p
        pa_matrix[v][u] = p # make sure it's symmetric
    pa_matrix = pa_matrix / pa_matrix.max() # Normalize matrix

    runtime = time.time() - start_time
    pa_roc, pa_ap = get_roc_score(test_edges, test_edges_false, pa_matrix)

    pa_scores['test_roc'] = pa_roc
    # pa_scores['test_roc_curve'] = pa_roc_curve
    pa_scores['test_ap'] = pa_ap
    pa_scores['runtime'] = runtime
    return pa_scores
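# For reference, a minimal self-contained sketch of the same scoring step, assuming a small
# undirected graph with integer node labels (nx.karate_club_graph stands in for g_train, and
# scoring all non-edges stands in for get_ebunch):
import networkx as nx
import numpy as np

g = nx.karate_club_graph()  # hypothetical stand-in for g_train (nodes 0..33)
n = g.number_of_nodes()
pa_matrix = np.zeros((n, n))
for u, v, p in nx.preferential_attachment(g, ebunch=nx.non_edges(g)):
    pa_matrix[u][v] = pa_matrix[v][u] = p  # p = deg(u) * deg(v), kept symmetric
pa_matrix = pa_matrix / pa_matrix.max()  # normalize scores to [0, 1]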
Example #2
def feature_extractor(graph, samples, deg_centrality):
    """
    Creates a feature vector for each edge of the graph contained in samples 
    """
    feature_vector = []
    number_nodes_out = 0

    for edge in tqdm(samples):
        source_node, target_node = edge[0], edge[1]

        # Degree Centrality
        if (source_node not in deg_centrality
                or target_node not in deg_centrality):
            feature_vector.append(np.array([0, 0, 0, 0, 0, 0]))
            number_nodes_out += 1

        else:

            source_degree_centrality = deg_centrality[source_node]
            target_degree_centrality = deg_centrality[target_node]

            # # Betweenness centrality measure
            # diff_bt = betweenness_centrality[target_node] - betweenness_centrality[source_node]

            # Preferential Attachment
            pref_attach = list(
                nx.preferential_attachment(graph,
                                           [(source_node, target_node)]))[0][2]

            # AdamicAdar
            aai = list(
                nx.adamic_adar_index(graph,
                                     [(source_node, target_node)]))[0][2]

            # Jaccard
            jaccard_coeff = list(
                nx.jaccard_coefficient(graph,
                                       [(source_node, target_node)]))[0][2]
            # Resource allocation index
            res_all = list(
                nx.resource_allocation_index(
                    graph, [(source_node, target_node)]))[0][2]

            # Create edge feature vector with all metrics computed above
            feature_vector.append(
                np.array([
                    source_degree_centrality, target_degree_centrality,
                    pref_attach, aai, jaccard_coeff, res_all
                ]))
    print(f"Number nodes out: {number_nodes_out}")

    return np.array(feature_vector)
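# A hedged usage sketch for feature_extractor; the graph, samples, and deg_centrality below are
# hypothetical stand-ins, with deg_centrality built by nx.degree_centrality as the lookups above suggest:
import networkx as nx

graph = nx.karate_club_graph()                # hypothetical input graph
deg_centrality = nx.degree_centrality(graph)  # {node: degree centrality}
samples = list(graph.edges())[:10]            # node pairs to featurize
features = feature_extractor(graph, samples, deg_centrality)
print(features.shape)  # (10, 6): one 6-dimensional vector per sampled pair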
Example #3
def preferential_attachment(G):
    graph_preferential_attachment = nx.Graph()
    with open("FOF_edges.txt", "rb") as file:
        fof_graph = nx.read_edgelist(file, delimiter=',')
    # compute PA scores for G over the friend-to-friend edges, then iterate them
    cal = nx.preferential_attachment(G, ebunch=fof_graph.edges())
    # build a graph from the PA iterator, storing the score on each edge
    for u, v, x in cal:
        graph_preferential_attachment.add_edge(u, v, score=x)
    with open("graph_preferential_attachment.txt", "wb") as file2:
        nx.write_edgelist(graph_preferential_attachment, file2, delimiter=',')
    print(len(graph_preferential_attachment))
def print_sim_node(g, x=3003425278, y=3003475283):
    print("vertex pair:", x, "and", y)
    print("n of neighbors", x, ":", len(list(g.neighbors(x))))
    print("n of neighbors", y, ":", len(list(g.neighbors(y))))
    print("degree of", x, ":", g.degree(x))
    print("degree of", y, ":", g.degree(y))

    print("common neighbosr:", len(list(nx.common_neighbors(g, x, y))))
    print("Jaccard coefficient:",
          list(nx.jaccard_coefficient(g, [(x, y)]))[0][2])
    print("Adamic/Adar:", list(nx.adamic_adar_index(g, [(x, y)]))[0][2])
    print("preferential attachment:",
          list(nx.preferential_attachment(g, [(x, y)]))[0][2])
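# A hedged usage sketch; the default x and y are dataset-specific node ids, so a small
# hypothetical graph needs explicit arguments:
import networkx as nx

g = nx.karate_club_graph()
print_sim_node(g, x=0, y=33)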
def aggregated_dataset(all_dfs, g_undirected):
    aggregated_df = all_dfs.sort_values(by='Timestamp', ascending=True)
    aggregated_df = aggregated_df.groupby(['id1', 'id2', 'type'],
                                          as_index=False)['weight'].sum()  # group the sorted frame, not the original
    aggregated_df = aggregated_df.set_index(['id1', 'id2'])

    aggregated_df['preferential attachment'] = [
        i[2]
        for i in nx.preferential_attachment(g_undirected, aggregated_df.index)
    ]
    aggregated_df['Common Neighbors'] = aggregated_df.index.map(
        lambda id: len(list(nx.common_neighbors(g_undirected, id[0], id[1]))))
    aggregated_df['label'] = 1
    aggregated_df.to_pickle("./dummy.pkl")
    return aggregated_df
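# Since aggregated_df.index is a pandas MultiIndex of node pairs, it can be passed directly as
# the ebunch; a minimal sketch with a hypothetical graph:
import networkx as nx
import pandas as pd

g_undirected = nx.path_graph(5)  # hypothetical graph with nodes 0..4
idx = pd.MultiIndex.from_tuples([(0, 2), (1, 3)], names=['id1', 'id2'])
df = pd.DataFrame(index=idx)
df['preferential attachment'] = [
    p for _, _, p in nx.preferential_attachment(g_undirected, idx)
]
print(df)  # PA(0, 2) = 1 * 2 = 2, PA(1, 3) = 2 * 2 = 4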
Example #6
def link_prediction(G):
    # predictions = []
    predictions1 = nx.resource_allocation_index(G, G.edges())
    predictions2 = nx.jaccard_coefficient(G, G.edges())
    predictions3 = nx.adamic_adar_index(G, G.edges())
    predictions4 = nx.preferential_attachment(G, G.edges())
    # predictions.extend([predictions1, predictions2, predictions3, predictions4])
    lst = []
    try:
        for u, v, p in predictions1:
            lst.append((u, v, p))
            print('(%d, %d) -> %.8f' % (u, v, p))
    except ZeroDivisionError:
        print("ZeroDivisionError: float division by zero")
    return lst
Example #7
    def metric_coefficients(graph, df_train, df_test):
        """
        Detail:
            It computes the metric coefficients like jaccard, adamic,preferential attachment and resource allocation
        Arguments:
            graph -> nx.Graph()
            df_train -> pd.DataFrame()
            df_test -> pd.DataFrame()
        Return:
            df_train -> pd.DataFrame()
            df_test -> pd.DataFrame()

        """
        filename_testing = os.path.join(Setup.path_project(__file__), "data",
                                        "testing.txt")
        filename_training = os.path.join(Setup.path_project(__file__), "data",
                                         "training.txt")

        for filename, df in zip([filename_training, filename_testing],
                                [df_train, df_test]):
            jaccard = []
            adamic_adar = []  # Adamic-Adar index
            pa = []  # preferential attachment
            ra = []  # resource allocation

            with open(filename, "r") as f:
                for line in f:
                    line = line.split()
                    for u, v, p in nx.jaccard_coefficient(
                            graph, [(line[0], line[1])]):
                        jaccard.append(p)
                    for u, v, p in nx.adamic_adar_index(
                            graph, [(line[0], line[1])]):
                        adamic_adar.append(p)
                    for u, v, p in nx.preferential_attachment(
                            graph, [(line[0], line[1])]):
                        pa.append(p)
                    for u, v, p in nx.resource_allocation_index(
                            graph, [(line[0], line[1])]):
                        ra.append(p)

            df["Jaccard"] = jaccard
            df["Adamic-Adar"] = adamic_adar
            df["Preferential Attachment"] = pa
            df["Resource Allocation"] = ra

        return df_train, df_test
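# Note that line.split() yields string node ids, so the graph must use string node labels for the
# lookups above to succeed; a minimal sketch with a hypothetical graph:
import networkx as nx

graph = nx.relabel_nodes(nx.karate_club_graph(), str)  # string labels, matching line.split()
line = "0 33".split()  # one whitespace-separated node pair per line
for u, v, p in nx.jaccard_coefficient(graph, [(line[0], line[1])]):
    print(u, v, p)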
Example #8
def new_connections_predictions():

    common_neigh = [(e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1])))) for e in nx.non_edges(G)]
    df1 = pd.DataFrame(index = [(item[0],item[1]) for item in common_neigh])
    df1['common_neigh'] = [item[2] for item in common_neigh]
    
    Jaccard_coef = list(nx.jaccard_coefficient(G))
    df2 = pd.DataFrame(index = [(item[0],item[1]) for item in Jaccard_coef])
    df2['Jaccard_coef'] = [item[2] for item in Jaccard_coef]
    
    Resource_allocation = list(nx.resource_allocation_index(G))
    df3 = pd.DataFrame(index = [(item[0],item[1]) for item in Resource_allocation])
    df3['Resource_allocation'] = [item[2] for item in Resource_allocation]
    
    preferential_attachment = list(nx.preferential_attachment(G))
    df4 = pd.DataFrame(index = [(item[0],item[1]) for item in preferential_attachment])
    df4['preferential_attachment'] = [item[2] for item in preferential_attachment]
    
    connections = df1.join(df2, how = 'inner').join(df3, how = 'inner').join(df4, how = 'inner')
    future_connections_mixed = future_connections.join(connections, how = 'left')
    
    future_connections_missing = future_connections_mixed[future_connections_mixed['Future Connection'].isnull()]
    future_connections_okay = future_connections_mixed[~future_connections_mixed['Future Connection'].isnull()]
    
    X_train = future_connections_okay.drop(['Future Connection'], axis = 1)
    
    remove_list = ['common_neigh', 'Jaccard_coef', 'Resource_allocation', 'preferential_attachment']
    Y_train = future_connections_okay.drop(remove_list, axis = 1)
    
    X_test = future_connections_missing.drop(['Future Connection'], axis = 1)
    
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    clf = MLPClassifier(hidden_layer_sizes = [100, 10],
                          alpha = 0.001,
                          random_state = 0,
                          solver = 'lbfgs',
                          verbose = 0)
    
    clf.fit(X_train_scaled, Y_train)
    
    y_proba_test = clf.predict_proba(X_test_scaled)[:,1]
    X_test['proba_scores'] = y_proba_test
    
    return X_test.iloc[:, -1]
def generate_positive_features():
    features = []
    count = 0
    print("Generating positive features......")
    for sample in positive_samples:
        if count % 100 == 0:
            print(count)
        count += 1
        feature = []
        try:
            preds = nx.resource_allocation_index(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.jaccard_coefficient(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.adamic_adar_index(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.preferential_attachment(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.cn_soundarajan_hopcroft(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.ra_index_soundarajan_hopcroft(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.within_inter_cluster(UG, [sample])
            for u, v, p in preds:
                feature.append(p)

            feature.append(1)  # label=1

        except Exception:
            print("one error at: " + str(count))
        features.append(feature)
    print("positive features: " + str(len(features)))
    return features
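# The last three measures above (cn_soundarajan_hopcroft, ra_index_soundarajan_hopcroft, and
# within_inter_cluster) require a 'community' node attribute; a minimal sketch of the setup this
# snippet assumes, with a hypothetical stand-in for UG:
import networkx as nx

UG = nx.karate_club_graph()  # hypothetical stand-in for the undirected graph
for node, club in UG.nodes(data='club'):
    # the Soundarajan-Hopcroft measures read the 'community' attribute by default
    UG.nodes[node]['community'] = 0 if club == 'Mr. Hi' else 1

u, v, p = next(iter(nx.cn_soundarajan_hopcroft(UG, [(0, 33)])))
print(u, v, p)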
Example #10
def new_connections_predictions():
    # Your Code Here
    from sklearn import model_selection

    #creating features according to applications.py file
    common_neighbors = [
        len(list(nx.common_neighbors(G, edge[0], edge[1])))
        for edge in future_connections.index
    ]
    preferential_attachment = [
        item[2] for item in list(
            nx.preferential_attachment(G, ebunch=future_connections.index))
    ]
    adamic = [
        item[2] for item in list(
            nx.adamic_adar_index(G, ebunch=future_connections.index))
    ]

    future_connections['Common Neighbors'] = common_neighbors
    future_connections['Preferential Attachment'] = preferential_attachment
    future_connections['Adamic Adar Index'] = adamic
    future_connections.head(5)

    #split train and test sets to feed to classifier
    train_set = future_connections.dropna()
    test_set = future_connections[
        future_connections['Future Connection'].isnull()]
    X = train_set.iloc[:, 1:]
    y = train_set.iloc[:, 0]
    X_test = test_set.iloc[:, 1:]

    X_train, x_test, Y_train, y_test = model_selection.train_test_split(
        X, y, random_state=0)  # x_test and y_test are held out but unused below
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    #creating the model
    model = MLPClassifier(hidden_layer_sizes=[10, 5],
                          alpha=5,
                          random_state=0,
                          solver='lbfgs',
                          verbose=0)
    model.fit(X_train_scaled, Y_train)
    test_proba = model.predict_proba(X_test_scaled)[:, 1]
    prediction = pd.Series(test_proba, X_test.index)
    return prediction  # Your Answer Here
Example #11
def new_connections_predictions():
    for node in G.nodes():
        G.nodes[node]['community'] = G.nodes[node]['Department']  # G.node was removed in NetworkX 2.4
    preferential_attachment = list(nx.preferential_attachment(G))
    df = pd.DataFrame(index=[(x[0], x[1]) for x in preferential_attachment])
    df['preferential_attachment'] = [x[2] for x in preferential_attachment]
    cn_soundarajan_hopcroft = list(nx.cn_soundarajan_hopcroft(G))
    df_cn_soundarajan_hopcroft = pd.DataFrame(
        index=[(x[0], x[1]) for x in cn_soundarajan_hopcroft])
    df_cn_soundarajan_hopcroft['cn_soundarajan_hopcroft'] = [
        x[2] for x in cn_soundarajan_hopcroft
    ]
    df = df.join(df_cn_soundarajan_hopcroft, how='outer')
    df['cn_soundarajan_hopcroft'] = df['cn_soundarajan_hopcroft'].fillna(
        value=0)
    df['resource_allocation_index'] = [
        x[2] for x in list(nx.resource_allocation_index(G))
    ]
    df['jaccard_coefficient'] = [x[2] for x in list(nx.jaccard_coefficient(G))]
    df = future_connections.join(df, how='outer')
    df_train = df[~pd.isnull(df['Future Connection'])]
    df_test = df[pd.isnull(df['Future Connection'])]
    features = [
        'cn_soundarajan_hopcroft', 'preferential_attachment',
        'resource_allocation_index', 'jaccard_coefficient'
    ]
    X_train = df_train[features]
    Y_train = df_train['Future Connection']
    X_test = df_test[features]
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    clf = MLPClassifier(hidden_layer_sizes=[10, 5],
                        alpha=5,
                        random_state=0,
                        solver='lbfgs',
                        verbose=0)
    clf.fit(X_train_scaled, Y_train)
    test_proba = clf.predict_proba(X_test_scaled)[:, 1]
    predictions = pd.Series(test_proba, X_test.index)
    target = future_connections[pd.isnull(
        future_connections['Future Connection'])]
    target['prob'] = [predictions[x] for x in target.index]
    return target['prob']
Example #12
def new_connections_predictions():
    
    
    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    df = future_connections
    df['preferential attachment'] = [i[2] for i in nx.preferential_attachment(G, df.index)]

    df['common_neighbors'] = df.index.map(lambda edge: len(list(nx.common_neighbors(G, edge[0], edge[1]))))


    df_test = df[df['Future Connection'].isnull()]
    df_train = df[~df['Future Connection'].isnull()]

    X_notnull = df_train[['preferential attachment','common_neighbors']]

    y_notnull = df_train['Future Connection']  # a Series, as clf.fit expects

    X_pred = df_test[['preferential attachment','common_neighbors']]

    X_train, X_test, y_train, y_test = train_test_split(X_notnull, y_notnull, test_size=0.2)

    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)  
    X_test = scaler.transform(X_test)  
    X_pred = scaler.transform(X_pred)  

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    #print('Training Score :\t', clf.score(X_train, y_train))
    #print('Test Score :\t\t', clf.score(X_test, y_test))

    y_pred = clf.predict_proba(X_pred)[:,1]

    #df_test.index

    df2 = pd.Series(y_pred)

    df2.index = df_test.index

    return df2
Example #13
def new_connections_predictions():
    for node in G.nodes():
        G.nodes[node]["community"] = G.nodes[node]["Department"]  # G.node was removed in NetworkX 2.4
        
    preferential_attachment = list(nx.preferential_attachment(G))
    df_preferential_attachment = pd.DataFrame(index=[(x[0], x[1]) for x in preferential_attachment])
    df_preferential_attachment["preferential_attachment"] = [x[2] for x in preferential_attachment]
    
    cn_soundarajan_hopcroft = list(nx.cn_soundarajan_hopcroft(G))
    df_cn_soundarajan_hopcroft = pd.DataFrame(index=[(x[0], x[1]) for x in cn_soundarajan_hopcroft])
    df_cn_soundarajan_hopcroft["cn_soundarajan_hopcroft"] = [x[2] for x in cn_soundarajan_hopcroft]
    
    df = df_preferential_attachment.join(df_cn_soundarajan_hopcroft, how="outer")
    
    df["cn_soundarajan_hopcroft"] = df["cn_soundarajan_hopcroft"].fillna(value=0)
    df["resource_allocation_index"] = [x[2] for x in list(nx.resource_allocation_index(G))]
    df["jaccard_coefficient"] = [x[2] for x in list(nx.jaccard_coefficient(G))]
    
    df = future_connections.join(df, how="outer")
    
    df["Future Connection"] = df["Future Connection"].fillna(-1)
    future_connections["Future Connection"] = future_connections["Future Connection"].fillna(-1)
    
    features = ["cn_soundarajan_hopcroft", "preferential_attachment", "resource_allocation_index", "jaccard_coefficient"]
    X_train = df[df["Future Connection"]!=-1][features]
    y_train = df[df["Future Connection"]!=-1]["Future Connection"]
    X_test = df[df["Future Connection"]==-1][features]        
    
    scaler = MinMaxScaler()    
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = MLPClassifier(alpha=5, random_state=0, solver="lbfgs").fit(X_train_scaled, y_train)
        
    predictions = clf.predict_proba(X_test_scaled)[:, 1]
    predictions_formatted = pd.Series(predictions, X_test.index)
    
    result = future_connections[future_connections["Future Connection"]==-1]
    result["probability"] = [predictions_formated[x] for x in result.index]
    return result["probability"]
def stalker_evolution(M):
    # Common Neighbors
    CN = [(e[0], e[1], len(list(nx.common_neighbors(M, e[0], e[1]))))
          for e in nx.non_edges(M)]
    CN.sort(key=operator.itemgetter(2), reverse=True)

    # Jaccard coef
    jaccard = list(nx.jaccard_coefficient(M))
    jaccard.sort(key=operator.itemgetter(2), reverse=True)

    # Resource Allocation index
    RA = list(nx.resource_allocation_index(M))
    RA.sort(key=operator.itemgetter(2), reverse=True)

    # Adamic-Adar index
    AA = list(nx.adamic_adar_index(M))
    AA.sort(key=operator.itemgetter(2), reverse=True)

    # Preferential Attachment
    PA = list(nx.preferential_attachment(M))
    PA.sort(key=operator.itemgetter(2), reverse=True)

    # Community Common Neighbors !!! requires graph to have node attribute: 'community' !!!
    # CCN = list(nx.cn_soundarajan_hopcroft(M))
    # CCN.sort(key=operator.itemgetter(2), reverse = True)

    # Community Resource Allocation !!! requires graph to have node attribute: 'community' !!!
    # CRA = list(nx.ra_index_soundarajan_hopcroft(M))
    # CRA.sort(key=operator.itemgetter(2), reverse = True)

    # ###################### Prediction of future edge formation ####################

    FM = M.copy()  # work on a copy so the input graph is not mutated
    for i in PA[0:int(0.1 * len(M.edges()))]:
        FM.add_edge(i[0], i[1], value='new')

    for i in CN[0:int(0.1 * len(M.edges()))]:
        FM.add_edge(i[0], i[1], value='new')

    return FM
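# A hedged usage sketch for stalker_evolution (operator must be imported in the source module;
# the graph below is hypothetical):
import networkx as nx

M = nx.karate_club_graph()
FM = stalker_evolution(M)
new_edges = [(u, v) for u, v, d in FM.edges(data=True) if d.get('value') == 'new']
print(len(new_edges))  # edges predicted from the top PA and CN scores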
Example #15
def link_scores(graph, all_dfs, labels, g_undirected):
    predictions1 = nx.preferential_attachment(g_undirected, g_undirected.edges())
    predictions1 = {(u, v): p for u, v, p in predictions1}

    all_dfs['Preferential_Attachment'] = all_dfs.apply(lambda x: map_predictions_to_df(predictions1, x), axis=1)

    predictions3 = nx.resource_allocation_index(g_undirected, g_undirected.edges())

    try:
        predictions3 = {(u, v): p for u, v, p in predictions3}

        all_dfs['Resource_allocation'] = all_dfs.apply(lambda x: map_predictions_to_df(predictions3, x), axis=1)

    except ZeroDivisionError:
        print("ZeroDivisionError: float division by zero")

    return all_dfs
Example #16
def extract_edge_feature(G, unG, head, tail, node_feat):
    '''
    features
    1. head node feature
    2. tail node feature
    3. pmi
    4. num common successors
    5. num common predecessors
    6. num pred(head) & succ(tail)
    7. num common neighbor
    8. jaccard
    9. resource
    10. adamic
    11. has path
    '''
    head_feat = node_feat[head] if head in node_feat else [0] * 131
    tail_feat = node_feat[tail] if tail in node_feat else [0] * 131
    all_feat = head_feat + tail_feat
    if head not in G or tail not in G:
        return all_feat + [0] * 9

    all_feat.append(PMI_dict.get((head, tail), 0))  #263 (the original test `(head, tail not in PMI_dict)` checked the wrong expression)
    all_feat.append(len(set(G.successors(head)) & set(G.successors(tail))))
    all_feat.append(len(set(G.predecessors(head)) & set(G.predecessors(tail))))
    all_feat.append(len(set(G.predecessors(head)) & set(G.successors(tail))))
    all_feat.append(len(set(nx.common_neighbors(unG, head, tail))))
    all_feat.append(list(nx.jaccard_coefficient(unG, [(head, tail)]))[0][2])
    all_feat.append(
        list(nx.resource_allocation_index(unG, [(head, tail)]))[0][2])
    all_feat.append(list(nx.adamic_adar_index(unG, [(head, tail)]))[0][2])
    all_feat.append(
        list(nx.preferential_attachment(unG, [(head, tail)]))[0][2])  #271

    #all_feat.append(eb_cent[(head, tail)])
    #all_feat.append(nx.has_path(G, head, tail))

    return all_feat
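# A hedged usage sketch; G, unG, node_feat, and PMI_dict below are hypothetical stand-ins
# (PMI_dict is a module-level global the function reads, so it must live in the same module):
import networkx as nx

G = nx.karate_club_graph().to_directed()  # hypothetical directed input graph
unG = G.to_undirected()
node_feat = {}  # no node features available, so [0] * 131 placeholders are used
PMI_dict = {}   # empty, so the PMI feature falls back to 0
feat = extract_edge_feature(G, unG, 0, 33, node_feat)
print(len(feat))  # 131 + 131 + 9 = 271 features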
def new_connections_predictions():

    from sklearn.ensemble import GradientBoostingClassifier

    future_connections['pref_attachment'] = [
        list(nx.preferential_attachment(G, [node_pair]))[0][2]
        for node_pair in future_connections.index
    ]
    future_connections['comm_neighbors'] = [
        len(list(nx.common_neighbors(G, node_pair[0], node_pair[1])))
        for node_pair in future_connections.index
    ]
    train_data = future_connections[~future_connections['Future Connection'].
                                    isnull()]
    test_data = future_connections[
        future_connections['Future Connection'].isnull()]
    clf = GradientBoostingClassifier()
    clf.fit(train_data[['pref_attachment', 'comm_neighbors']].values,
            train_data['Future Connection'].values)
    preds = clf.predict_proba(test_data[['pref_attachment',
                                         'comm_neighbors']].values)[:, 1]
    return pd.Series(preds, index=test_data.index)
def new_connections_predictions():

    df = future_connections
    df['Department'] = [
        1. if G.nodes[connection[0]]['Department']
        == G.nodes[connection[1]]['Department'] else 0.
        for connection in future_connections.index
    ]
    df['pa'] = [
        i[2]
        for i in nx.preferential_attachment(G, ebunch=future_connections.index)
    ]
    df['cn'] = [
        len(set(nx.common_neighbors(G, connection[0], connection[1])))
        for connection in future_connections.index
    ]
    df_train = df.dropna()
    df_test = df[df['Future Connection'].isnull()]
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    #from sklearn.metrics import roc_auc_score
    #X_train, X_test, y_train, y_test = train_test_split(df_train[['Department', 'pa', 'cn']], df_train['Future Connection'])

    #from sklearn.ensemble import RandomForestClassifier
    #from sklearn.model_selection import cross_val_score
    #rfc = RandomForestClassifier(n_estimators=100, max_depth=3)
    #print(np.mean(cross_val_score(rfc, X_train, y_train, cv=5, scoring='roc_auc')))
    lr = LogisticRegression()
    #print(np.mean(cross_val_score(lr, X_train, y_train, cv=5, scoring='roc_auc')))
    #lr.fit(X_train, y_train.values.reshape(-1,1))
    # predict probabilities
    #lr_probs = model.predict_proba(X_test)[:, 1]
    #rfc = RandomForestClassifier(n_estimators=100, max_depth=5)
    lr.fit(df_train[['Department', 'pa', 'cn']], df_train['Future Connection'])
    lr_probs = lr.predict_proba(df_test[['Department', 'pa', 'cn']])[:, 1]
    result_series = pd.Series(lr_probs, index=df_test.index)

    return result_series
Example #19
def get_all_proximity_score(G, edges):
    proximity_score_list = [[] for _ in range(len(edges))]
    cc = [
        nx.square_clustering(G, edge[0]) + nx.square_clustering(G, edge[1])
        for edge in edges
    ]
    cn = [
        len(list(nx.common_neighbors(G, edge[0], edge[1]))) for edge in edges
    ]
    jc = nx.jaccard_coefficient(G, edges)
    pa = nx.preferential_attachment(G, edges)
    rai = nx.resource_allocation_index(G, edges)
    for i, data in enumerate(cc):
        proximity_score_list[i].append(data)
    for i, data in enumerate(cn):
        proximity_score_list[i].append(data)
    for i, data in enumerate(jc):
        proximity_score_list[i].append(data[2])
    for i, data in enumerate(pa):
        proximity_score_list[i].append(data[2])
    for i, data in enumerate(rai):
        proximity_score_list[i].append(data[2])
    return proximity_score_list
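# A hedged usage sketch for get_all_proximity_score (the graph and candidate pairs are hypothetical):
import networkx as nx

G = nx.karate_club_graph()
edges = list(nx.non_edges(G))[:5]
scores = get_all_proximity_score(G, edges)
# each inner list: [square clustering sum, common neighbors, Jaccard,
#                   preferential attachment, resource allocation]
print(scores[0])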
Example #20
    def predict(self):
        preds = None
        if self.algo_name == 'RAI':
            print('RAI')  # 0.707
            preds = nx.resource_allocation_index(self.G, self.edges_to_prediction)
        if self.algo_name == 'jaccard':
            print('jaccard') # 0.628
            preds = nx.jaccard_coefficient(self.G, self.edges_to_prediction)
        if self.algo_name == 'adamic_adar_index':
            print('adamic_adar_index') #0.687
            preds = nx.adamic_adar_index(self.G, self.edges_to_prediction )
        if self.algo_name == 'preferential_attachment':
            print('preferential_attachment') #0.498
            preds = nx.preferential_attachment(self.G,self.edges_to_prediction )

        if preds is None:
            raise ValueError('Algorithm was not found: %s Or something weird happened in prediction' % self.algo_name)

        predictions1 = [(i, v) for (v, i) in sorted([(p, (u, v)) for (u, v, p) in preds],
                                                    reverse=True)]  # (edge, score) pairs in descending order of score
        predictions2 = predictions1  # the following is redundant here... #[t for t in predictions1 if t[0]<t[1]] # just upper half of the matrix and predictions larger than 0
        return predictions2
Example #21
def similarities_matrices_calc(graphs):
    # for idx in range(len(graphs)):

    nodes = list(graphs.nodes)
    GD = {}
    CN = {}
    G = graphs.to_undirected()  # graph must be undirected in order for functions to work

    for first_node in nodes:
        for second_node in nodes:

            ## 6.1 find the common neighbors of nodes
            neighbors = []
            temp_neighbors = nx.common_neighbors(G, first_node, second_node)

            for p in temp_neighbors:
                neighbors.append(p)

            CN[first_node, second_node] = len(neighbors)

            # 6.2 find the graph distance
            try:
                distance = nx.shortest_path_length(G, first_node, second_node)
                GD[first_node, second_node] = distance
            except:
                continue

    # 6.3 find the jaccard coefficient
    jaccard = nx.jaccard_coefficient(G)

    # 6.4 find the adamic adar
    adamic = nx.adamic_adar_index(G)

    # 6.5 find the preferential attachment
    preferential = nx.preferential_attachment(G)

    return CN, GD, jaccard, adamic, preferential
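# Note that the last three return values are lazy generators over all non-edges of G and must be
# materialized by the caller; a hedged sketch with a hypothetical input graph:
import networkx as nx

CN, GD, jaccard, adamic, preferential = similarities_matrices_calc(
    nx.karate_club_graph())
# CN and GD are dicts keyed by node pair; the rest yield (u, v, score) triples
top_pa = sorted(preferential, key=lambda t: t[2], reverse=True)[:3]
print(top_pa)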
Example #22
def preferential_attachment_scores(g_train, train_test_split):
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split # Unpack input

    start_time = time.time()
    pa_scores = {}

    # Calculate scores
    pa_matrix = np.zeros(adj_train.shape)
    for u, v, p in nx.preferential_attachment(
            g_train):  # (u, v) = node indices, p = preferential attachment score
        pa_matrix[u][v] = p
        pa_matrix[v][u] = p  # make sure it's symmetric
    pa_matrix = pa_matrix / pa_matrix.max()  # Normalize matrix

    runtime = time.time() - start_time
    pa_roc, pa_roc_curve, pa_ap = get_roc_score(test_edges, test_edges_false,
                                                pa_matrix)

    pa_scores['test_roc'] = pa_roc
    pa_scores['test_roc_curve'] = pa_roc_curve
    pa_scores['test_ap'] = pa_ap
    pa_scores['runtime'] = runtime
    return pa_scores
Example #23
    def new_friends(self, G):
        """Creates new edges using the built-in function nx.preferential_attachment to make the network dynamic.
        Only adds edges a percentage of the time, which depends on how high the preferential attachment value is.
        :param G: a networkx DiGraph
        :return: G
        """
        H = G.to_undirected()  # an undirected copy of the original graph
        n = nx.preferential_attachment(H)  # preferential attachment scores for all non-edges of H
        for u, v, p in n:
            chance = random.randint(0, 100)  # a randomly generated number between 0 and 100
            # create a new relationship (edge) between two nodes if their preferential
            # attachment score is at least the total number of edges and chance is at least 90
            if p >= len(G.edges) and chance >= 90:
                G.add_edge(u, v, weight=random.uniform(-1, 1))
        return G
Example #24
def get_features(L, flag):
    X = [[] for i in range(len(L))]

    #=====================Social features(user-to-user graph)======================

    #g0.adamic adar score
    if flag['g0'] is True:
        print("get feature g0")
        preds = nx.adamic_adar_index(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1

    #g1.jaccard coefficient
    if flag['g1'] is True:
        print("get feature g1")
        preds = nx.jaccard_coefficient(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1
    #g2.resource_allocation
    if flag['g2'] is True:
        print("get feature g2")
        preds = nx.resource_allocation_index(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1

    #g3.preferential_attachment
    if flag['g3'] is True:
        print("get feature g3")
        preds = nx.preferential_attachment(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1

    #g4.shortest path length
    if flag['g4'] is True:
        print("get feature g4")
        cnt = 0
        for (u, v) in L:
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                if nx.has_path(G, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(G, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
                G.add_edge(u, v)
            else:
                if nx.has_path(G, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(G, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
            cnt += 1

    #g5.common neighbors
    if flag['g5'] is True:
        print("get feature g5")
        cnt = 0
        for (u, v) in L:
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                T = [w for w in nx.common_neighbors(G, u, v)]
                G.add_edge(u, v)
            else:
                T = [w for w in nx.common_neighbors(G, u, v)]
            X[cnt].append(len(T))
            cnt += 1

    #g6.Approximate katz for social graph
    if flag['g6'] is True:
        print("get feature g6")
        cnt = 0
        for (u, v) in L:
            p = 0
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                for x in G.neighbors(u):
                    for y in G.neighbors(v):
                        if x == y or G.has_edge(x, y):
                            p += 1
                G.add_edge(u, v)
            else:
                for x in G.neighbors(u):
                    for y in G.neighbors(v):
                        if x == y or G.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1

    if flag['g7'] is True:
        print("get feature g7")
        cnt = 0
        with open("best_part_G.txt", "r") as f:
            for line in f:
                v, c = line.split()
                c = int(c)
                G.nodes[v]['community'] = c  # G.node was removed in NetworkX 2.4
        iters = nx.cn_soundarajan_hopcroft(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1

    if flag['g8'] is True:
        print("get feature g8")
        cnt = 0
        with open("best_part_G.txt", "r") as f:
            for line in f:
                if line == "":
                    continue
                v, c = line.split()
                c = int(c)
                G.nodes[v]['community'] = c
        iters = nx.ra_index_soundarajan_hopcroft(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1

    if flag['g9'] is True:
        print("get feature g9")
        cnt = 0
        with open("best_part_G.txt", "r") as f:
            for line in f:
                v, c = line.split()
                c = int(c)
                G.nodes[v]['community'] = c
        iters = nx.within_inter_cluster(G, L, delta=0.5)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1

    if flag['g10'] is True:
        print("get feature g10")
        cnt = 0
        with open("dendo_G.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.cn_soundarajan_hopcroft(G, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    G.nodes[v]['community'] = c
        iters = nx.cn_soundarajan_hopcroft(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds

    if flag['g11'] is True:
        print("get feature g11")
        cnt = 0
        with open("dendo_G.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.ra_index_soundarajan_hopcroft(G, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    G.nodes[v]['community'] = c
        iters = nx.ra_index_soundarajan_hopcroft(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds

    if flag['g12'] is True:
        print("get feature g12")
        cnt = 0
        with open("dendo_G.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.within_inter_cluster(G, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    G.nodes[v]['community'] = c
        iters = nx.within_inter_cluster(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds
    #=========================checkin features=========================================
    #c0.follower number
    if flag['c0'] is True:
        print("get feature c0")
        cnt = 0
        for (u, v) in L:
            X[cnt].append(U[u]['follow_cnt'] * U[v]['follow_cnt'])  # fu*fv
            cnt += 1

    #c1.same time same location
    if flag['c1'] is True:
        print("get feature c1")
        cnt = 0
        for (u, v) in L:
            p = calculate_CCC(G, u, v)
            X[cnt].append(p)
            cnt += 1

    #c2.same time same distinct spot
    if flag['c2'] is True:
        print("get deature c2")
        cnt = 0
        for (u, v) in L:
            p = 0
            dis_same_spot = []
            for k in C[u]:
                if k[1] not in dis_same_spot and k in C[v]:
                    dis_same_spot.append(k[1])
                    p += 1
            X[cnt].append(p)
            cnt += 1

    #c3.same distinct spot (not necessarily same time)
    if flag['c3'] is True:
        cnt = 0
        print("get feature c3")
        for (u, v) in L:
            p = 0
            dis_same_spot = []
            for k in C[u]:
                if k[1] not in dis_same_spot:
                    for m in C[v]:
                        if k[1] == m[1]:
                            dis_same_spot.append(k[1])
                            p += 1
                            break
            X[cnt].append(p)
            cnt += 1

    #c4.min Entropy
    if flag['c4'] is True:
        print("get feature c4")
        cnt = 0
        for (u, v) in L:
            p = 0
            E_list = []
            for k in C[u]:
                if k in C[v]:
                    spot = k[1]
                    if spot in S and S[spot]['entropy'] > 0:
                        E_list.append(S[spot]['entropy'])
            if len(E_list) > 0:
                p = min(E_list)
            X[cnt].append(p)
            cnt += 1

    #c5. distance of mean_LL
    if flag['c5'] is True:
        cnt = 0
        print("get feature c5")
        for (u, v) in L:
            dist = np.sqrt((U[u]['mean_LL'][0] - U[v]['mean_LL'][0])**2 +
                           (U[u]['mean_LL'][1] - U[v]['mean_LL'][1])**2)
            X[cnt].append(dist)
            cnt += 1

    #c6.weighted same location
    if flag['c6'] is True:
        print("get feature c6")
        cnt = 0
        for (u, v) in L:
            p = 0
            for k in C[u]:
                if k in C[v]:
                    spot = k[1]
                    #if spot in S and S[spot]['entropy'] > 0:
                    #p += 1/S[spot]['entropy']
                    if spot in S:
                        dist = np.sqrt(
                            (S[spot]['LL'][0] - U[u]['mean_LL'][0])**2 +
                            (S[spot]['LL'][1] - U[u]['mean_LL'][1])**2)
                        p += dist
                        dist = np.sqrt(
                            (S[spot]['LL'][0] - U[v]['mean_LL'][0])**2 +
                            (S[spot]['LL'][1] - U[v]['mean_LL'][1])**2)
                        p += dist
            X[cnt].append(p)
            cnt += 1

    #c7.PP
    if flag['c7'] is True:
        print("get feature c7")
        cnt = 0
        for (u, v) in L:
            p = len(C[u]) * len(C[v])
            X[cnt].append(p)
            cnt += 1

    #c8.Total Common Friend Closeness (TCFC)
    if flag['c8'] is True:
        print("get feature c8")
        cnt = 0
        for (u, v) in L:
            p = 0
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                for w in nx.common_neighbors(G, u, v):
                    T1 = [x for x in nx.common_neighbors(G, u, w)]
                    T2 = [x for x in nx.common_neighbors(G, v, w)]
                    p += len(T1) * len(T2)
                G.add_edge(u, v)
            else:
                for w in nx.common_neighbors(G, u, v):
                    T1 = [x for x in nx.common_neighbors(G, u, w)]
                    T2 = [x for x in nx.common_neighbors(G, v, w)]
                    p += len(T1) * len(T2)
            X[cnt].append(p)
            cnt += 1

    #c9.Total Common friend Checkin Count (TCFCC)
    if flag['c9'] is True:
        print("get feature c9")
        cnt = 0
        for (u, v) in L:
            p = 0
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                for w in nx.common_neighbors(G, u, v):
                    p += calculate_CCC(G, u, w) * calculate_CCC(G, v, w)
                G.add_edge(u, v)
            else:
                for w in nx.common_neighbors(G, u, v):
                    p += calculate_CCC(G, u, w) * calculate_CCC(G, v, w)
            X[cnt].append(p)
            cnt += 1

    #c10. Common Category Checkin Counts Product (CCCP)
    if flag['c10'] is True:
        print("get feature c10")
        cnt = 0
        for (u, v) in L:
            p = 0
            for cat in U[u]['cate']:
                if cat in U[v]['cate']:
                    p += U[u]['cate'][cat] * U[v]['cate'][cat]
            X[cnt].append(p)
            cnt += 1

    #c11. Common Category Checkin Counts Product Ratio(CCCPR)
    if flag['c11'] is True:
        print("get feature c11")
        cnt = 0
        for (u, v) in L:
            p = 0
            u_cate_total = sum(U[u]['cate'][cat]**2 for cat in U[u]['cate'])
            v_cate_total = sum(U[v]['cate'][cat]**2 for cat in U[v]['cate'])
            for cat in U[u]['cate']:
                if cat in U[v]['cate']:
                    p += (U[u]['cate'][cat] * U[v]['cate'][cat] /
                          np.sqrt(u_cate_total * v_cate_total))
            X[cnt].append(p)
            cnt += 1

    #c12. trip route length all
    if flag['c12'] is True:
        print("get feature c12")
        cnt = 0
        for (u, v) in L:
            tripDayLen1 = list()
            tripDayLen2 = list()
            tripDay = "starting"
            tripLen = 0.0
            lastSpot = [0.0, 0.0]
            for k in C[u]:
                if not (lastSpot[0] == 0.0 and lastSpot[1] == 0.0):
                    if k[1] in S:
                        tripLen += np.sqrt((lastSpot[0] -
                                            S[k[1]]['LL'][0])**2 +
                                           (lastSpot[1] - S[k[1]]['LL'][1])**2)
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
                else:
                    if k[1] in S:
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
            tripDay = "starting"
            tripLen2 = 0.0
            lastSpot = [0.0, 0.0]
            for k in C[v]:
                if not (lastSpot[0] == 0.0 and lastSpot[1] == 0.0):
                    if k[1] in S:
                        tripLen2 += np.sqrt(
                            (lastSpot[0] - S[k[1]]['LL'][0])**2 +
                            (lastSpot[1] - S[k[1]]['LL'][1])**2)
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
                else:
                    if k[1] in S:
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
            X[cnt].append(tripLen + tripLen2)
            cnt += 1

    #=========================Heter Graph features=====================================

    #h0.Approximate katz for bipartite graph
    if flag['h0'] is True:
        print("get feature h0")
        cnt = 0
        for (u, v) in L:
            p = 0
            for x in B.neighbors(u):
                for y in B.neighbors(v):
                    if x == y or B.has_edge(x, y):
                        p += 1
            X[cnt].append(p)
            cnt += 1

    #h1.Approximate katz on HB
    if flag['h1'] is True:
        print("get feature h1")
        cnt = 0
        for (u, v) in L:
            p = 0
            if HB.has_edge(u, v):
                HB.remove_edge(u, v)
                for x in HB.neighbors(u):
                    for y in HB.neighbors(v):
                        if x == y or HB.has_edge(x, y):
                            p += 1
                HB.add_edge(u, v)
            else:
                for x in HB.neighbors(u):
                    for y in HB.neighbors(v):
                        if x == y or HB.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1

    #h2.Approximate katz on H
    if flag['h2'] is True:
        print("get feature h2")
        cnt = 0
        for (u, v) in L:
            p = 0
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                for x in H.neighbors(u):
                    for y in H.neighbors(v):
                        if x == y or H.has_edge(x, y):
                            p += 1
                H.add_edge(u, v)
            else:
                for x in H.neighbors(u):
                    for y in H.neighbors(v):
                        if x == y or H.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1

    #h3.shortest path length on B
    if flag['h3'] is True:
        print("get feature h3")
        cnt = 0
        for (u, v) in L:
            if nx.has_path(B, u, v):
                X[cnt].append(
                    nx.shortest_path_length(B, source=u, target=v) / 50000)
            else:
                X[cnt].append(1)
            cnt += 1

    #h4. clustering coefficient on H
    if flag['h4'] is True:
        print("get feature h4")
        cnt = 0
        for (u, v) in L:
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                p = nx.clustering(H, u) * nx.clustering(H, v)
                H.add_edge(u, v)
            else:
                p = nx.clustering(H, u) * nx.clustering(H, v)
            X[cnt].append(p)
            cnt += 1

    #h5. number of (user's loc friends)'s loc friends
    if flag['h5'] is True:
        print("get feature h5")
        cnt = 0
        for (u, v) in L:
            counter1 = 0
            for neighbor in H.neighbors(u):
                if not neighbor.isnumeric():
                    for neighbor2 in H.neighbors(neighbor):
                        if not neighbor2.isnumeric():
                            counter1 += 1
            counter2 = 0
            for neighbor in H.neighbors(v):
                if not neighbor.isnumeric():
                    for neighbor2 in H.neighbors(neighbor):
                        if not neighbor2.isnumeric():
                            counter2 += 1

            #print(str(counter1)+" "+str(counter2)+"\n")
            X[cnt].append(counter1 * counter2)
            cnt += 1

    #h6. location friends' degree sum
    if flag['h6'] is True:
        print("get feature h6")
        cnt = 0
        for (u, v) in L:
            counter1 = 0
            for locationNeighbor in H.neighbors(u):
                if not locationNeighbor.isnumeric():
                    #print(str(locationNeighbor)+"\n")
                    if locationNeighbor in LG:
                        counter1 += LG.degree(locationNeighbor)

            counter2 = 0
            for locationNeighbor in H.neighbors(v):
                if not locationNeighbor.isnumeric():
                    if locationNeighbor in LG:
                        counter2 += LG.degree(locationNeighbor)
            X[cnt].append(counter1 * counter2)
            cnt += 1

    #h7. Approximate katz for social graph
    if flag['h7'] is True:
        print("get feature h7")
        cnt = 0
        for (u, v) in L:
            counter = 0
            for node in H.neighbors(u):
                if not node.isnumeric():
                    for node2 in H.neighbors(v):
                        if not node2.isnumeric():
                            if node == node2 or H.has_edge(node, node2):
                                counter += 1
            X[cnt].append(counter)
            cnt += 1

    #h8. adamic adar score on H
    if flag['h8'] is True:
        print("get feature h8")
        preds = nx.adamic_adar_index(H, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1
    #h9. resource_allocation on H
    if flag['h9'] is True:
        print("get feature h9")
        preds = nx.resource_allocation_index(H, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1

    #h10. shortest path length on H
    if flag['h10'] is True:
        print("get feature h10")
        cnt = 0
        for (u, v) in L:
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                if nx.has_path(H, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(H, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
                H.add_edge(u, v)
            else:
                if nx.has_path(H, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(H, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
            cnt += 1
    #h11. common neighbors on H
    if flag['h11'] is True:
        print("get feature h11")
        cnt = 0
        for (u, v) in L:
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                T = [w for w in nx.common_neighbors(H, u, v)]
                H.add_edge(u, v)
            else:
                T = [w for w in nx.common_neighbors(H, u, v)]
            X[cnt].append(len(T))
            cnt += 1

    #h12.Approximate katz for social graph
    if flag['h12'] is True:
        print("get feature h12")
        cnt = 0
        for (u, v) in L:
            p = 0
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                for x in H.neighbors(u):
                    for y in H.neighbors(v):
                        if x == y or H.has_edge(x, y):
                            p += 1
                H.add_edge(u, v)
            else:
                for x in H.neighbors(u):
                    for y in H.neighbors(v):
                        if x == y or H.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1

    if flag['h13'] is True:
        print("get feature h13")
        cnt = 0
        with open("best_part_HB.txt", "r") as f:
            for line in f:
                v, c = line.split()
                c = int(c)
                HB.nodes[v]['community'] = c  # HB.node was removed in NetworkX 2.4
        iters = nx.cn_soundarajan_hopcroft(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1

    if flag['h14'] is True:
        print("get feature h14")
        cnt = 0
        with open("best_part_HB.txt", "r") as f:
            for line in f:
                if line == "":
                    continue
                v, c = line.split()
                c = int(c)
                HB.nodes[v]['community'] = c
        iters = nx.ra_index_soundarajan_hopcroft(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1

    if flag['h15'] is True:
        print("get feature h15")
        cnt = 0
        with open("best_part_HB.txt", "r") as f:
            for line in f:
                v, c = line.split()
                c = int(c)
                HB.nodes[v]['community'] = c
        iters = nx.within_inter_cluster(HB, L, delta=0.5)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1

    if flag['h16'] is True:
        print("get feature h16")
        cnt = 0
        with open("dendo_HB.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.cn_soundarajan_hopcroft(HB, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    HB.nodes[v]['community'] = c
        iters = nx.cn_soundarajan_hopcroft(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds

    if flag['h17'] is True:
        print("get feature h17")
        cnt = 0
        with open("dendo_HB.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.ra_index_soundarajan_hopcroft(HB, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    HB.nodes[v]['community'] = c
        iters = nx.ra_index_soundarajan_hopcroft(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds

    if flag['h18'] is True:
        print("get feature h18")
        cnt = 0
        with open("dendo_HB.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.within_inter_cluster(HB, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    HB.nodes[v]['community'] = c
        iters = nx.within_inter_cluster(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds

    return X
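# A hedged sketch of driving get_features with only the purely graph-based flags enabled;
# G is the module-level social graph this snippet assumes, and the extra flag keys generated
# below are simply never read:
import networkx as nx

G = nx.karate_club_graph()  # hypothetical stand-in for the module-level graph
L = list(nx.non_edges(G))[:20]
flag = {f'{prefix}{i}': False for prefix in 'gch' for i in range(19)}
flag['g0'] = flag['g1'] = True  # Adamic-Adar and Jaccard only
X = get_features(L, flag)
print(len(X), len(X[0]))  # 20 candidate pairs, 2 features each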
Example #25
# ### Extracting attributes
#
# Using `nx.get_edge_attributes`, it's easy to extract the edge attributes in the graph into DataFrame columns.

# In[ ]:

df['weight'] = pd.Series(nx.get_edge_attributes(G, 'weight'))

df

# ### Creating edge based features
#
# Many of the networkx functions related to edges return nested data structures. We can extract the relevant data using list comprehension.

# In[ ]:

df['preferential attachment'] = [
    i[2] for i in nx.preferential_attachment(G, df.index)
]

df

# In the case where the function expects two nodes to be passed in, we can map the index to a lambda function.

# In[ ]:

df['Common Neighbors'] = df.index.map(
    lambda city: len(list(nx.common_neighbors(G, city[0], city[1]))))

df
def similarity_matrices(edges, nodes):  # Calculates the similarity matrices
    if len(nodes) == 0:
        print('V* is empty, skipping to next t.')
        return -1
    g = nx.DiGraph()
    g.add_edges_from(edges)
    ung = nx.Graph(g)

    gd = zeros((len(nodes), len(nodes)))
    cn = zeros((len(nodes), len(nodes)))
    jc = zeros((len(nodes), len(nodes)))
    a = zeros((len(nodes), len(nodes)))
    pa = zeros((len(nodes), len(nodes)))
    for i in range(len(nodes)):
        for j in range(len(nodes)):
            try:
                gd[i][j] = nx.shortest_path_length(g, nodes[i], nodes[j])
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                gd[i][j] = -1
            try:
                cn[i][j] = len(
                    list(nx.common_neighbors(ung, nodes[i], nodes[j])))
            except nx.NetworkXError:
                cn[i][j] = -1
            try:
                for u, v, p in nx.jaccard_coefficient(ung,
                                                      [(nodes[i], nodes[j])]):
                    jc[i][j] = p
            except Exception:
                jc[i][j] = -1
            try:
                for u, v, p in nx.adamic_adar_index(ung,
                                                    [(nodes[i], nodes[j])]):
                    a[i][j] = p
            except (ZeroDivisionError, nx.NetworkXError):
                a[i][j] = -1
            try:
                for u, v, p in nx.preferential_attachment(
                        ung, [(nodes[i], nodes[j])]):
                    pa[i][j] = p
            except Exception:
                pa[i][j] = -1
    # Rank each similarity matrix and check how many of its top-scoring
    # pairs correspond to actual edges
    k = 0
    for par in parameter_list:
        ind_list = []
        if k == 0:
            ref = gd
            t = 'Pgd'
        elif k == 1:
            ref = cn
            t = 'Pcn'
        elif k == 2:
            ref = jc
            t = 'Pjc'
        elif k == 3:
            ref = a
            t = 'Pa'
        else:
            ref = pa
            t = 'Ppa'

        for i in range(par):
            flat_ind = argmax(ref)
            dim_ind = (flat_ind // len(nodes), flat_ind % len(nodes))
            ref[dim_ind[0]][dim_ind[1]] = -1  # mask the max so argmax finds the next one
            ind_list.append(dim_ind)
        cnt = 0
        for j in ind_list:
            if (nodes[j[0]], nodes[j[1]]) in edges or \
                    (nodes[j[1]], nodes[j[0]]) in edges:
                cnt += 1
        k += 1
        print(t, cnt / par)
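
# A hedged usage sketch for similarity_matrices. parameter_list is a
# module-level global in the original (not shown here); the values below are
# illustrative top-k cutoffs, one per matrix, in the order gd/cn/jc/a/pa.
# parameter_list = [5, 5, 5, 5, 5]
# edges = [(1, 2), (2, 3), (3, 1), (3, 4), (4, 1)]
# nodes = [1, 2, 3, 4]
# similarity_matrices(edges, nodes)  # prints the hit rate of each metric's top pairs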
Beispiel #27
0
# In[9]:


df['weight'] = pd.Series(nx.get_edge_attributes(G, 'weight'))

df


# ### Creating edge based features
# 
# Many of the networkx functions related to edges return nested data structures. We can extract the relevant data using a list comprehension.

# In[10]:


df['preferential attachment'] = [i[2] for i in nx.preferential_attachment(G, df.index)]

df


# In the case where the function expects two nodes to be passed in, we can map a lambda function over the index.

# In[11]:


df['Common Neighbors'] = df.index.map(lambda city: len(list(nx.common_neighbors(G, city[0], city[1]))))

df


# In[ ]:
        print('Reading %s_topological_network.csv...' %
              (prog_languages[prog_lang_id]))
        t_network = []

        for row in data:
            dev_id_1 = int(row[0])
            dev_id_2 = int(row[1])

            t_network.append((dev_id_1, dev_id_2))
    csvfile.close()

    with open('../Files/topological_metrics.csv', 'a') as a:
        metrics_file = csv.writer(a, delimiter=',')

        print('Writing topological metrics for', prog_languages[prog_lang_id])

        for dev_pair in t_network:
            neighborhood_overlap = nx.jaccard_coefficient(G, [dev_pair])
            adamic_adar = nx.adamic_adar_index(G, [dev_pair])
            preferential_attachment = nx.preferential_attachment(G, [dev_pair])

            for u, v, p in neighborhood_overlap:
                NO = p
            for u, v, p in adamic_adar:
                AA = p
            for u, v, p in preferential_attachment:
                PA = p

            metrics_file.writerow(
                [prog_lang_id, dev_pair[0], dev_pair[1], NO, AA, PA])
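# A hedged alternative sketch: each networkx predictor accepts a whole pair
# list at once, so the three per-pair calls above could be batched into one
# generator pass per metric (same G and t_network as above):
# no_scores = {(u, v): p for u, v, p in nx.jaccard_coefficient(G, t_network)}
# aa_scores = {(u, v): p for u, v, p in nx.adamic_adar_index(G, t_network)}
# pa_scores = {(u, v): p for u, v, p in nx.preferential_attachment(G, t_network)}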
common_neighbors = np.zeros(n)

# computing features for training set
for i in tqdm(range(len(id1))):
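    # If this pair is a positive example, temporarily remove its edge so the
    # similarity features below are computed without leaking the label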
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        G.remove_edge(id1[i], id2[i])

    # Each predictor returns an iterator of (u, v, score); grab the score
    jaccard[i] = next(nx.jaccard_coefficient(G, [(id1[i], id2[i])]))[2]

    adar[i] = next(nx.adamic_adar_index(G, [(id1[i], id2[i])]))[2]

    preferential_attachment[i] = next(
        nx.preferential_attachment(G, [(id1[i], id2[i])]))[2]

    resource_allocation_index[i] = next(
        nx.resource_allocation_index(G, [(id1[i], id2[i])]))[2]

    common_neighbors[i] = len(list(nx.common_neighbors(G, id1[i], id2[i])))

    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        G.add_edge(id1[i], id2[i])

# add feature to data-frame
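# (hypothetical continuation; the original is cut off here, but the arrays
# computed above would typically be attached as DataFrame columns, e.g.)
# training["jaccard"] = jaccard
# training["adar"] = adar
# training["preferential_attachment"] = preferential_attachment
# training["resource_allocation_index"] = resource_allocation_index
# training["common_neighbors"] = common_neighbors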
Beispiel #30
0
            n = nx.number_of_nodes(g)
            e = nx.number_of_edges(g)
            d = nx.degree_histogram(g)
            t = nx.transitivity(g)
            kc = nx.core_number(g)
            nx.set_node_attributes(g, kc, 'k_core')  # networkx 2.x argument order
            dc = nx.degree_centrality(g)
            nx.set_node_attributes(g, dc, 'dc')
            
#            nihs=nx.get_edge_attributes(g,'nih')
            print "Graph has %d nodes and %d edges and %f transitivity" %(n, e, t)
            
            # Score candidate pairs: all non-edges (the default ebunch),
            # plus the existing edges so positives appear in the output too
            pairs = list(nx.preferential_attachment(g))
            pairs += list(nx.preferential_attachment(g, g.edges()))
            
            yr = int(str(year)[0:4])
            for pair in pairs:
                x, y, p = pair
                xyr = int(str(g.nodes[x]['fyr'])[0:4]) if 'fyr' in g.nodes[x] else None
                yyr = int(str(g.nodes[y]['fyr'])[0:4]) if 'fyr' in g.nodes[y] else None
                xft = int(str(g.nodes[x]['firsttie'])[0:4]) if 'firsttie' in g.nodes[x] else None
                yft = int(str(g.nodes[y]['firsttie'])[0:4]) if 'firsttie' in g.nodes[y] else None
                c = len(list(nx.common_neighbors(g, x, y)))
                row = [yr, str(x) + ":" + str(y), int(g.has_edge(x, y)),
                       g.nodes[x]['dc'],
                       g.nodes[y]['dc'],
                       yr - xft if xft and xft < yr else 0,
                       (1 if yr == yyr else 0) if yyr is not None else None,
Beispiel #31
0
def link_prediction(G, query_nodes, target_nodes, n_edges, start_dist, alg = "ra"):
    """Selects a random set of links between based on the scores calculated by 
    a standard link-prediction algorithm from networkx library
    Parameters
    ----------
    G : Networkx graph
        The graph from which the team will be selected.
    query : list 
        The set of nodes from which random walker starts.
    target : list
        The set of nodes from where the random walker ends.
    n_edges : integer
        the number of links to be added
    start_dist: list
        The starting distribution over the query set
    alg: string
        A string describing the link-prediction algorithm to be used
    Returns
    -------
    links : list
        The set of links that reduce the absorbing RW centrality
    ac_scores: list
        The set of scores of adding the links
    """
    assert alg in ["ra", "pa", "jaccard", "aa"], "alg must be one of [\"ra\", \"pa\", \"jaccard\", \"aa\"]."
          
    H = G.copy()
    query_set_size = len(query_nodes)
    map_query_to_org = dict(zip(query_nodes, range(query_set_size)))
    P = csc_matrix(nx.google_matrix(H, alpha=1))
    P_abs = P[list(query_nodes),:][:,list(query_nodes)]
    F = compute_fundamental(P_abs)
    row_sums = start_dist.dot(F.sum(axis=1))[0, 0]
    candidates = list(product(query_nodes, target_nodes))
    eligible = [candidates[i] for i in range(len(candidates)) 
                if H.has_edge(candidates[i][0], candidates[i][1]) == False]
    links_to_add = []
    if alg == 'ra':
        preds = nx.resource_allocation_index(H, eligible)
    elif alg == 'jaccard':
        preds = nx.jaccard_coefficient(H, eligible)
    elif alg == 'aa':
        preds = nx.adamic_adar_index(H, eligible)
    elif alg == 'pa':
        preds = nx.preferential_attachment(H, eligible)
        
    for u,v,p in preds:
        links_to_add.append((u,v,p))
    links_to_add.sort(key=lambda x: x[2], reverse = True)
    
    ac_scores = []
    ac_scores.append(row_sums)
    i = 0
    while i < n_edges:
        F_updated = update_fundamental_mat(F, H, map_query_to_org, links_to_add[i][0])
        H.add_edge(links_to_add[i][0], links_to_add[i][1])
        abs_cen = start_dist.dot(F_updated.sum(axis = 1))[0,0]
        F = F_updated            
        ac_scores.append(abs_cen)
        i += 1
    return links_to_add, ac_scores
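
# A hedged usage sketch (compute_fundamental and update_fundamental_mat are
# helpers defined elsewhere in the original module; values are illustrative):
# G = nx.karate_club_graph()
# query, target = [0, 1, 2], [30, 31, 32]
# start_dist = np.matrix([[1.0 / 3] * 3])
# links, ac = link_prediction(G, query, target, n_edges=2,
#                             start_dist=start_dist, alg="pa")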
Beispiel #32
0
CN.sort(key=operator.itemgetter(2), reverse=True)

# Jaccard coef
jaccard = list(nx.jaccard_coefficient(M))
jaccard.sort(key=operator.itemgetter(2), reverse=True)

# Resource Allocation index
RA = list(nx.resource_allocation_index(M))
RA.sort(key=operator.itemgetter(2), reverse=True)

# Adamic-Adar index
AA = list(nx.adamic_adar_index(M))
AA.sort(key=operator.itemgetter(2), reverse=True)

# Preferential Attachment
PA = list(nx.preferential_attachment(M))
PA.sort(key=operator.itemgetter(2), reverse=True)

# Community Common Neighbors !!! requires graph to have node attribute: 'community' !!!
#CCN = list(nx.cn_soundarajan_hopcroft(M))
#CCN.sort(key=operator.itemgetter(2), reverse = True)

# Community Resource Allocation !!! requires graph to have node attribute: 'community' !!!
#CRA = list(nx.ra_index_soundarajan_hopcroft(M))
#CRA.sort(key=operator.itemgetter(2), reverse = True)

# ###################### Prediction on Future Edge Linkage ####################

FM = M.copy()  # work on a copy so the predicted edges don't mutate M
for i in PA[0:int(0.1 * len(M.edges()))]:
    FM.add_edge(i[0], i[1], value='new')
#4 Create seed and time tracker 
random.seed(0)
t1 = datetime.now()

#5 Create a test graph by deleting 5 percent of the edges of twt
edges_to_remove_from_twt = random.sample(list(twt.edges()), int(0.05 * twt.number_of_edges()))
twt_test = twt.copy()
twt_test.remove_edges_from(edges_to_remove_from_twt)
print("Number of edges deleted : %d" % len(edges_to_remove_from_twt))
print("Number of edges remaining : %d" % (twt_test.number_of_edges()))

#6 Transform twt_test to undirected
twt_test = twt_test.to_undirected()

#7 Calculate JC and PA scores as features for negative (non-edge) pairs
pred_jc_test_neg = list(nx.jaccard_coefficient(twt_test))
pred_pa_test_neg = list(nx.preferential_attachment(twt_test))

#8 Calculate JC and PA scores as features for positive (existing) edges
pred_jc_test_pos = list(nx.jaccard_coefficient(twt_test, twt_test.edges()))
pred_pa_test_pos = list(nx.preferential_attachment(twt_test, twt_test.edges()))

#9 Combine negative and positive predictions
pred_jc_test_total = (pred_jc_test_neg + pred_jc_test_pos)
pred_pa_test_total = (pred_pa_test_neg + pred_pa_test_pos)
print("Number of negative edges : %d" % len(pred_jc_test_neg))
print("Number of positive edges : %d" % len(pred_jc_test_pos))

#[2] Dataframe================================================================

#1 Create score dataframe df
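# (hypothetical sketch, since the original is cut off here: one row per node
# pair, using the combined JC and PA score lists built above)
# df = pd.DataFrame(pred_jc_test_total, columns=["node1", "node2", "jc"])
# df["pa"] = [p for (u, v, p) in pred_pa_test_total]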
Beispiel #34
0
from functools import cmp_to_key


def sort_edges_by_preferential_attachment(graph, edges):
    # compare_with_ties (defined elsewhere) compares two scores, breaking ties
    # randomly; sorted() lost its cmp argument in Python 3, so wrap the
    # comparator with functools.cmp_to_key and apply it to the score field.
    score_key = cmp_to_key(compare_with_ties)
    edges_sorted = sorted(nx.preferential_attachment(graph, edges),
                          key=lambda l: score_key(l[2]), reverse=True)
    return [(row[0], row[1]) for row in edges_sorted], [row[2] for row in edges_sorted]
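
# Hedged usage sketch: rank a set of candidate non-edges by PA score.
# g = nx.karate_club_graph()
# candidates = list(nx.non_edges(g))[:100]
# ranked_pairs, scores = sort_edges_by_preferential_attachment(g, candidates)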