Example 1
def node2vec_embedding(graph, name):
    rw = BiasedRandomWalk(graph)
    walks = rw.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)
    print(f"Number of random walks for '{name}': {len(walks)}")

    model = Word2Vec(
        walks,
        size=dimensions,
        window=window_size,
        min_count=0,
        sg=1,
        workers=workers,
        iter=num_iter,
    )

    # Fall back to a zero vector for nodes missing from the walk corpus;
    # np.ndarray(128) would return an uninitialised (garbage) array.
    missing_nodes = {'16039', '24601', '21450', '12492', '6506', '1545'}

    def get_embedding(u):
        if u in missing_nodes:
            return np.zeros(128)
        return model.wv[u]

    return get_embedding
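A minimal setup sketch for the snippet above, which assumes these names exist at module level; the values here are taken from the similar snippets further down this page and are assumptions, not the original configuration:

import multiprocessing

import numpy as np
from gensim.models import Word2Vec
from stellargraph.data import BiasedRandomWalk

p, q = 1.0, 1.0                  # node2vec return / in-out parameters
dimensions = 128                 # embedding dimensionality
num_walks, walk_length = 10, 80  # walks per root node, maximum walk length
window_size, num_iter = 10, 1    # Word2Vec context window and epochs
workers = multiprocessing.cpu_count()

Example 2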
    def walks(self, walklen, n1):
        G = nx.read_weighted_edgelist(self.dataset + "/krnmdata1/CQAG1.txt")

        rw = BiasedRandomWalk(StellarGraph(G))

        weighted_walks = rw.run(
            nodes=G.nodes(),  # root nodes
            length=walklen,  # maximum length of a random walk
            n=n1,  # number of random walks per root node 
            p=0.5,  # Defines the (unnormalised) probability, 1/p, of returning to the source node
            q=2.0,  # Defines the (unnormalised) probability, 1/q, of moving away from the source node
            weighted=True,  #for weighted random walks
            seed=42  # random seed fixed for reproducibility
        )
        print("Number of random walks: {}".format(len(weighted_walks)))
        #print(weighted_walks[0:10])

        # Remove answer nodes: keep question-node IDs as-is, and shift IDs
        # above the answer range down by the number of answer nodes.
        walks = []
        for walk in weighted_walks:
            w = []
            for node in walk:
                if int(node) < self.qnum:
                    w.append(node)
                elif int(node) > (self.qnum + self.anum):
                    w.append(str(int(node) - self.anum))
            walks.append(w)
        print(walks[0:10])
        return walks
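Example 3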
    def learn_embeddings(self,
                         embedding_dim=100,
                         window_size=5,
                         max_rw_len=50,
                         walks_per_node=20,
                         p=0.5,
                         q=2.0):
        print('Running node2vec...')
        rw = BiasedRandomWalk(StellarGraph(self.graph))
        walks = rw.run(nodes=list(self.graph),
                       length=max_rw_len,
                       n=walks_per_node,
                       p=p,
                       q=q)
        print(f'Number of random walks: {len(walks)}')

        print('Running word2vec...')
        model = Word2Vec(walks,
                         size=embedding_dim,
                         window=window_size,
                         min_count=0,
                         sg=1,
                         workers=2,
                         iter=1)
        # L2-normalise the embedding vectors in place (gensim < 4.0 API)
        model.init_sims(replace=True)

        return model.wv
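Note that size, iter, and init_sims belong to the gensim < 4.0 API. A sketch of the equivalent call under gensim >= 4.0, where the keywords were renamed:

model = Word2Vec(walks,
                 vector_size=embedding_dim,  # renamed from `size`
                 window=window_size,
                 min_count=0,
                 sg=1,
                 workers=2,
                 epochs=1)  # renamed from `iter`
# init_sims was removed in gensim 4; model.wv.get_normed_vectors()
# returns L2-normalised vectors instead.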
Example 4
def node2vec_embedding(graph, name, weighted=False):
    
    p = 1.0
    q = 1.0
    dimensions = 128
    num_walks = 10
    walk_length = 80
    window_size = 10
    num_iter = 1
    workers = multiprocessing.cpu_count()
    
    rw = BiasedRandomWalk(graph)
    walks = rw.run(graph.nodes(), 
                   n=num_walks, 
                   length=walk_length, 
                   p=p, 
                   q=q, 
                   weighted=weighted)
    
    print(f"Number of random walks for '{name}': {len(walks)}")

    model = Word2Vec(walks, size=dimensions, window=window_size, min_count=0,
                     sg=1, workers=workers, iter=num_iter) 
    
    def get_embedding(u):
        return model.wv[u]

    return get_embedding
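The function returns a closure over the trained model; a hypothetical call (the graph object and node ID are illustrative):

embedding_fn = node2vec_embedding(graph, "train-graph", weighted=True)
vec = embedding_fn("42")  # 128-dimensional numpy vector for node "42"

Example 5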
def node2vec_embedding(graph):

    p = 1.0
    q = 1.0
    dimensions = 128
    num_walks = 10
    walk_length = 80
    window_size = 10
    num_iter = 1
    workers = multiprocessing.cpu_count()

    graph = StellarGraph(graph)
    rw = BiasedRandomWalk(graph)
    walks = rw.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)

    print(f"Number of random walks: {len(walks)}")

    model = Word2Vec(walks,
                     size=dimensions,
                     window=window_size,
                     min_count=0,
                     sg=1,
                     workers=workers,
                     iter=num_iter)

    # Rows of model.wv.vectors follow gensim's internal word order
    # (model.wv.index2word), not graph.nodes(), so index by that order.
    features = pd.DataFrame(data=model.wv.vectors, index=model.wv.index2word)
    features.index = features.index.map(str)

    return features
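The returned DataFrame can serve as node features elsewhere; a hypothetical use with StellarGraph's from_networkx (nx_graph is illustrative):

features = node2vec_embedding(nx_graph)
sg_graph = StellarGraph.from_networkx(nx_graph, node_features=features)

Example 6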
def graph_embed():
    combine = get_combine()
    li = ['bank', 'acquirer', 'coin', 'mcc', 'shop', 'nation', 'city']
    d = {
        'bank': 'b',
        'mcc': 'm',
        'acquirer': 'a',
        'coin': 'c',
        'shop': 's',
        'nation': 'n',
        'city': 'z'
    }
    have_df = False
    df_all = None

    # Suffix each ID with its type letter so different entity types do not
    # collide as node names in the shared graph (e.g. bank 12 -> "12b").
    for col_a in li:
        combine[col_a] = combine[col_a].astype(str) + d[col_a]

    for index, col_a in enumerate(li[1:]):
        print(f'{col_a} started..')
        walk_all = []
        for day in np.linspace(1, 120, 120):
            print(day, end=',', flush=True)
            df = combine[combine['date'] == day]
            G = construct_graph('bank', col_a, df)
            rw = BiasedRandomWalk(StellarGraph(G))
            walk = rw.run(
                nodes=list(G.nodes()),  # root nodes
                length=80,  # maximum length of a random walk
                n=1,  # number of random walks per root node 
                p=1,  # Defines the (unnormalised) probability, 1/p, of returning to the source node
                q=1,  # Defines the (unnormalised) probability, 1/q, of moving away from the source node
            )
            walk_all.extend(walk)
            del df, G, rw, walk
            gc.collect()

        model = Word2Vec(walk_all,
                         size=5,
                         window=3,
                         min_count=1,
                         sg=0,
                         workers=16,
                         iter=10)
        # Collect the learned vector for every node of this type
        temp_d = {w: model.wv[w] for w in model.wv.vocab}
        temp_df = pd.DataFrame(
            data=combine[col_a].map(temp_d).tolist(),
            columns=['embed_bank_' + col_a + str(x + 1) for x in range(5)])
        if (have_df):
            df_all = pd.concat([df_all, temp_df], axis=1)
        else:
            df_all = temp_df
            have_df = True
        del temp_d, model
        gc.collect()
    return df_all
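The construct_graph helper used above is not defined on this page; a minimal sketch of a plausible implementation, purely an assumption, is:

import networkx as nx

def construct_graph(col_a, col_b, df):
    # One edge per row between the two entity columns of that day's data
    G = nx.Graph()
    G.add_edges_from(zip(df[col_a], df[col_b]))
    return G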
Example 7
def _fit_node2vec(train_graph, params, edge_weight=None):
    rw = BiasedRandomWalk(train_graph)
    walks = rw.run(
        nodes=list(train_graph.nodes()),  # root nodes
        length=params["length"],
        n=params["number_of_walks"],
        p=params["random_walk_p"],
        q=params["random_walk_q"],
        weighted=edge_weight is not None
    )
    model = Word2Vec(walks, size=params["embedding_dimension"])
    return model.wv[train_graph.nodes()]
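A hypothetical params dictionary matching the keys this helper reads (the values are illustrative):

params = {
    "length": 80,                # maximum walk length
    "number_of_walks": 10,       # walks per root node
    "random_walk_p": 1.0,        # return parameter p
    "random_walk_q": 1.0,        # in-out parameter q
    "embedding_dimension": 128,  # Word2Vec embedding size
}
embeddings = _fit_node2vec(train_graph, params)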
Example 8
    def walks(self, walklen):
        G = nx.read_weighted_edgelist(self.dataset + "/krnmdata1/teamsG.txt")
        rw = BiasedRandomWalk(StellarGraph(G))
        weighted_walks = rw.run(
            nodes=G.nodes(),  # root nodes
            length=walklen,  # maximum length of a random walk
            n=5,  # number of random walks per root node
            p=0.1,  # Defines the (unnormalised) probability, 1/p, of returning to the source node
            q=2.0,  # Defines the (unnormalised) probability, 1/q, of moving away from the source node
            weighted=True,  # for weighted random walks
            seed=42,  # random seed fixed for reproducibility
        )
        print("Number of random walks: {}".format(len(weighted_walks)))
        print(weighted_walks[0:10])
        return weighted_walks
Example 9
def node2vec_embedding(graph, name):
    rw = BiasedRandomWalk(graph)
    walks = rw.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)
    print(f"Number of random walks for '{name}': {len(walks)}")

    model = Word2Vec(
        walks,
        size=dimensions,
        window=window_size,
        min_count=0,
        sg=1,
        workers=workers,
        iter=num_iter,
    )

    def get_embedding(u):
        return model.wv[u]

    return get_embedding
Example 10
    def __init__(self, edges_path, labels_path):
        """
        Hard-coded initialization
        """
        fstar = 1
        a = 0.125  # lower bound of the p, q search interval
        b = 4.125  # upper bound of the p, q search interval
        graph, labels = self.read_data(edges_path, labels_path)
        rw = BiasedRandomWalk(StellarGraph.from_networkx(graph))

        super().__init__(fstar, a, b, graph, labels, rw)
Example 11
def node2vec_walk(G, params):
    """Performs biased random walks using StellarGraph to generate the corpus
    used in node2vec, and writes the corpus to a text file.

    :param G: StellarGraph graph.
        Nodes consist of apps, API calls, packages, and invoke methods.

    :param params: dict
        The node2vec parameter sub-dictionary, i.e. dict["key"] where dict is
        the global parameter dictionary and key selects the node2vec settings.
    """
    start_walks = time.time()
    print("Starting Random Walks")

    rw = BiasedRandomWalk(G)
    fp = os.path.join(params["save_dir"], params["filename"])
    os.makedirs(params["save_dir"], exist_ok=True)

    walks = rw.run(
        nodes=list(G.nodes(node_type="app_nodes")),  # root nodes
        length=params["length"],  # maximum length of a random walk
        n=params["n"],  # number of random walks per root node
        p=params["p"],  # Defines prob, 1/p, of returning to source node
        q=params["q"],  # Defines prob, 1/q, for moving away from source node
    )
    print("--- Done Walking in " + str(int(time.time() - start_walks)) +
          " Seconds ---")
    print()
    print("Number of random walks: {}".format(len(walks)))

    # save walks to file, one space-separated walk per line
    with open(fp, 'w') as f:
        for walk in walks:
            f.write(' '.join(str(node) for node in walk) + '\n')

    if params["verbose"]:
        print("Saved %s to %s" % (params["filename"], params["save_dir"]))

    return
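A hypothetical params sub-dictionary matching the keys node2vec_walk reads (the values are illustrative):

params = {
    "save_dir": "./walks",    # directory for the corpus file
    "filename": "walks.txt",  # corpus file name
    "length": 40,             # maximum walk length
    "n": 10,                  # walks per root node
    "p": 1.0,                 # return parameter
    "q": 1.0,                 # in-out parameter
    "verbose": True,
}
node2vec_walk(G, params)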
Example 12
        def get_node_feats(adj):  # input is cur_adj

            edgelist = adj['idx'].cpu().data.numpy()
            source = edgelist[:, 0]
            target = edgelist[:, 1]
            weight = np.ones(len(source))

            G = pd.DataFrame({
                'source': source,
                'target': target,
                'weight': weight
            })
            G = StellarGraph(edges=G)
            rw = BiasedRandomWalk(G)

            weighted_walks = rw.run(
                nodes=list(G.nodes()),  # root nodes
                length=2,  # maximum length of a random walk
                n=5,  # number of random walks per root node
                p=1,  # Defines the (unnormalised) probability, 1/p, of returning to the source node
                q=0.5,  # Defines the (unnormalised) probability, 1/q, of moving away from the source node
                weighted=True,  # for weighted random walks
                seed=42,  # random seed fixed for reproducibility
            )

            str_walks = [[str(n) for n in walk] for walk in weighted_walks]
            weighted_model = Word2Vec(str_walks,
                                      size=self.feats_per_node,
                                      window=5,
                                      min_count=0,
                                      sg=1,
                                      workers=1,
                                      iter=1)

            # Retrieve node IDs in gensim's internal order and cast them to int
            node_ids = [int(n) for n in weighted_model.wv.index2word]

            weighted_node_embeddings = (
                weighted_model.wv.vectors
            )  # numpy.ndarray of size number of nodes times embeddings dimensionality

            # create dic
            dic = dict(zip(node_ids, weighted_node_embeddings.tolist()))
            # ascending order
            dic = dict(sorted(dic.items()))
            # create matrix
            adj_mat = sp.lil_matrix((self.data.num_nodes, self.feats_per_node))

            for row_idx in node_ids:
                adj_mat[row_idx, :] = dic[row_idx]

            adj_mat = adj_mat.tocsr()
            adj_mat = adj_mat.tocoo()
            coords = np.vstack((adj_mat.row, adj_mat.col)).transpose()
            values = adj_mat.data
            row = list(coords[:, 0])
            col = list(coords[:, 1])
            indexx = torch.LongTensor([row, col])
            tensor_size = torch.Size(
                [self.data.num_nodes, self.feats_per_node])
            degs_out = torch.sparse.FloatTensor(indexx,
                                                torch.FloatTensor(values),
                                                tensor_size)

            hot_1 = {
                'idx': degs_out._indices().t(),
                'vals': degs_out._values()
            }

            return hot_1
Example 13
def node2vec():
    print('Training Node2Vec mode!')

    # initialize results arrays
    total_mse = np.zeros(args.exp_number)

    total_pcc = np.zeros(args.exp_number)
    total_mae = np.zeros(args.exp_number)
    mse_datasets = {}
    std_datasets = {}
    pcc_datasets = {}
    pcc_std_datasets = {}
    mae_datasets = {}
    mae_std_datasets = {}

    t_total = time.time()

    if args.dataset == 'all':
        datasets = [
            'airport', 'collaboration', 'congress', 'forum', 'geom', 'astro'
        ]
    else:
        datasets = [args.dataset]

    for dataset in datasets:
        for exp_number in range(args.exp_number):
            print("%s: experiment number %d" % (dataset, exp_number + 1))

            data = preprocess_dataset.clean_data(dataset)
            if dataset != 'usair':
                data['weights'] = preprocessing.normalize([data['weights']])[0]

            # random split of data
            data_train, data_test = train_test_split(data, test_size=0.2)
            data_train, data_val = train_test_split(data_train, test_size=0.08)

            data = data.reset_index()
            data_train = data_train.reset_index()
            data_val = data_val.reset_index()
            data_test = data_test.reset_index()

            G = preprocess_dataset.create_graph_gcn(dataset, data, data_train)
            val_G = preprocess_dataset.create_graph_gcn(
                dataset, data, data_val)
            test_G = preprocess_dataset.create_graph_gcn(
                dataset, data, data_test)

            nodes_len = len(G.nodes)
            node_ids_to_index = {}
            for i, node_id in enumerate(G.nodes):
                node_ids_to_index[node_id] = i

            train_A = nx.adjacency_matrix(G)
            val_A = nx.adjacency_matrix(val_G)
            test_A = nx.adjacency_matrix(test_G)

            train_labels = torch.FloatTensor(
                data_train['weights'].values).cuda()
            val_labels = torch.FloatTensor(data_val['weights'].values).cuda()
            test_labels = torch.FloatTensor(data_test['weights'].values).cuda()

            train_A = sparse_mx_to_torch_sparse_tensor(train_A).cuda()
            val_A = sparse_mx_to_torch_sparse_tensor(val_A).cuda()
            test_A = sparse_mx_to_torch_sparse_tensor(test_A).cuda()

            G = sg.from_networkx(G)
            rw = BiasedRandomWalk(G)
            weighted_walks = rw.run(
                nodes=G.nodes(),  # root nodes
                length=args.length,  # maximum length of a random walk
                n=args.n_size,  # number of random walks per root node
                p=args.p,  # Defines the (unnormalised) probability, 1/p, of returning to the source node
                q=args.q,  # Defines the (unnormalised) probability, 1/q, of moving away from the source node
                weighted=True,  # for weighted random walks
                seed=42,  # random seed fixed for reproducibility
            )
            print("Number of random walks: {}".format(len(weighted_walks)))
            weighted_model = Word2Vec(weighted_walks,
                                      vector_size=args.vector_size,
                                      window=5,
                                      min_count=0,
                                      sg=1,
                                      workers=4)
            weights = torch.FloatTensor(weighted_model.wv.vectors).cuda()

            ########################################

            train_n1 = torch.tensor(data_train['A'].values).cuda()
            train_n2 = torch.tensor(data_train['B'].values).cuda()

            train_n1_indices = torch.ones(train_n1.shape[0])
            for i, value in enumerate(train_n1):
                train_n1_indices[i] = node_ids_to_index[value.item()]
            train_n1_indices = train_n1_indices.cuda().long()

            train_n2_indices = torch.ones(train_n1.shape[0])
            for i, value in enumerate(train_n2):
                train_n2_indices[i] = node_ids_to_index[value.item()]
            train_n2_indices = train_n2_indices.cuda().long()

            ########################################

            val_n1 = torch.tensor(data_val['A'].values).cuda()
            val_n2 = torch.tensor(data_val['B'].values).cuda()

            val_n1_indices = torch.ones(val_n1.shape[0])
            for i, value in enumerate(val_n1):
                val_n1_indices[i] = node_ids_to_index[value.item()]
            val_n1_indices = val_n1_indices.cuda().long()

            val_n2_indices = torch.ones(val_n1.shape[0])
            for i, value in enumerate(val_n2):
                val_n2_indices[i] = node_ids_to_index[value.item()]
            val_n2_indices = val_n2_indices.cuda().long()

            ########################################

            test_n1 = torch.tensor(data_test['A'].values).cuda()
            test_n2 = torch.tensor(data_test['B'].values).cuda()

            test_n1_indices = torch.ones(test_n1.shape[0])
            for i, value in enumerate(test_n1):
                test_n1_indices[i] = node_ids_to_index[value.item()]
            test_n1_indices = test_n1_indices.cuda().long()

            test_n2_indices = torch.ones(test_n1.shape[0])
            for i, value in enumerate(test_n2):
                test_n2_indices[i] = node_ids_to_index[value.item()]
            test_n2_indices = test_n2_indices.cuda().long()

            ########################################

            model = Node2Vec(weights, 0.5)
            optimizer = optim.Adam(model.parameters(), lr=args.lr)

            model.train()
            model = model.to(args.device)

            # train
            for epoch in range(args.epochs):
                t = time.time()
                model.train()
                optimizer.zero_grad()

                output = model(train_n1_indices, train_n2_indices)

                loss_train = F.mse_loss(output, train_labels)
                loss_train.backward()
                optimizer.step()

                # validation
                model.eval()
                output = model(val_n1_indices, val_n2_indices)
                loss_val = F.mse_loss(output, val_labels)

                if args.verbose:
                    print('Epoch: {:04d}'.format(epoch + 1),
                          'loss_train: {:.4f}'.format(loss_train.item()),
                          'loss_val: {:.4f}'.format(loss_val.item()),
                          'time: {:.4f}s'.format(time.time() - t))

            # test
            model.eval()
            with torch.no_grad():
                output = model(test_n1_indices, test_n2_indices)

                loss_test = F.mse_loss(torch.flatten(output), test_labels)
                pcc_test = pearson_correlation(test_labels, output)
                mae_test = F.l1_loss(output, test_labels)
                print("Test set results:",
                      "loss= {:.10f}".format(loss_test.item()),
                      "pcc= {:.10f}".format(pcc_test),
                      "mae= {:.10f}".format(mae_test.item()))

                total_mse[exp_number] = loss_test
                total_pcc[exp_number] = pcc_test
                total_mae[exp_number] = mae_test

        # results
        mse_datasets[dataset] = np.mean(total_mse)
        std_datasets[dataset] = np.std(total_mse)
        total_mse = np.zeros(args.exp_number)

        pcc_datasets[dataset] = np.mean(total_pcc[~np.isnan(total_pcc)])
        pcc_std_datasets[dataset] = np.std(total_pcc[~np.isnan(total_pcc)])
        total_pcc = np.zeros(args.exp_number)

        mae_datasets[dataset] = np.mean(total_mae)
        mae_std_datasets[dataset] = np.std(total_mae)
        total_mae = np.zeros(args.exp_number)

    for dataset in datasets:
        print("MSE %s: {:,f}".format(mse_datasets[dataset]) % dataset)
        print("MSE_STD %s: {:,f}".format(std_datasets[dataset]) % dataset)

        print("PCC %s: {:,f}".format(pcc_datasets[dataset]) % dataset)
        print("PCC_STD %s: {:,f}".format(pcc_std_datasets[dataset]) % dataset)

        print("MAE %s: {:,f}".format(mae_datasets[dataset]) % dataset)
        print("MAE_STD %s: {:,f}".format(mae_std_datasets[dataset]) % dataset)

    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

    exit()
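Example 14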
completed_file_path = scratch_folder + "/" + use_model_type + "_" + uni_name + ".csv"

# load path of the university path
load_path = file_folder + "/" + uni_name + ".graphml"
# save path of the embedded data
save_path = project_folder + "/" + use_model_type + "_" + uni_name + ".csv"

G_graphml = nx.read_graphml(load_path)
# Get the node features as a dataframe; these will then be added to the
# StellarGraph. This seems to work better than adding them directly.
# nodefeatures = pd.DataFrame.from_dict(dict(G_graphml.nodes(data=True)), orient='index')
# print(nodefeatures)
# Convert the networkx graph to a Stellargraph
G = StellarGraph.from_networkx(G_graphml)

rw = BiasedRandomWalk(G)

walks = rw.run(
    nodes=list(G.nodes()),  # root nodes
    length=30,  # maximum length of a random walk
    n=100,  # number of random walks per root node
    p=0.5,  # Defines the (unnormalised) probability, 1/p, of returning to the source node
    q=2.0,  # Defines the (unnormalised) probability, 1/q, of moving away from the source node
)
print("Number of random walks: {}".format(len(walks)))

str_walks = [[str(n) for n in walk] for walk in walks]
model = Word2Vec(str_walks,
                 size=dims,
                 window=10,
                 min_count=0)