Code example #1: node2vec embeddings with StellarGraph's BiasedRandomWalk and gensim Word2Vec
import multiprocessing

import pandas as pd
from gensim.models import Word2Vec
from stellargraph import StellarGraph
from stellargraph.data import BiasedRandomWalk


def node2vec_embedding(graph):

    p = 1.0
    q = 1.0
    dimensions = 128
    num_walks = 10
    walk_length = 80
    window_size = 10
    num_iter = 1
    workers = multiprocessing.cpu_count()

    # assumes a StellarGraph release whose constructor accepts a NetworkX
    # graph; newer releases use StellarGraph.from_networkx instead
    graph = StellarGraph(graph)
    rw = BiasedRandomWalk(graph)
    walks = rw.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)
    # gensim expects string tokens, so cast node IDs before training
    walks = [[str(n) for n in walk] for walk in walks]

    print(f"Number of random walks: {len(walks)}")

    # gensim 3.x API: size= and iter= were renamed to vector_size= and epochs= in gensim 4
    model = Word2Vec(walks,
                     size=dimensions,
                     window=window_size,
                     min_count=0,
                     sg=1,
                     workers=workers,
                     iter=num_iter)

    # wv.vectors rows follow the vocabulary order (wv.index2word), not the
    # graph.nodes() order, so index the frame by the vocabulary
    features = pd.DataFrame(data=model.wv.vectors, index=model.wv.index2word)

    return features
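
A minimal usage sketch for the function above. This is an illustration, not part of the original snippet: it assumes a StellarGraph version whose constructor accepts NetworkX graphs, and uses the karate-club graph purely as a stand-in input.

import networkx as nx

# any NetworkX graph with hashable node IDs will do for a smoke test
g = nx.karate_club_graph()
features = node2vec_embedding(g)
print(features.shape)  # (number of nodes, 128)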
Code example #2: metapath2vec-style embeddings with cached random walks and manual learning-rate decay
import multiprocessing
import os

import numpy as np
from gensim.models import Word2Vec
from stellargraph import StellarGraph
from tqdm import tqdm


def execute_model(
    dict_node_df,
    df_edges
):
    # configuration is passed in through module-level globals
    global dataset
    global model_save_path
    global model_use_data_DIR
    global metapaths
    global emb_dim
    global num_walks_per_node
    global walk_length
    print('Metapaths ', metapaths)

    emb_fpath = os.path.join(
        model_save_path,
        'n2v_{}_{}_{}.npy'.format(
            emb_dim, num_walks_per_node, walk_length)
    )
    graph_obj = StellarGraph(
        dict_node_df,
        df_edges
    )

    walks_save_file = "n2v_random_walks_{}_{}.npy".format(walk_length, num_walks_per_node)
    walks_save_file = os.path.join(model_use_data_DIR, walks_save_file)
    try:
        # walks were saved as a (possibly ragged) object array, hence allow_pickle
        walks_np_arr = np.load(walks_save_file, allow_pickle=True)
        walks = [list(_) for _ in walks_np_arr]
    except (FileNotFoundError, OSError):
        # generate_random_walks is a project-specific helper defined elsewhere
        walks = generate_random_walks(
            graph_obj, num_walks_per_node, walk_length, metapaths)
        walks_np_arr = np.array(walks, dtype=object)
        np.save(walks_save_file, walks_np_arr)

    print("Number of random walks: {}".format(len(walks)))
    str_walks = [[str(n) for n in walk] for walk in walks]

    if not os.path.exists(emb_fpath):

        word2vec_params = {
            'sg': 0,  # 0: CBOW, 1: skip-gram
            "size": emb_dim,  # gensim 3.x name; vector_size= in gensim 4+
            "alpha": 0.5,
            "min_alpha": 0.001,
            'window': 5,
            'min_count': 0,
            "workers": multiprocessing.cpu_count(),
            "negative": 1,
            "hs": 0,  # 0: negative sampling, 1: hierarchical softmax
            'compute_loss': True,
            # no corpus is passed to the constructor, so 'iter' is effectively
            # unused; training happens one epoch at a time in the loop below
            'iter': 10,
            'cbow_mean': 1,
        }

        iters = 20
        mp2v_model = Word2Vec(**word2vec_params)
        mp2v_model.build_vocab(str_walks)
        losses = []
        # decay the learning rate linearly from alpha to min_alpha by hand,
        # one train() call (= one epoch) per step
        learning_rate = 0.5
        step_size = (0.5 - 0.001) / iters

        for i in tqdm(range(iters)):
            trained_word_count, raw_word_count = mp2v_model.train(
                str_walks,
                compute_loss=True,
                start_alpha=learning_rate,
                end_alpha=learning_rate,
                total_examples=mp2v_model.corpus_count,
                epochs=1
            )
            loss = mp2v_model.get_latest_training_loss()
            losses.append(loss)
            print('>> epoch {} loss: {} lr: {}'.format(i, loss, learning_rate))
            learning_rate -= step_size

        # ======== Save node weights ============ #
        # assumes node IDs are the consecutive integers 0..N-1, so the row
        # order of the saved array matches the node ID
        node_embeddings = []
        for i in range(len(graph_obj.nodes())):
            vec = mp2v_model.wv[str(i)]
            node_embeddings.append(vec)

        node_embeddings = np.array(node_embeddings)
        np.save(emb_fpath, node_embeddings)
    else:
        node_embeddings = np.load(emb_fpath)

    return node_embeddings
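
A hedged setup sketch for calling execute_model. The function reads its configuration from module-level globals and relies on the project-specific generate_random_walks helper (not shown), so every name and value below is an illustrative assumption rather than the project's real configuration.

import pandas as pd

# toy heterogeneous graph: node IDs must be unique across types and, for the
# saving loop above (wv[str(i)]), consecutive integers starting at 0
dict_node_df = {
    "user": pd.DataFrame(index=[0, 1]),
    "item": pd.DataFrame(index=[2, 3]),
}
df_edges = pd.DataFrame({"source": [0, 1], "target": [2, 3]})

dataset = "toy"                       # assumed values for the snippet's globals
model_save_path = "./saved_models"
model_use_data_DIR = "./model_data"
metapaths = [["user", "item", "user"]]
emb_dim = 128
num_walks_per_node = 10
walk_length = 40

os.makedirs(model_save_path, exist_ok=True)
os.makedirs(model_use_data_DIR, exist_ok=True)
node_embeddings = execute_model(dict_node_df, df_edges)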
Code example #3: CTDNE temporal node embeddings with StellarGraph's TemporalRandomWalk
File: ctdne.py  Project: Star607/Dynamic-Graph
import os

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from stellargraph import StellarGraph
from stellargraph.data import TemporalRandomWalk


def ctdne(dataset="ia-contact",
          output="/nfs/zty/Graph/Dynamic-Graph/ctdne_output"):
    # if os.path.exists("{}/{}.emb".format(output, dataset)):
    #     return
    print("Begin CTDNE {} embeddings.".format(dataset))
    train_dir = "/nfs/zty/Graph/train_data"
    df = pd.read_csv("{}/{}.csv".format(train_dir, dataset))
    edges_df = df[df["label"] == 1].copy()
    print("original edges: {} filtering edges: {}".format(
        len(df), len(edges_df)))
    edges_df = edges_df[["from_idx", "to_idx", "timestamp"]].copy()
    edges_df.columns = ["source", "target", "time"]
    edges_df["source"] = edges_df["source"].astype(str)
    edges_df["target"] = edges_df["target"].astype(str)
    # nodes = list(set(edges_df["source"]).union(set(edges_df["target"])))
    graph = StellarGraph(
        edges=edges_df,
        edge_weight_column="time",
    )

    num_walks_per_node = 10
    walk_length = 80
    context_window_size = 10
    # total number of context windows to sample across all temporal walks
    num_cw = len(graph.nodes()) * num_walks_per_node *\
        (walk_length - context_window_size + 1)

    print("Begin CTDNE TemporalRandomWalk.")
    temporal_rw = TemporalRandomWalk(graph)
    temporal_walks = temporal_rw.run(
        num_cw=num_cw,
        cw_size=context_window_size,
        max_walk_length=walk_length,
        walk_bias="exponential",
    )
    print("End CTDNE TemporalRandomWalk.")
    embedding_size = 128
    # gensim 3.x API (size=, iter=)
    temporal_model = Word2Vec(temporal_walks,
                              size=embedding_size,
                              window=context_window_size,
                              min_count=0,
                              sg=1,
                              workers=16,
                              iter=1)

    print("Done CTDNE {} embeddings.".format(dataset))

    # if not os.path.exists("{}/{}.emb".format(output, dataset)):
    wv = temporal_model.wv
    vecs = np.array([wv[u] for u in wv.vocab])
    df = pd.DataFrame(vecs, index=wv.vocab)
    walks_nodes = set([s for l in temporal_walks for s in l])
    embed_nodes = set(wv.vocab.keys())
    nodes = set(edges_df["source"]).union(set(edges_df["target"]))
    print("{} nodes do not appear in the walks.".format(len(nodes - walks_nodes)))
    print("{} nodes do not appear in the embedding vocabulary.".format(
        len(nodes - embed_nodes)))

    emb_path = "{}/{}.emb".format(output, dataset)
    if not os.path.exists(emb_path):
        # pre-create the file so it can be made world-writable before writing
        open(emb_path, "w").close()
        os.chmod(emb_path, 0o777)
    df.to_csv(emb_path, header=False)
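
For completeness, the saved .emb file is a headerless CSV whose first column is the node ID and whose remaining 128 columns are the embedding dimensions. A sketch of reading it back, reusing the default paths above:

emb = pd.read_csv("/nfs/zty/Graph/Dynamic-Graph/ctdne_output/ia-contact.emb",
                  header=None, index_col=0)
print(emb.shape)  # (number of embedded nodes, 128)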
Code example #4: node2vec node features for a snapshot adjacency, returned as a sparse tensor
        # assumes module-level imports: numpy as np, pandas as pd,
        # scipy.sparse as sp, torch, StellarGraph, BiasedRandomWalk,
        # and gensim's Word2Vec
        def get_node_feats(adj):  # input is cur_adj

            edgelist = adj['idx'].cpu().data.numpy()
            source = edgelist[:, 0]
            target = edgelist[:, 1]
            weight = np.ones(len(source))

            G = pd.DataFrame({
                'source': source,
                'target': target,
                'weight': weight
            })
            G = StellarGraph(edges=G)
            rw = BiasedRandomWalk(G)

            weighted_walks = rw.run(
                nodes=list(G.nodes()),  # root nodes
                length=2,  # maximum length of a random walk
                n=5,  # number of random walks per root node
                p=1,  # defines the (unnormalised) probability, 1/p, of returning to the source node
                q=0.5,  # defines the (unnormalised) probability, 1/q, of moving away from the source node
                weighted=True,  # for weighted random walks
                seed=42,  # random seed fixed for reproducibility
            )

            str_walks = [[str(n) for n in walk] for walk in weighted_walks]
            weighted_model = Word2Vec(str_walks,
                                      size=self.feats_per_node,
                                      window=5,
                                      min_count=0,
                                      sg=1,
                                      workers=1,
                                      iter=1)

            # Retrieve node embeddings and corresponding subjects
            node_ids = weighted_model.wv.index2word  # list of node IDs (strings)
            # cast the node IDs back to integers
            node_ids = [int(node_id) for node_id in node_ids]

            weighted_node_embeddings = (
                weighted_model.wv.vectors
            )  # numpy.ndarray of size number of nodes times embeddings dimensionality

            # create dic
            dic = dict(zip(node_ids, weighted_node_embeddings.tolist()))
            # ascending order
            dic = dict(sorted(dic.items()))
            # create matrix
            adj_mat = sp.lil_matrix((self.data.num_nodes, self.feats_per_node))

            for row_idx in node_ids:
                adj_mat[row_idx, :] = dic[row_idx]

            adj_mat = adj_mat.tocoo()
            coords = np.vstack((adj_mat.row, adj_mat.col)).transpose()
            values = adj_mat.data
            row = list(coords[:, 0])
            col = list(coords[:, 1])
            indices = torch.LongTensor([row, col])
            tensor_size = torch.Size(
                [self.data.num_nodes, self.feats_per_node])
            # torch.sparse.FloatTensor is the legacy constructor; newer torch
            # versions use torch.sparse_coo_tensor
            degs_out = torch.sparse.FloatTensor(indices,
                                                torch.FloatTensor(values),
                                                tensor_size)

            hot_1 = {
                'idx': degs_out._indices().t(),
                'vals': degs_out._values()
            }

            return hot_1
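
A sketch of consuming the returned {'idx', 'vals'} pair from inside the surrounding class. The name cur_adj and the densification step are illustrative assumptions that follow the same sparse convention the method itself uses:

# inside the same class, after get_node_feats has been defined
feats = get_node_feats(cur_adj)  # cur_adj: {'idx': LongTensor of edges, ...}
dense = torch.sparse.FloatTensor(
    feats['idx'].t(),                 # back to [2, nnz] index layout
    feats['vals'],
    torch.Size([self.data.num_nodes, self.feats_per_node]),
).to_dense()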