Esempio n. 1
0
def embedding(args, datadict):
    """Return DeepWalk node embeddings for the dataset graph, cached on disk.

    The graph is round-tripped through an edgelist file so node labels become
    strings ('0', '1', ...), then embeddings are trained once and reloaded
    from ``.emb`` on subsequent runs.
    """
    edgelist_path = f'./embedding/{args.dataset}.edgelist'
    emb_path = f'./embedding/{args.dataset}_{args.emb_method}.emb'

    # Persist the graph once; later runs reuse the edgelist file.
    if not os.path.exists(edgelist_path):
        nx.write_edgelist(datadict['g'], edgelist_path, data=[('weight', int)])
    # Re-read so nodes are keyed by string ids, as the embedding dict expects.
    datadict['g'] = nx.read_edgelist(edgelist_path,
                                     create_using=nx.DiGraph(),
                                     nodetype=None,
                                     data=[('weight', int)])

    if os.path.exists(emb_path):
        print('Loading the embeddings')
        embeddings = np.loadtxt(emb_path)
        print(f'{embeddings.shape[1]}-dims Embeddings load.')
    else:
        model = DeepWalk(datadict['g'], walk_length=5, num_walks=50, workers=8)
        model.train(window_size=10, iter=10)
        dict_embeddings = model.get_embeddings()
        # One row per labelled node; the width is taken from node '0'.
        embeddings = np.zeros(
            (datadict['labels'].shape[0], dict_embeddings['0'].shape[0]))
        print('Saving the embeddings......')
        for key in tqdm(dict_embeddings):
            embeddings[int(key)] = dict_embeddings[key]
        np.savetxt(emb_path, embeddings)
        print(f'{embeddings.shape[1]}-dims Embeddings saved.')

    return embeddings
Esempio n. 2
0
    def get_embeddings(self, inst, th=1):
        """Embed the nodes of *inst*'s distance graph with the configured method.

        Builds a weighted digraph from the instance's distance matrix
        (thresholded by *th*), trains the embedding model selected by
        ``self._embedding``, and returns one vector per node in node order.
        Unknown methods fall back to ``self._normalise(inst)``.
        """
        graph = nx.parse_edgelist(self._compose_edge_list(inst.dist_mat, th), create_using=nx.DiGraph(), nodetype=None,
                                  data=[('weight', float)])
        method = self._embedding
        if method == 'deepwalk':
            model = DeepWalk(graph, walk_length=10, num_walks=80, workers=1)
            model.train(window_size=5, iter=3)
        elif method == 'node2vec':
            model = Node2Vec(graph, walk_length=10, num_walks=80, p=0.25, q=4, workers=1)
            model.train(window_size=5, iter=3)
        elif method == 'line':
            # order can be 'first', 'second' or 'all'
            model = LINE(graph, embedding_size=128, order='second')
            model.train(batch_size=1024, epochs=50, verbose=2)
        elif method == 'sdne':
            model = SDNE(graph, hidden_size=[256, 128])
            model.train(batch_size=3000, epochs=40, verbose=2)
        elif method == 'struc2vec':
            model = Struc2Vec(graph, 10, 80, workers=4, verbose=40, )
            model.train(window_size=5, iter=3)
        else:
            # No known embedding configured: normalise raw coordinates instead.
            return self._normalise(inst)

        # Embedding dict is keyed by string node ids; gather in node order.
        emb_by_node = model.get_embeddings()
        return np.array([emb_by_node[str(node)] for node in range(inst.n)])
Esempio n. 3
0
    # Collect the embedding vector for every node key listed in X.
    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    # Project the high-dimensional embeddings to 2-D for visualisation.
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    # Group node indices by their (first) label so each label gets one colour.
    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    # One scatter call per label; the legend entry is the label value.
    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    # Weighted directed graph from the wiki edgelist.
    G = nx.read_edgelist('./data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(),
                         nodetype=None,
                         data=[('weight', int)])

    model = DeepWalk(G, walk_length=10, num_walks=80, workers=1)
    model.train(window_size=5, iter=3)
    embeddings = model.get_embeddings()
    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)

    # FIX: emb_list was appended to without ever being initialised, which
    # raised NameError at runtime.
    emb_list = []
    # Convert the embedding dict to an array, assuming 1-based string node
    # ids — TODO confirm key scheme matches the edgelist's node labels.
    for i in range(1, len(embeddings) + 1):
        emb_list.append(embeddings[str(i)])
    emb_list = np.array(emb_list)

    # Reduce to 2-D coordinates with t-SNE for plotting.
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    # e.g. X = ['1', '2', '3', '4'], Y = [['4'], ['2'], ['4'], ['1']]
    X, Y = read_node_label('../data/Net300/community.dat', is_net=True)

    # Group node indices by community label for colouring.
    commu_idx = {}
    for i in range(len(X)):
        commu_idx.setdefault(Y[i][0], [])
        commu_idx[Y[i][0]].append(i)

    for c, idx in commu_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)

    plt.legend()
    plt.show()


if __name__ == "__main__":
    # Undirected, unweighted benchmark network.
    graph = nx.read_edgelist('../data/Net300/network.dat',
                             create_using=nx.Graph(),
                             nodetype=None)

    # Train DeepWalk and visualise the resulting node vectors.
    walker = DeepWalk(graph, embed_size=128, walk_length=25, num_walks=5, workers=1)
    walker.train(window_size=5, iter=5)

    plot(walker.get_embeddings())
Esempio n. 5
0
        # Append node index i under its label Y[i][0] (list created on first use).
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    # One scatter call per label so the legend shows each class.
    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/wiki/test_small.txt',
                         create_using=nx.DiGraph(),
                         nodetype=None,
                         data=[('weight', int)])

    model = DeepWalk(G, walk_length=4, num_walks=100, workers=30)
    model.train(window_size=50, iter=12)
    embeddings = model.get_embeddings()
    print(embeddings)

    # Dump embeddings as text, one "<node_id> <v0> <v1> ..." line per node.
    # ('wb' would mean writing a binary file.)
    # FIX: use a context manager so the file is closed even if a write
    # raises; the dead line counter `j` was removed.
    with open('../data/wiki/_test_vector.txt', 'w') as f:
        for node_id in embeddings:
            f.write(node_id + " ")
            for component in embeddings[node_id]:
                f.write(str(component) + " ")
            f.write('\n')

    # evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
Esempio n. 6
0
        [ge_features,
         pd.DataFrame(features_dr).add_prefix(f'ge_{name}_svd_')],
        axis=1)
    # Persist the combined (original + SVD-reduced) embedding features.
    ge_features.to_csv(
        f'../save/graph_embedding_{name}_{graph_name}_svd{svd_dim}.csv')


if __name__ == "__main__":

    # Weighted directed graph loaded from the configured path.
    G = nx.read_edgelist(graph_path,
                         create_using=nx.DiGraph(),
                         nodetype=None,
                         data=[('weight', int)])

    # DeepWalk embeddings: saved both raw and SVD-reduced.
    model = DeepWalk(G,
                     walk_length=dw_walk_length,
                     num_walks=dw_num_walks,
                     workers=NUM_WORKER)  # init model
    model.train(window_size=dw_window_size, iter=dw_iter)  # train model
    embeddings = model.get_embeddings()  # get embedding vectors

    save_embedding(embeddings, name='dw')
    save_embedding_svd(embeddings, name='dw', svd_dim=svd_dim)

    # FIX: was a redundant double assignment `model = model = Struc2Vec(...)`.
    model = Struc2Vec(G,
                      walk_length=s2v_walk_length,
                      num_walks=s2v_num_walks,
                      workers=NUM_WORKER,
                      verbose=40)  # init model
    model.train(window_size=s2v_window_size, iter=s2v_iter)  # train model
    embeddings = model.get_embeddings()  # get embedding vectors
    def embedding_feature(self, full_data, key='uid', target='task_id', embedding_size=16, epoch=10, window_size=5,
                          mode='LINE', suffix='cnt', order='second', graph='day'):
        """Build graph-embedding features for *target* and per-*key* mean features.

        Trains (or loads from a pickle cache) a LINE or DeepWalk embedding over
        the target co-occurrence graph, then:
        - averages the embeddings of each key's historical targets into one
          vector per key (zeros when none of its targets were embedded);
        - expands the raw per-target embedding dict into a DataFrame.

        Returns a tuple ``(emb_feature, embedding)`` of two DataFrames:
        per-key mean features and per-target embedding columns.
        """
        # Cache path encodes every parameter that affects the trained model.
        model_path = './yangzhe/model/n2v/{}_{}_{}_{}_{}_{}.pkl'.format(mode, suffix, key, target, graph,
                                                                        embedding_size)

        if not os.path.exists(model_path):
            # Train once and pickle the {node_id: vector} dict.
            G = nx.read_edgelist('./yangzhe/feature/graph/{}_{}_{}_graph.csv'.format(target, suffix, graph),
                                 create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
            tf.keras.backend.clear_session()
            if mode == 'LINE':
                model = LINE(graph=G, embedding_size=embedding_size, order=order)
                model.train(batch_size=64, epochs=epoch, verbose=1)
            else:
                model = DeepWalk(G, walk_length=10, num_walks=80, workers=1)
                model.train(embed_size=embedding_size, window_size=window_size, workers=5)
            with open(model_path, 'wb') as f:
                pickle.dump(model.get_embeddings(), f)

        # LINE with order='all' concatenates first- and second-order vectors,
        # so the effective width doubles.
        if order == 'all':
            embedding_size = embedding_size * 2

        # Some targets never get an embedding learned; those are absent from
        # this dict, so the embedding table has no rows for them.
        with open(model_path, 'rb') as f:
            embedding_dict = pickle.load(f)

        # One row per embedded target id (keys are strings, cast to int).
        embedding = pd.DataFrame()
        embedding[target] = embedding_dict.keys()
        embedding['embedding'] = [embedding_dict[i] for i in embedding[target].values]
        embedding[target] = embedding[target].astype(int)

        # Per-key exposure history: list of targets seen by each key.
        sentences = full_data[[key, target]].groupby([key])[target].agg(list)

        # A key's vector is the mean over its historical targets that actually
        # have embeddings; LabelEncoder maps target ids to matrix row indices.
        task_id_have_embedding = set(embedding[target])
        lbl = LabelEncoder()
        lbl.fit(embedding[target])
        emb_matrix = np.array([i for i in embedding['embedding'].values])
        emb_mean = []
        for idx_list in sentences.values.tolist():
            need_key = [x for x in idx_list if x in task_id_have_embedding]
            if len(need_key) == 0:
                # No embedded target in this key's history: zero vector.
                mean = np.zeros((embedding_size,))
            else:
                index = lbl.transform(need_key)
                mean = np.mean(emb_matrix[index], axis=0)
            emb_mean.append(mean)
        emb_feature = np.asarray(emb_mean)
        mean_col = ['{}_{}(MainKEY)_{}_MEAN_Window{}_{}'.format(mode, key, target, window_size, i) for i in
                    range(embedding_size)]
        emb_feature = pd.DataFrame(emb_feature, columns=mean_col)
        emb_feature[key] = sentences.index

        # Also store the per-target embedding matrix as named columns.
        embeddings = np.concatenate(embedding['embedding'].values).reshape(-1, embedding_size)
        embeddings = pd.DataFrame(embeddings,
                                  columns=["{}_{}_{}(MainKEY)_Window{}_{}".format(mode, key, target, window_size, i)
                                           for i in range(embedding_size)])
        embedding[embeddings.columns] = embeddings
        # Drop the raw vector column now that it is expanded into columns.
        del embedding['embedding']

        return emb_feature.reset_index(drop=True), embedding.reset_index(drop=True)
Esempio n. 8
0
                    ] not in edges_temp and edges[j][1] != edges[j][0]:
                    edges_temp.append(edges[j])
                # track the largest node id seen to determine the node count
                for z in edges[j]:
                    node_num = max(node_num, z)
            edges_list.append(edges_temp)
        # ids appear to be 0-based, so the count is max id + 1
        node_num += 1
        for edges in edges_list:
            # one directed snapshot graph per edge list, padded with all nodes
            graph = nx.DiGraph()
            graph.add_nodes_from([i for i in range(node_num)])
            graph.add_edges_from(edges)
            graphs.append(graph)

            # the large enron dataset gets fewer walks (80 vs 800) to keep
            # runtime manageable; training parameters are otherwise identical
            if 'enron_large' in basepath:
                model = DeepWalk(graph,
                                 walk_length=10,
                                 num_walks=80,
                                 workers=1)
                model.train(window_size=5, iter=3)
            else:
                model = DeepWalk(graph,
                                 walk_length=10,
                                 num_walks=800,
                                 workers=1)
                model.train(window_size=5, iter=3)
            embeddings = model.get_embeddings()
            embeddings_list.append(embeddings)
            pred_edges = []
            # dense per-node matrix; assumes 128-dim embeddings — TODO confirm
            emb_matrix = np.zeros([node_num, 128])
            for node, emb in embeddings.items():
                node = int(node)
                emb = emb.reshape((1, emb.shape[0]))