Ejemplo n.º 1
0
def initiEmbeddings():
    """Load LINE node embeddings from disk, training and caching them first if absent.

    If ``basepath + 'line_embeddings.emb'`` does not exist, trains a
    second-order LINE model (128 dims, 150 epochs) on the GML graph at
    ``basepath + 'network.gml'`` and caches the embedding matrix via
    ``np.savetxt``; otherwise the cached file is reused.

    Returns:
        numpy.ndarray: embedding matrix, one row per node (row order follows
        the iteration order of the embeddings dict returned by the model).

    NOTE(review): relies on module-level ``basepath``, ``os``, ``nx``, ``np``
    and ``LINE`` being in scope — confirm against the rest of the file.
    """
    cache_path = basepath + 'line_embeddings.emb'
    if not os.path.exists(cache_path):
        G = nx.read_gml(basepath + "network.gml", label=None)
        model = LINE(G, embedding_size=128, order='second')
        model.train(batch_size=1024, epochs=150, verbose=2)
        embeddings = model.get_embeddings()
        # Flatten the {node: vector} dict into a matrix and cache it.
        np.savetxt(cache_path, np.array([embeddings[key] for key in embeddings]))

    return np.loadtxt(cache_path)
Ejemplo n.º 2
0
    def get_embeddings(self, inst, th=1):
        """Embed the nodes of *inst*'s thresholded distance graph.

        Builds a weighted digraph from ``inst.dist_mat`` (via
        ``self._compose_edge_list``) and trains the model selected by
        ``self._embedding``; unknown names fall back to ``self._normalise(inst)``.

        Returns:
            np.ndarray of shape (inst.n, dim) with one embedding row per node.
        """
        graph = nx.parse_edgelist(self._compose_edge_list(inst.dist_mat, th),
                                  create_using=nx.DiGraph(), nodetype=None,
                                  data=[('weight', float)])
        method = self._embedding
        if method == 'deepwalk':
            model = DeepWalk(graph, walk_length=10, num_walks=80, workers=1)
            model.train(window_size=5, iter=3)
        elif method == 'node2vec':
            model = Node2Vec(graph, walk_length=10, num_walks=80, p=0.25, q=4, workers=1)
            model.train(window_size=5, iter=3)
        elif method == 'line':
            # order can be 'first', 'second' or 'all'
            model = LINE(graph, embedding_size=128, order='second')
            model.train(batch_size=1024, epochs=50, verbose=2)
        elif method == 'sdne':
            model = SDNE(graph, hidden_size=[256, 128])
            model.train(batch_size=3000, epochs=40, verbose=2)
        elif method == 'struc2vec':
            model = Struc2Vec(graph, 10, 80, workers=4, verbose=40)
            model.train(window_size=5, iter=3)
        else:
            return self._normalise(inst)

        # Node ids are stringified integers 0..n-1 in the trained model.
        vectors = model.get_embeddings()
        return np.array([vectors[str(node)] for node in range(inst.n)])
def embedding_trainer(G,
                      embedder,
                      epochs=250,
                      seed=1234,
                      learning_rate=0.05,
                      embedding_dim=96,
                      batch_size=1024,
                      walk_length=30,
                      num_walks=200,
                      window=10,
                      p=1.0,
                      q=1.0,
                      workers=1,
                      temp_folder=None):
    """Train node embeddings for graph *G* with the chosen *embedder*.

    Args:
        G: networkx graph to embed.
        embedder: one of ``'node2vec'``, ``'line'`` or ``'rolx'``.
        (remaining parameters are forwarded to the underlying model)

    Returns:
        Mapping of node id -> embedding vector.

    Raises:
        ValueError: if *embedder* is not a supported name (previously this
            fell through and crashed with NameError on ``return embeddings``).
    """
    if embedder == 'node2vec':
        # BUG FIX: os.mkdir fails for nested paths; makedirs is tolerant.
        if not os.path.isdir(temp_folder):
            os.makedirs(temp_folder, exist_ok=True)
        model = Node2Vec(G,
                         dimensions=embedding_dim,
                         walk_length=walk_length,
                         num_walks=num_walks,
                         p=p,
                         q=q,
                         weight_key='weight',
                         # BUG FIX: was hard-coded to 5, silently ignoring the
                         # `workers` parameter.
                         workers=workers,
                         temp_folder=temp_folder)
        model = model.fit(window=window,
                          min_count=1,
                          seed=seed,
                          alpha=learning_rate,
                          batch_words=4)
        # Round-trip through a temp word2vec file, then clean it up.
        model.wv.save_word2vec_format('./temp_embeddings_file.emb')
        embeddings = node2vec_embedder('temp_embeddings_file.emb')
        os.remove('./temp_embeddings_file.emb')
    elif embedder == 'line':
        model = LINE(G, embedding_size=embedding_dim, order='second')
        model.train(batch_size=batch_size, epochs=epochs, verbose=2)
        embeddings = model.get_embeddings()
    elif embedder == 'rolx':
        # BUG FIX: the whole path expression was quoted as one literal dict
        # key; build the path from args['directory'] instead.
        # NOTE(review): `args` is a module-level object here — confirm it is a
        # dict-like with a 'directory' entry.
        with open(args['directory'] + '/embeddings/rolx_embedding.json') as fp:
            embeddings = json.load(fp)
    else:
        raise ValueError(f"unknown embedder: {embedder!r}")
    return embeddings
Ejemplo n.º 4
0
    # NOTE(review): fragment — the enclosing function's signature is outside
    # this view; X, Y and embeddings presumably come from its parameters.
    # Collect the embedding vector of every node listed in X.
    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    # Project the high-dimensional embeddings to 2-D with t-SNE for plotting.
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    # Group node indices by label (Y[i][0]) so each class gets one colour.
    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    # One scatter call per class, then a legend keyed by label.
    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    # Directed wiki graph; third edge-list column is an integer weight.
    G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])

    # Second-order LINE, 128 dims; a single epoch — presumably a smoke run.
    model = LINE(G, embedding_size=128, order='second')
    model.train(batch_size=1024, epochs=1, verbose=1)
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
Ejemplo n.º 5
0
    # NOTE(review): fragment — emb_list, X, Y and embeddings are defined or
    # passed in outside the visible span.
    # Collect the embedding vector of every node listed in X.
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    # Reduce to 2-D with t-SNE for visualisation.
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    # Bucket node indices by label (Y[i][0]) for per-class colouring.
    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    # Directed wiki graph; third edge-list column is an integer weight.
    G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(),
                         nodetype=None,
                         data=[('weight', int)])

    # order='all' concatenates first- and second-order vectors (2x128 dims).
    model = LINE(G, embedding_size=128, order='all')
    model.train(batch_size=1024, epochs=10, verbose=2)
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
Ejemplo n.º 6
0
    # NOTE(review): fragment — the enclosing function/scope providing `G` and
    # `config` is outside this view.
    # Commented-out model selection originally driven by config.model_name:
    # if config.model_name.lower() == 'deepwalk':
    #     model = DeepWalk(G, walk_length=config.walk_length, num_walks=config.num_walks, workers=config.workers)
    # elif config.model_name.lower() == 'line':
    #     model = LINE(G, embedding_size=config.embedding_size, order='second')
    # elif config.model_name.lower() == 'struc2vec':
    #     model = Struc2Vec(G, 10, 80, workers=4, verbose=40, )
    # elif config.model_name.lower() == 'sdne':
    #     model = SDNE(G, hidden_size=[256, 128], )
    # elif config.model_name.lower() == 'node2vec':
    #     model = Node2Vec(G, walk_length=config.walk_length, num_walks=config.num_walks,p=0.25, q=4, workers=config.workers)
    # else:
    #     model = DeepWalk(G, walk_length=config.walk_length, num_walks=config.num_walks, workers=config.workers)
    #     print('wrong model name in the config file; falling back to the default DeepWalk model')

    # Live path: always train LINE with hyperparameters taken from config.
    model = LINE(G,
                 embedding_size=config.embedding_size,
                 order=config.line_order,
                 negative_ratio=config.negative_ratio)
    model.train(batch_size=config.line_batch_size, epochs=50, verbose=2)

    # embeddings = model.get_embeddings()
    # evaluate_embeddings(embeddings)
    # plot_embeddings(embeddings)

    # entity_dict = {}
    # f = open('/Users/admin/Desktop/GraphEmbedding-deeplearning/data/XunYiWenYao/寻医问药category.txt','r',encoding='utf-8')
    # for i in f.readlines():
    #     entity_dict[i.strip().split(' ')[0]] = i.strip().split(' ')[1]
    #
    # max_similarity = 0
    # max_similarity_tuple = []
    #
    def embedding_feature(self, full_data, key='uid', target='task_id', embedding_size=16, epoch=10, window_size=5,
                          mode='LINE', suffix='cnt', order='second', graph='day'):
        """Build graph-embedding features for *key*/*target* interactions.

        Trains (or loads a cached) LINE/DeepWalk model over the edge-list graph
        at ``./yangzhe/feature/graph/{target}_{suffix}_{graph}_graph.csv`` and
        returns two DataFrames:

        * per-``key`` features: the mean embedding of the targets in that
          key's interaction history (zeros when none of them were embedded);
        * per-``target`` table: one column per embedding dimension.

        NOTE(review): assumes `full_data` is a DataFrame containing `key` and
        `target` columns, and that target node ids are castable to int —
        confirm against callers.
        """
        # Use GraphEmbedding (LINE/DeepWalk) to produce global embeddings,
        # cached on disk so repeated calls don't retrain.
        model_path = './yangzhe/model/n2v/{}_{}_{}_{}_{}_{}.pkl'.format(mode, suffix, key, target, graph,
                                                                        embedding_size)

        if not os.path.exists(model_path):
            G = nx.read_edgelist('./yangzhe/feature/graph/{}_{}_{}_graph.csv'.format(target, suffix, graph),
                                 create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
            # Reset TF graph state so repeated trainings don't accumulate.
            tf.keras.backend.clear_session()
            if mode == 'LINE':
                model = LINE(graph=G, embedding_size=embedding_size, order=order)
                model.train(batch_size=64, epochs=epoch, verbose=1)
            else:
                model = DeepWalk(G, walk_length=10, num_walks=80, workers=1)
                model.train(embed_size=embedding_size, window_size=window_size, workers=5)
            with open(model_path, 'wb') as f:
                pickle.dump(model.get_embeddings(), f)

        # With order='all', LINE concatenates first- and second-order vectors,
        # so the effective width doubles.
        if order == 'all':
            embedding_size = embedding_size * 2

        # Some targets never get an embedding; they are simply absent from
        # this dict, so the embedding table has no rows for them.
        with open(model_path, 'rb') as f:
            embedding_dict = pickle.load(f)

        embedding = pd.DataFrame()
        embedding[target] = embedding_dict.keys()
        embedding['embedding'] = [embedding_dict[i] for i in embedding[target].values]
        embedding[target] = embedding[target].astype(int)

        # Per-key list of the targets that key interacted with.
        sentences = full_data[[key, target]].groupby([key])[target].agg(list)

        # A key's embedding is the mean of the embeddings of the targets in
        # its history; only targets present in embedding[target] contribute.
        task_id_have_embedding = set(embedding[target])
        # LabelEncoder maps target ids to row indices of emb_matrix.
        lbl = LabelEncoder()
        lbl.fit(embedding[target])
        emb_matrix = np.array([i for i in embedding['embedding'].values])
        emb_mean = []
        for idx_list in sentences.values.tolist():
            need_key = [x for x in idx_list if x in task_id_have_embedding]
            if len(need_key) == 0:
                # No embedded target in this key's history: zero vector.
                mean = np.zeros((embedding_size,))
            else:
                index = lbl.transform(need_key)
                mean = np.mean(emb_matrix[index], axis=0)
            emb_mean.append(mean)
        emb_feature = np.asarray(emb_mean)
        mean_col = ['{}_{}(MainKEY)_{}_MEAN_Window{}_{}'.format(mode, key, target, window_size, i) for i in
                    range(embedding_size)]
        emb_feature = pd.DataFrame(emb_feature, columns=mean_col)
        emb_feature[key] = sentences.index

        # Also materialise the per-target embedding matrix as named columns.
        embeddings = np.concatenate(embedding['embedding'].values).reshape(-1, embedding_size)
        embeddings = pd.DataFrame(embeddings,
                                  columns=["{}_{}_{}(MainKEY)_Window{}_{}".format(mode, key, target, window_size, i)
                                           for i in range(embedding_size)])
        embedding[embeddings.columns] = embeddings
        del embedding['embedding']

        return emb_feature.reset_index(drop=True), embedding.reset_index(drop=True)
Ejemplo n.º 8
0
                        # NOTE(review): fragment — the parser setup and the
                        # opening parser.add_argument(... are outside this view.
                        default="cora",
                        help='dataset for training')
    parser.add_argument('--tr_frac', type=float, default=0.2, help='tr_frac')

    args = parser.parse_args()
    # load_data
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, labels = load_data(
        args.dataset)
    # Dump "node_id label" pairs for the one-hot label matrix.
    idx = np.argmax(labels, axis=1)
    f = open('labels.txt', 'w')
    for i in range(labels.shape[0]):
        f.write(f'{i} {idx[i]} \n')

    # Round-trip the adjacency matrix through an edge list on disk so LINE
    # consumes the same weighted-edge-list format as the other examples.
    adj = adj.toarray()
    G = nx.from_numpy_array(adj, create_using=nx.DiGraph())
    nx.write_edgelist(G, "test.edgelist", data=[('weight', int)])
    # G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
    #                      create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
    G = nx.read_edgelist('test.edgelist',
                         create_using=nx.DiGraph(),
                         nodetype=None,
                         data=[('weight', int)])

    model = LINE(G, embedding_size=128,
                 order='second')  # 'first' 'second' 'all'
    model.train(batch_size=1024, epochs=50, verbose=2)
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings, args.tr_frac)
    plot_embeddings(embeddings)
Ejemplo n.º 9
0
                    # NOTE(review): fragment — the enclosing loops over
                    # d_set[i] / mdl[j] and the opening `model = SDNE(` call
                    # start outside this view.
                    G,
                    hidden_size=[256, 128],
                )
                model.train(batch_size=3000, epochs=40, verbose=2)
            elif (mdl[j] == "Struc2Vec"):
                model = Struc2Vec(
                    G,
                    10,
                    80,
                    workers=4,
                    verbose=40,
                )
                model.train(window_size=5, iter=3)
            elif (mdl[j] == "DeepWalk"):
                model = DeepWalk(G, walk_length=10, num_walks=80, workers=1)
                model.train(window_size=5, iter=3)
            elif (mdl[j] == "LINE"):
                model = LINE(G, embedding_size=128, order='second')
                model.train(batch_size=1024, epochs=50, verbose=2)

            # NOTE(review): `model` may be stale/unbound if mdl[j] matches no
            # branch — the original relies on the list only containing the
            # names above.
            embeddings = model.get_embeddings()

            evaluate_embeddings(embeddings,
                                X,
                                Y,
                                strat_X_train,
                                strat_y_train,
                                strat_X_test,
                                strat_y_test,
                                log_key=mdl[j] + ": " + d_set[i])
            plot_embeddings(embeddings, d_set[i])
Ejemplo n.º 10
0
"""Obtain node vectors with the LINE model."""
import sys

# Make the local citation-recommendation project importable.
sys.path.append(r'D:\pythonplaces\citation-recommendation')
from ge import LINE
import networkx as nx

if __name__ == "__main__":
    # Undirected citation graph; third edge-list column is an integer weight.
    G = nx.read_edgelist('../data/aan/new_aan_normal_edges.txt',
                         create_using=nx.Graph(),
                         nodetype=None,
                         data=[('weight', int)])

    # First-order LINE embeddings, 128 dims.
    model = LINE(G, embedding_size=128, order='first')
    model.train(batch_size=1024, epochs=15, verbose=2)
    # NOTE(review): stock ge.LINE.get_embeddings takes no emb_filepath
    # argument — this presumably is a locally modified fork; verify.
    embeddings = model.get_embeddings(emb_filepath="../embs/aan_line.json")