Example #1
def main(args):
    t1 = time.time()
    g = Graph()
    print("Reading...")

    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)

    g.read_node_label(args.label_file)
    g.read_node_features(args.feature_file)
    model = tadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
    t2 = time.time()
    print(t2 - t1)
    print("Saving embeddings...")
    model.save_embeddings(args.output)
    vectors = model.vectors
    X, Y = read_node_label(args.label_file)
    print("Training classifier using {:.2f}% nodes...".format(args.clf_ratio *
                                                              100))
    clf = Classifier(vectors=vectors, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, args.clf_ratio)
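These examples feed read_node_label a whitespace-separated label file: one node per line, node id first, followed by one or more labels. A minimal sketch of such a reader, assuming the OpenNE-style format (the actual function may differ per repository):

def read_node_label(filename):
    # Assumed format: "<node_id> <label> [<label> ...]" per line.
    X, Y = [], []
    with open(filename) as fin:
        for line in fin:
            vec = line.strip().split()
            if not vec:
                continue  # skip blank lines
            X.append(vec[0])   # node id
            Y.append(vec[1:])  # one or more labels (multi-label)
    return X, Y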
Example #2
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)
Example #3
def node_classification(session, bs, seqne, sequences, seq_len, node_n, samp_idx, label, ratio):
    enc_sum_dict = {}
    node_cnt = {}
    s_idx, e_idx = 0, bs
    while e_idx < len(sequences):
        batch_enc = session.run(seqne.encoder_output,
                                feed_dict={seqne.input_seqs: sequences[s_idx:e_idx],
                                           seqne.dropout: 0,
                                           seqne.keep_prob: 0})
        enc_sum_dict, node_cnt = reduce_seq2seq_hidden_add(enc_sum_dict, node_cnt, sequences,
                                                           batch_enc.astype('float32'), seq_len, s_idx)

        s_idx, e_idx = e_idx, e_idx + bs

    if s_idx < len(sequences):
        batch_enc = session.run(seqne.encoder_output,
                                feed_dict={seqne.input_seqs: sequences[s_idx:],
                                           seqne.dropout: 0,
                                           seqne.keep_prob: 0})
        enc_sum_dict, node_cnt = reduce_seq2seq_hidden_add(enc_sum_dict, node_cnt, sequences,
                                                           batch_enc.astype('float32'), seq_len, s_idx)

    node_enc_mean = reduce_seq2seq_hidden_avg(sum_dict=enc_sum_dict, count_dict=node_cnt, node_num=node_n)
    lr = Classifier(vectors=node_enc_mean, clf=LogisticRegression())
    f1_micro, f1_macro = lr.split_train_evaluate(samp_idx, label, ratio)
    return f1_micro
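Example #3 depends on two helpers that are not shown. A hypothetical sketch of what they plausibly do, assuming each walk in sequences is a list of node ids and batch_enc has shape (batch, seq_len, dim): accumulate every timestep's encoder output into its node's running sum, then average per node.

import numpy as np

def reduce_seq2seq_hidden_add(sum_dict, count_dict, sequences, batch_enc, seq_len, s_idx):
    # Add each timestep's encoding to the running sum of the node that
    # occupies that position in the walk.
    for i in range(batch_enc.shape[0]):
        for j in range(seq_len):
            node = sequences[s_idx + i][j]
            sum_dict[node] = sum_dict.get(node, 0.0) + batch_enc[i, j]
            count_dict[node] = count_dict.get(node, 0) + 1
    return sum_dict, count_dict

def reduce_seq2seq_hidden_avg(sum_dict, count_dict, node_num):
    # Average the accumulated encodings into one vector per node id 0..node_num-1.
    return np.stack([sum_dict[n] / count_dict[n] for n in range(node_num)])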
Example #4
    def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10, negative_ratio=5, order=3, label_file=None, clf_ratio=0.5, auto_stop=True):
        self.rep_size = rep_size
        self.order = order
        self.best_result = 0
        self.vectors = {}
        if order == 3:
            self.model1 = _LINE(graph, rep_size // 2, batch_size, negative_ratio, order=1)
            self.model2 = _LINE(graph, rep_size // 2, batch_size, negative_ratio, order=2)
            for i in range(epoch):
                self.model1.train_one_epoch()
                self.model2.train_one_epoch()
                if label_file:
                    self.get_embeddings()
                    X, Y = read_node_label(label_file)
                    # print "Training classifier using {:.2f}% nodes...".format(clf_ratio*100)
                    clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
                    result = clf.split_train_evaluate(X, Y, clf_ratio)

                    if result['micro'] < self.best_result and auto_stop:
                        self.vectors = self.last_vectors
                        print('Auto stop!')
                        return
                    elif result['micro'] > self.best_result:
                        self.best_result = result['micro']

        else:
            self.model = _LINE(graph, rep_size, batch_size, negative_ratio, order=self.order)
            for i in range(epoch):
                self.model.train_one_epoch()
                if label_file:
                    self.get_embeddings()
                    X, Y = read_node_label(label_file)
                    # print "Training classifier using {:.2f}% nodes...".format(clf_ratio*100)
                    clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
                    result = clf.split_train_evaluate(X, Y, clf_ratio)

                    if result['micro'] < self.best_result and auto_stop:
                        self.vectors = self.last_vectors
                        print('Auto stop!')
                        return
                    elif result['micro'] > self.best_result:
                        self.best_result = result['micro']

        self.get_embeddings()
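Every example funnels into Classifier.split_train_evaluate. Judging from the calls above, it shuffles the labeled nodes, trains on a train_ratio fraction, and returns a dict of F1 scores keyed 'micro'/'macro'. A simplified, self-contained sketch of that behavior (a hypothetical SimpleClassifier, not the actual OpenNE class, which additionally ranks top-k label probabilities for multi-label nodes):

import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

class SimpleClassifier(object):
    def __init__(self, vectors, clf):
        self.vectors = vectors                 # {node_id: embedding}
        self.clf = OneVsRestClassifier(clf)    # one binary classifier per label
        self.binarizer = MultiLabelBinarizer()

    def split_train_evaluate(self, X, Y, train_ratio, seed=0):
        # Shuffle once, split by ratio, fit, and score micro/macro F1.
        rng = np.random.RandomState(seed)
        idx = rng.permutation(len(X))
        split = int(train_ratio * len(X))
        train, test = idx[:split], idx[split:]
        feats = np.asarray([self.vectors[x] for x in X])
        labels = self.binarizer.fit_transform(Y)
        self.clf.fit(feats[train], labels[train])
        pred = self.clf.predict(feats[test])
        return {avg: f1_score(labels[test], pred, average=avg)
                for avg in ('micro', 'macro')}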
Example #5
def classify(vectors, args):
    if not os.path.isfile(args.classifydir + '_labels.txt'):
        return defaultdict(lambda: 0)
    X, Y = read_node_label(args.classifydir + '_labels.txt')
    print("Training classifier using {:.2f}% nodes...".format(
        args.train_percent * 100))
    clf = Classifier(vectors=vectors,
                     clf=LogisticRegression(solver="lbfgs", max_iter=4000))
    scores = clf.split_train_evaluate(X, Y, args.train_percent)
    return scores
Example #6
def main(args):
    node_embeddings = load_embeddings(args.embedding_file)
    if args.label_file:
        labels = read_node_label(args.label_file)

    if args.modularity:
        print("Modularity")
        modularity(args, node_embeddings, args.min_k, args.max_k)

    if args.reconstruction:
        print("Graph reconstruction")
        reconstr(args, node_embeddings, args.k_nbrs)

    if args.clustering:
        print("Clustering")
        clustering(node_embeddings, labels, args.exp_times)

    if args.link_prediction:
        print("Link prediction")
        link_prediction(args.input, node_embeddings)

    if args.classification:
        X = list(labels.keys())
        Y = list(labels.values())
        print("Node classification")
        clf_ratio_list = args.clf_ratio.strip().split(',')
        result_list = {}
        train_ratio = np.asarray(range(1, 10)) * .1
        for clf_ratio in train_ratio:  # alternatively, use clf_ratio_list parsed from args.clf_ratio
            result_per_test = []
            for ti in range(args.exp_times):
                clf = Classifier(vectors=node_embeddings, clf=LogisticRegression())
                myresult = clf.split_train_evaluate(X, Y, float(clf_ratio))
                result_per_test.append(myresult)
            result_list[clf_ratio] = result_per_test

        print('-------------------')
        for clf_ratio in train_ratio:
            print('Train percent:', clf_ratio)
            results = result_list[clf_ratio]
            for index, result in enumerate(results):
                print('Shuffle #%d:   ' % (index + 1), result)

            avg_score = defaultdict(float)
            for score_dict in results:
                for metric, score in score_dict.items():
                    avg_score[metric] += score
            for metric in avg_score:
                avg_score[metric] /= len(results)
            print('Average score:', dict(avg_score))
            print('-------------------')
Example #7
def main(args):
    print("xnetmf", "begin...")
    t1 = time.time()
    print("Reading...")
    nx_graph = nx.read_edgelist(args.input, nodetype=int, comments="%")
    adj_matrix = nx.adjacency_matrix(nx_graph).todense()
    print(adj_matrix)
    g = Graph(adj_matrix)
    rep_method = RepMethod(
        max_layer=2
    )  # Learn representations with xNetMF.  Can adjust parameters (e.g. as in REGAL)
    representations = src.xnetmf.get_representations(g, rep_method)
    print(representations)
    print(representations.shape)
    print("TAWD", "begin...")
    print("Reading...")
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)

    g.read_node_label(args.label_file)
    g.read_node_features(args.feature_file)
    model = xtadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
    t2 = time.time()
    print(t2 - t1)
    print("Saving embeddings...")
    model.save_embeddings(args.output)
    vectors = model.vectors
    X, Y = read_node_label(args.label_file)
    print("Training classifier using {:.2f}% nodes...".format(args.clf_ratio *
                                                              100))
    clf = Classifier(vectors=vectors, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, args.clf_ratio)
Example #8
def node_classification(embeddings, label_path, name, size):

    X, Y = read_node_label(embeddings, label_path)

    all_ratio = []

    with open('results/%s_classification_%d.txt' % (name, size), 'w') as f_c:
        for tr_frac in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
            print("Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
            clf = Classifier(embeddings=embeddings, clf=LogisticRegression(), name=name)
            results = clf.split_train_evaluate(X, Y, tr_frac)

            avg = 'macro'
            f_c.write(name + ' train percentage: ' + str(tr_frac) +
                      ' F1-' + avg + ' ' + str('%0.5f' % results[avg]))
            all_ratio.append(results[avg])
            f_c.write('\n')
Example #9
# results_file = open('0127_best_result_wiki.txt', 'w')
# results_file.write(" %s\n" %  best_result)
#
# np.savetxt('best_result_wiki.out', best_result, delimiter='\t')  # X is an array

clf_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
node_size = len(vectors)
train_x = np.array([vectors[x] for x in X])

reshaped_train_x = train_x.reshape((train_x.shape[0], args.kstep, node_size))
train_x = low_encoder.predict(reshaped_train_x)

for clf_one in clf_list:
    print "Training classifier using {:.2f}% nodes...".format(clf_one * 100)
    clf = Classifier(vectors=train_x, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, clf_one)

# y_lable = np.array(Y)
# print(train_x.shape)
# print(type(train_x))
# print(type(y_lable))
#
# np.savetxt('train_x.out', train_x, delimiter='\t')   # X is an array
# np.savetxt('train_Y.out', y_lable.astype(int), delimiter='\t')   # X is an array
#
#
# print(results)
# results_file = open('citeseer_result_0.3.txt', 'w')
# for item in results:
#     results_file.write("%s\n" % item)
Example #10
"""
-------------------------------------------------
   Author :       haxu
   date:          2019/4/3
-------------------------------------------------
   Change Activity:
                   2019/4/3:
-------------------------------------------------
"""
__author__ = 'haxu'

import networkx as nx
from deepwalk import DeepWalk
from classify import read_node_label, Classifier
from sklearn.linear_model import LogisticRegression

if __name__ == '__main__':
    G = nx.read_edgelist('../data/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(),
                         nodetype=None,
                         data=[('weight', int)])

    model = DeepWalk(G, walk_length=30, num_walks=80, workers=4)

    model.train(window_size=5, iter=3)
    embeddings = model.get_embeddings()

    X, Y = read_node_label('../data/wiki_labels.txt')

    tr_frac = 0.8
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)
Example #11
    for b in tqdm(range(batches_per_epoch)):
        # sen_input = next(sen_gen)

        neg_input = next(neg_gen)
        sen_input2 = next(sen_gen2)

        sen_input = sen_input2[:, 0:-1].reshape(
            (args.batch_size, args.kstep, node_size))
        neg_input = neg_input.reshape(
            (args.batch_size, args.neg_size, args.kstep, node_size))

        batch_loss, batch_max_margin_loss = model_auto.train_on_batch(
            [sen_input, neg_input], np.ones((args.batch_size, 1)))

        loss += batch_loss / batches_per_epoch
        max_margin_loss += batch_max_margin_loss / batches_per_epoch

    tr_time = time.time() - t0

###############################################################################################################################
## classification evaluation

node_size = len(vectors)
train_x = np.array([vectors[x] for x in X])
reshaped_train_x = train_x.reshape((train_x.shape[0], args.kstep, node_size))
train_x = low_encoder.predict(reshaped_train_x)

print "Training classifier using {:.2f}% nodes...".format(args.clf_ratio * 100)
clf = Classifier(vectors=train_x, clf=LogisticRegression())
clf.split_train_evaluate(X, Y, args.clf_ratio)
Example #12
def main(args):
    t1 = time.time()
    g = Graph()
    print("Reading...")

    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)
    # if args.method == 'node2vec':
    #     model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
    #                               num_paths=args.number_walks, dim=args.representation_size,
    #                               workers=args.workers, p=args.p, q=args.q, window=args.window_size)
    # elif args.method == 'line':
    #     if args.label_file and not args.no_auto_save:
    #         model = line.LINE(g, epoch=args.epochs, rep_size=args.representation_size, order=args.order,
    #                           label_file=args.label_file, clf_ratio=args.clf_ratio)
    #     else:
    #         model = line.LINE(g, epoch=args.epochs,
    #                           rep_size=args.representation_size, order=args.order)
    # elif args.method == 'deepWalk':
    #     model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
    #                               num_paths=args.number_walks, dim=args.representation_size,
    #                               workers=args.workers, window=args.window_size, dw=True)
    # elif args.method == 'tadw':
    #     # assert args.label_file != ''
    #     assert args.feature_file != ''
    #     g.read_node_label(args.label_file)
    #     g.read_node_features(args.feature_file)
    #     model = tadw.TADW(
    #         graph=g, dim=args.representation_size, lamb=args.lamb)
    # elif args.method == 'gcn':
    #     assert args.label_file != ''
    #     assert args.feature_file != ''
    #     g.read_node_label(args.label_file)
    #     g.read_node_features(args.feature_file)
    #     model = gcnAPI.GCN(graph=g, dropout=args.dropout,
    #                        weight_decay=args.weight_decay, hidden1=args.hidden,
    #                        epochs=args.epochs, clf_ratio=args.clf_ratio)
    # elif args.method == 'grarep':
    #     model = GraRep(graph=g, Kstep=args.kstep, dim=args.representation_size)
    # elif args.method == 'lle':
    #     model = lle.LLE(graph=g, d=args.representation_size)
    # elif args.method == 'hope':
    #     model = hope.HOPE(graph=g, d=args.representation_size)
    if args.method == 'sdne':
        encoder_layer_list = ast.literal_eval(args.encoder_list)
        model = sdne.SDNE(g,
                          encoder_layer_list=encoder_layer_list,
                          alpha=args.alpha,
                          beta=args.beta,
                          nu1=args.nu1,
                          nu2=args.nu2,
                          batch_size=args.bs,
                          epoch=args.epochs,
                          learning_rate=args.lr)
    elif args.method == 'sdne_binary_loss':
        encoder_layer_list = ast.literal_eval(args.encoder_list)
        model = sdne.SDNE(g,
                          encoder_layer_list=encoder_layer_list,
                          alpha=args.alpha,
                          beta=args.beta,
                          nu1=args.nu1,
                          nu2=args.nu2,
                          batch_size=args.bs,
                          epoch=args.epochs,
                          learning_rate=args.lr)
    elif args.method == 'sdne_meta_path':
        encoder_layer_list = ast.literal_eval(args.encoder_list)
        model = sdne.SDNE(g,
                          encoder_layer_list=encoder_layer_list,
                          alpha=args.alpha,
                          beta=args.beta,
                          nu1=args.nu1,
                          nu2=args.nu2,
                          batch_size=args.bs,
                          epoch=args.epochs,
                          learning_rate=args.lr)
    # elif args.method == 'lap':
    #     model = lap.LaplacianEigenmaps(g, rep_size=args.representation_size)
    # elif args.method == 'gf':
    #     model = gf.GraphFactorization(g, rep_size=args.representation_size,
    #                                   epoch=args.epochs, learning_rate=args.lr, weight_decay=args.weight_decay)
    t2 = time.time()
    print('Time cost: {}'.format(t2 - t1))
    if args.method != 'gcn':
        print("Saving embeddings...")
        model.save_embeddings(args.output)
    if args.label_file and args.method != 'gcn':
        vectors = model.vectors
        X, Y = read_node_label(args.label_file)
        print("Training classifier using {:.2f}% nodes...".format(
            args.clf_ratio * 100))
        clf = Classifier(vectors=vectors, clf=LogisticRegression())
        clf.split_train_evaluate(X, Y, args.clf_ratio)
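Example #12 is driven by argparse. A hypothetical invocation, assuming the flag names mirror the attribute names used above (the snippet does not confirm them):

python main.py --method sdne --graph-format edgelist \
    --input data/wiki_edgelist.txt --encoder-list "[1000, 128]" \
    --label-file data/wiki_labels.txt --clf-ratio 0.5 \
    --output wiki_sdne.embeddings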