Example #1
def main(graph_fname, node_vec_fname, role_vec_fname, options):
    '''\
    %prog [options] <graph_fname> <node_vec_fname> <role_vec_fname>

    graph_fname: the graph file
        It can be a file containing one edge per line (e.g., res/karate_club_edges.txt)
        or a pickled graph file.
    node_vec_fname: the output file for nodes' vectors
    role_vec_fname: the output file for roles' vectors
    '''

    print 'Load a HIN...'
    g = loader.load_a_HIN(graph_fname)

    model = MP2Vec(size=options.dim,
                   window=options.window,
                   neg=options.neg,
                   num_processes=options.num_processes,
                   alpha=options.alpha,
                   is_no_circle_path=False,
                   )

    model.train(g,
                options.walk_num,
                options.walk_length
                )
    model.dump_to_file(node_vec_fname, type_='node')
    model.dump_to_file(role_vec_fname, type_='role')
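
A minimal driver for this entry point might look like the following sketch. The option fields (dim, window, neg, num_processes, alpha, walk_num, walk_length) are inferred from how `options` is used above; the paths and values are placeholders, not part of the original script.

from argparse import Namespace

# Hypothetical option values; the real script presumably builds these with
# optparse (note the %prog usage string in the docstring).
options = Namespace(dim=128, window=4, neg=5, num_processes=4, alpha=0.025,
                    walk_num=10, walk_length=80)
main('res/karate_club_edges.txt', 'node_vec.txt', 'role_vec.txt', options)
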
Example #2
def main(graph_fname, node_vec_fname, path_vec_fname, options):
    '''\
    %prog [options] <graph_fname> <node_vec_fname> <path_vec_fname>

    graph_fname: the graph file
        It can be a file containing one edge per line (e.g., res/karate_club_edges.txt)
        or a pickled graph file.
    node_vec_fname: the output file for nodes' vectors
    path_vec_fname: the output file for meta-paths' vectors
    '''

    print 'Load a HIN...'
    g = loader.load_a_HIN(graph_fname)

    print 'Generate random walks...'
    # tempfile.mkstemp() is bypassed here: a fixed filename keeps the walks
    # inspectable between runs.
    # _, tmp_walk_fname = tempfile.mkstemp()
    tmp_walk_fname = "tmp_walk_fname.txt"
    print tmp_walk_fname
    with open(tmp_walk_fname, 'w') as f:
        for walk in g.random_walks(options.walk_num, options.walk_length):
            f.write('%s\n' % ' '.join(map(str, walk)))

    # Fixed filenames again in place of tempfile.mkstemp(), for debugging.
    # _, tmp_node_vec_fname = tempfile.mkstemp()
    # _, tmp_path_vec_fname = tempfile.mkstemp()
    tmp_node_vec_fname = "tmp_node_vec_fname.txt"
    tmp_path_vec_fname = "tmp_path_vec_fname.txt"
    print tmp_node_vec_fname
    print tmp_path_vec_fname

    model = MP2Vec(
        size=options.dim,
        window=options.window,
        neg=options.neg,
        num_processes=options.num_processes,
        #                  iterations=i,
        alpha=options.alpha,
        same_w=True,
        normed=False,
        is_no_circle_path=False,
    )

    neighbors = None
    if options.correct_neg:
        # Precompute each node's k-hop neighborhood so that true neighbors
        # are not drawn as negative samples (a BFS sketch follows this example).
        for id_ in g.graph:
            g._get_k_hop_neighborhood(id_, options.window)
        neighbors = g.k_hop_neighbors[options.window]

    model.train(
        g,
        tmp_walk_fname,
        g.class_nodes,
        k_hop_neighbors=neighbors,
    )
    model.dump_to_file(tmp_node_vec_fname, type_='node')
    model.dump_to_file(tmp_path_vec_fname, type_='path')

    print 'Dump vectors...'
    output_node2vec(g, tmp_node_vec_fname, node_vec_fname)
    output_path2vec(g, tmp_path_vec_fname, path_vec_fname)
    return 0
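
When `options.correct_neg` is set, this example precomputes k-hop neighborhoods so that a node's true neighbors are not drawn as negative samples. A minimal sketch of what `_get_k_hop_neighborhood` presumably computes, assuming `graph` maps each node id to its adjacent ids:

def k_hop_neighborhood(graph, start, k):
    # Breadth-first expansion out to k hops from start.
    frontier = set([start])
    visited = set([start])
    for _ in range(k):
        frontier = set(nb for id_ in frontier for nb in graph[id_]) - visited
        visited |= frontier
    return visited - set([start])
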
Example #3
def main(graph_fname, node_vec_fname, path_vec_fname, options):
    '''\
    %prog [options] <graph_fname> <node_vec_fname> <path_vec_fname>

    graph_fname: the graph file
        It can be a file containing one edge per line (e.g., res/karate_club_edges.txt)
        or a pickled graph file.
    node_vec_fname: the output file for nodes' vectors
    path_vec_fname: the output file for meta-paths' vectors
    '''

    print 'Load a HIN...'
    g = loader.load_a_HIN(graph_fname)

    print 'Generate random walks...'
    _, tmp_walk_fname = tempfile.mkstemp()
    with open(tmp_walk_fname, 'w') as f:
        for walk in g.random_walks(options.walk_num, options.walk_length):
            f.write('%s\n' % ' '.join(map(str, walk)))

    with open("unDiAPwalk.txt",'w') as f:
        for walk in g.random_walks(options.walk_num, options.walk_length):
            f.write('%s\n' % ' '.join(map(str, walk)))

    _, tmp_node_vec_fname = tempfile.mkstemp()
    _, tmp_path_vec_fname = tempfile.mkstemp()
    print 'Learn representations...'
    statement = ("model_c/bin/hin2vec.exe -size %d -train %s -alpha %f "
                 "-output %s -output_mp %s -window %d -negative %d "
                 "-threads %d -no_circle %d -sigmoid_reg %d "
                 "" % (options.dim,
                       tmp_walk_fname,
                       options.alpha,
                       tmp_node_vec_fname,
                       tmp_path_vec_fname,
                       options.window,
                       options.neg,
                       options.num_processes,
                       1-(options.allow_circle * 1),
                       options.sigmoid_reg * 1))
    print statement
    os.system(statement)

    print 'Dump vectors...'
    output_node2vec(g, tmp_node_vec_fname, node_vec_fname)
    output_path2vec(g, tmp_path_vec_fname, path_vec_fname)
    return 0
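
The command string above is handed to the shell via os.system. An equivalent, quoting-safe construction with subprocess and an argument list (same binary and flags as above) would be:

import subprocess

cmd = ['model_c/bin/hin2vec.exe',
       '-size', str(options.dim),
       '-train', tmp_walk_fname,
       '-alpha', str(options.alpha),
       '-output', tmp_node_vec_fname,
       '-output_mp', tmp_path_vec_fname,
       '-window', str(options.window),
       '-negative', str(options.neg),
       '-threads', str(options.num_processes),
       '-no_circle', str(1 - int(options.allow_circle)),
       '-sigmoid_reg', str(int(options.sigmoid_reg))]
subprocess.call(cmd)
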
Example #4
def main(graph_fname, vec_fname, output_fname):
    '''\
    %prog [options] <graph_fname> <vec_fname> <output_fname>
    '''
    g = loader.load_a_HIN(graph_fname)
    # Invert the name -> id mapping so rows can be written out by node name.
    id2name = dict([(id_, name) for name, id_ in g.node2id.items()])

    with open(vec_fname) as f, open(output_fname, 'w') as fo:
        first = True
        for line in f:
            # Copy the header line through unchanged.
            if first:
                first = False
                fo.write(line)
                continue
            # Replace the leading node id with its name; keep the vector as-is.
            tokens = line.split(' ', 1)
            name = id2name[int(tokens[0])]
            fo.write('%s %s' % (name, tokens[1]))

    return 0
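
The rewritten file keeps the same layout the loop preserves: a header line first, then one 'name v1 v2 ...' row per node. A small sketch for loading such a file back into a dict of numpy vectors (hypothetical helper; numpy assumed):

import numpy as np

def load_vectors(fname):
    vectors = {}
    with open(fname) as f:
        next(f)  # skip the header line
        for line in f:
            tokens = line.rstrip().split(' ')
            vectors[tokens[0]] = np.array(tokens[1:], dtype=np.float32)
    return vectors
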
Example #5
def main(graph_fname, node_vec_fname, role_vec_fname, graphlet_vec_fname,
         options):
    '''\
    %prog [options] <graph_fname> <node_vec_fname> <role_vec_fname> <graphlet_vec_fname>

    graph_fname: the graph file
        It can be a file containing one edge per line (e.g., res/karate_club_edges.txt)
        or a pickled graph file.
    node_vec_fname: the output file for nodes' vectors
    role_vec_fname: the output file for roles' vectors
    graphlet_vec_fname: the output file for graphlets' vectors
    '''

    print 'Load a HIN...'
    g = loader.load_a_HIN(graph_fname)
    print len(g.graph)
    for c in g.class_nodes:
        print c, len(g.class_nodes[c])
    print g.edge_class2id
    g.create_node_choices()

    # Build a node id -> class lookup from the class -> ids mapping.
    id2classes = {}
    for class_, ids in g.class_nodes.items():
        for id_ in ids:
            id2classes[id_] = class_

    print 'Preprocess graphlet matcher...'
    to_freq = False
    if options.freq_fname is None or not os.path.exists(options.freq_fname):
        to_freq = True
        id2freq = dict(zip(g.graph.keys(), [0] * len(g.graph)))
    else:
        print 'load id2freq', options.freq_fname
        id2freq = cPickle.load(open(options.freq_fname))

    to_matcher = False
    if options.matcher_fname is None or not os.path.exists(
            options.matcher_fname):
        to_matcher = True
        matcher = graphlet.GraphletMatcher()
    else:
        print 'load matcher ', options.matcher_fname
        matcher = cPickle.load(open(options.matcher_fname))

    if to_freq or to_matcher:
        # A single pass of random walks both registers graphlets with the
        # matcher and counts node visit frequencies.
        for walk in g.random_walks(1, 100):
            if to_matcher:
                for id2degrees in graphlet.complete_and_count_degrees(
                        g, options.window, walk):
                    matcher.get_graphlet(id2classes, id2degrees)
            if to_freq:
                for id_ in walk:
                    id2freq[id_] += 1

        if to_freq and options.freq_fname is not None:
            print 'dump id2freq', options.freq_fname
            cPickle.dump(id2freq, open(options.freq_fname, 'w'))
        if to_matcher and options.matcher_fname is not None:
            print 'dump matcher', options.matcher_fname
            cPickle.dump(matcher, open(options.matcher_fname, 'w'))

    print matcher.graphlets
    print 'graphlet:', len(matcher.graphlets)
    print 'roles:', matcher.rid_offset
    tmp_freq_fname = '/tmp/ms_freq.txt'
    with open(tmp_freq_fname, 'w') as f:
        # One frequency per line, ordered by node id.
        for id_, freq in sorted(id2freq.items()):
            f.write('%d\n' % freq)

    print 'Generate training set'
    _, tmp_node_vec_fname = tempfile.mkstemp()
    _, tmp_role_vec_fname = tempfile.mkstemp()

    tmp_data_fname = options.training_fname
    to_generate = False
    if options.training_fname is None:
        _, tmp_data_fname = tempfile.mkstemp()
        print tmp_data_fname
        to_generate = True
    elif not os.path.exists(options.training_fname):
        to_generate = True

    if to_generate:
        graphlet.generate_training_set_to_file(
            g,
            matcher,
            id2classes,
            options.walk_length,
            options.window,
            tmp_data_fname,
            num_processes=options.num_processes)

    print 'Learn representations...'
    model = 'ms2vec'
    if options.mode:
        model = 'ms2vec_c2t'
    statement = ("model_c/bin/%s -size %d -node_count %d "
                 "-role_count %d -graphlet_count %d -role_ratio %f "
                 "-train %s -freq %s -alpha %f "
                 "-output %s -output_role %s -output_graphlet %s "
                 "-window %d -negative %d "
                 "-threads %d -sigmoid_reg %d -iteration %d -equal %d"
                 "" %
                 (model, options.dim, max(g.graph) + 1, matcher.rid_offset,
                  len(matcher.graphlets), options.role_ratio, tmp_data_fname,
                  tmp_freq_fname, options.alpha, tmp_node_vec_fname,
                  tmp_role_vec_fname, graphlet_vec_fname, options.window,
                  options.neg, options.num_processes, options.sigmoid_reg * 1,
                  options.iter, options.equal * 1))
    print statement
    os.system(statement)

    output_node2vec(g, tmp_node_vec_fname, node_vec_fname)
    output_role2vec(matcher, tmp_role_vec_fname, role_vec_fname)
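
Both the frequency table and the graphlet matcher above follow the same compute-or-load caching pattern. Distilled into a hypothetical helper (cPickle as in the example; a sketch only, since the example builds both artifacts in one shared pass over the walks):

import os
import cPickle

def load_or_build(fname, build):
    # Reuse a pickled artifact when available; otherwise build and cache it.
    if fname is not None and os.path.exists(fname):
        return cPickle.load(open(fname))
    obj = build()
    if fname is not None:
        cPickle.dump(obj, open(fname, 'w'))
    return obj
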
Example #6
def main(graph_fname, node_vec_fname, path_vec_fname, options):
    #DONE: Get init parameters as arguments to the program
    #DONE: num nodes: get from the length of the graph
    #DONE: num rel: getting it from len(node_vocab)/len(path_vocab) didn't work; just use the node count for now

    #TODO(NEWEST) get working with batches and fix the shape of the objective function to match them; pick a good cross-entropy function, sigmoid?
    #TODO(NEWEST) get p_ into the same format as MNIST labels ([0,1], [1,0]); change p_ shape to [None, 2]

    print('Load a HIN...')
    g = loader.load_a_HIN(graph_fname)

    NUM_NODES = g.node_count()
    NUM_REL = g.node_count()  # stand-in for the relation count (see the DONE note above)

    print('Generate random walks...')
    _, tmp_walk_fname = tempfile.mkstemp()
    print('DEBUG: Saving random walks to file...')
    print(tmp_walk_fname)
    # Write each walk both to the temp file and to a persistent debug copy.
    with open(tmp_walk_fname, 'w') as f, open('randWalks.txt', 'w') as randomWalkFile:
        for walk in g.random_walks(options.walk_num, options.walk_length):
            f.write('%s\n' % ' '.join(map(str, walk)))
            randomWalkFile.write('%s\n' % ' '.join(map(str, walk)))


    _, tmp_node_vec_fname = tempfile.mkstemp()
    _, tmp_path_vec_fname = tempfile.mkstemp()

    # node_vocab = mp.NodeVocab.load_from_file(tmp_walk_fname)
    # path_vocab = mp.PathVocab.load_from_file(tmp_walk_fname,options.window)

    model = MP2Vec(size=options.dim,
                   window=options.window,
                   neg=options.neg,
                   num_processes=options.num_processes,
                   #                  iterations=i,
                   alpha=options.alpha,
                   same_w=True,
                   normed=False,
                   is_no_circle_path=False,
                   )

    neighbors = None
    if options.correct_neg:
        # Precompute each node's k-hop neighborhood so that true neighbors
        # are not drawn as negative samples.
        for id_ in g.graph:
            g._get_k_hop_neighborhood(id_, options.window)
        neighbors = g.k_hop_neighbors[options.window]

    model.train(g,
                tmp_walk_fname,
                g.class_nodes,
                k_hop_neighbors=neighbors,
                )

    #TODO: store neg/pos data without using file
    #DONE: format neg data for tensorflow


    # Load training samples from file into lists
    x_data = []
    y_data = []
    r_data = []
    p_data = []

    #TODO: Fix the randomly failing appends; there is a problem with how the training data is generated to file
    with open("pos_data.txt", "r") as pos:
        with open("neg_data.txt", "r") as neg:
            for l in pos:
                temp_pos = l.strip().split(",")
                # Read a positive training triple (x, y, r) with label 1.
                # split() never yields None, so guard on empty fields instead.
                if len(temp_pos) == 3 and all(temp_pos):
                    x_data.append(int(temp_pos[0]))
                    y_data.append(int(temp_pos[1]))
                    r_data.append(int(temp_pos[2]))
                    p_data.append(1)

                    # Read the matching negative samples: same x and r,
                    # corrupted y, label 0.
                    temp_neg = neg.readline().strip().split(",")
                    for j in temp_neg:
                        if j:
                            x_data.append(int(temp_pos[0]))
                            y_data.append(int(j))
                            r_data.append(int(temp_pos[2]))
                            p_data.append(0)

    # Debugging: dump the assembled (x, y, r, label) rows for inspection.
    with open("read_data.txt", "w+") as test:
        for i in range(len(x_data)):
            test.write(str(x_data[i]) + "," + str(y_data[i]) + "," + str(r_data[i]) + "," + str(p_data[i]) + "\n")



    #for i in range(0, len(x_data)):
    #    print (x_data[i], y_data[i], r_data[i])


    # Convert the data to numpy arrays
    x_np = np.array(x_data)
    y_np = np.array(y_data)
    r_np = np.array(r_data)
    p_np = np.array(p_data)

    # Convert x, y, r into one-hot numpy arrays. These feed the commented-out
    # matmul formulation below; the active graph looks up embeddings by id.
    x_onehot = np.zeros((len(x_data), NUM_NODES))
    x_onehot[np.arange(len(x_data)), x_np] = 1

    y_onehot = np.zeros((len(y_data), NUM_NODES))
    y_onehot[np.arange(len(y_data)), y_np] = 1

    r_onehot = np.zeros((len(r_data), NUM_REL))
    r_onehot[np.arange(len(r_data)), r_np] = 1



    #DONE: Prepare positive training data to get form (x,y,r,L(x,y,r))
    #DONE: Prepare negative samples (x'',y'',r'') for  each positive entry
    #DONE: Generate bathes from training set
    #DONE: Get g_ val from training data

    # Inputs: the label and the (x, y, r) ids for each training triple.
    # The one-hot placeholders are kept for the commented-out matmul variant.
    #x = tf.placeholder(tf.float32, [None, NUM_NODES])
    #y = tf.placeholder(tf.float32, [None, NUM_NODES])
    #r = tf.placeholder(tf.float32, [None, NUM_REL])
    p_ = tf.placeholder(tf.float32, [None])

    x_id = tf.placeholder(tf.int32, [None])
    y_id = tf.placeholder(tf.int32, [None])
    r_id = tf.placeholder(tf.int32, [None])



    #DONE: instantiate learned weights with rand uniform dist (mp2vec_s.py)
    #TODO: Case where Wx = Wy
    # Learned weights


    Wx = tf.Variable(np.random.uniform(low=-0.5/options.dim,
                                       high=0.5/options.dim,
                                       size=(NUM_NODES, options.dim)).astype(np.float32))
    Wy = tf.Variable(np.random.uniform(low=-0.5/options.dim,
                                       high=0.5/options.dim,
                                       size=(NUM_NODES, options.dim)).astype(np.float32))
    # Relation weights start non-negative; they pass through a sigmoid below.
    Wr = tf.Variable(np.random.uniform(low=0.0,
                                       high=1.0/options.dim,
                                       size=(NUM_REL, options.dim)).astype(np.float32))

    # Aggregate Vectors
    #Wx_x = tf.matmul(x, Wx)
    #Wy_y = tf.matmul(y, Wy)

    Wx_x_id = tf.nn.embedding_lookup(Wx, x_id)
    Wy_y_id = tf.nn.embedding_lookup(Wy, y_id)  # context-side lookup uses Wy (see the Wx = Wy TODO above)
    Wr_r_id = tf.round(tf.nn.sigmoid(tf.nn.embedding_lookup(Wr, r_id)))

    #TODO: Make binary step default and option to use sigmoid
    # Regularization of Wr (Binary step by rounding sigmoid)
    #Wr_r = tf.round(tf.nn.sigmoid(tf.matmul(r, Wr)))

    # Regularization of Wr (just sigmoid)
    #Wr_r = tf.nn.sigmoid(tf.multiply(tWr, r))

    #DONE: Find a way to do element-wise mult with diff size tensors (Wr_r is diff size), make one-hot
    # Hidden Layer (element-wise mult)
    #h = tf.multiply(tf.multiply(Wx_x, Wy_y), Wr_r)
    h = tf.multiply(tf.multiply(Wx_x_id, Wy_y_id), Wr_r_id)


    # Output (sigmoid of element summation), reduce the sum along dim and keep [None] dim
    #p = tf.nn.sigmoid(tf.reduce_sum(h, 1))
    p = tf.reduce_sum(h, 1, keep_dims=False)


    #TODO: Implement obj version for binary step
    # Objective Function
    #objective = tf.cond(tf.equal(p_,  1), lambda: p, lambda: 1-p)
    objective = tf.nn.sigmoid_cross_entropy_with_logits(labels=p_, logits=p)
    #objective = tf.nn.softmax_cross_entropy_with_logits(labels=p_, logits=p)
    #objective = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=p_, logits=p)
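    # For reference: p is the raw logit, the sum over dimensions of
    # Wx[x] * Wy[y] * reg(Wr[r]), and p_ holds the label L(x, y, r)
    # (1 for an observed triple, 0 for a negative sample), so the sigmoid
    # cross-entropy above evaluates, per triple,
    #     -L * log(sigmoid(p)) - (1 - L) * log(1 - sigmoid(p))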



    #DONE: Take negative of obj or not?
    # Train step: the cross-entropy above is already a loss, so minimize it directly
    train_step = tf.train.GradientDescentOptimizer(options.alpha).minimize(objective)


    #DONE: calculate accuracy correctly
    # Accuracy
    correct_prediction = tf.equal(tf.round(tf.sigmoid(p)), p_)


    u = tf.cast(correct_prediction,tf.float32)
    accuracy = tf.reduce_mean(u)

    # Training
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()



    #TODO: integrate the -i iterations option
    #TODO: train on all the data sequentially, or on random data points?

    step = 50  # mini-batch size
    for j in xrange(0, 1):  # a single epoch for now; see the iterations TODO above
        print("epoch " + str(j) + " -----------------")
        for i in xrange(0, len(x_data), step):
            #x_val = x_onehot[i:i+step]
            #y_val = y_onehot[i:i+step]
            #r_val = r_onehot[i:i+step]
            p__val = p_np[i:i+step]

            x_id_val = x_data[i:i+step]
            y_id_val = y_data[i:i+step]
            r_id_val = r_data[i:i+step]

            #print(str(x_id_val) + str(y_id_val) + str(r_id_val) + str(p__val))
            #raw_input()

            '''
            Wx_row = sess.run(Wx_x_id, feed_dict={p_: p__val,
                                                  x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})

            Wy_row = sess.run(Wy_y_id, feed_dict={p_: p__val,
                                                  x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})

            Wr_row = sess.run(Wr_r_id, feed_dict={p_: p__val,
                                                  x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})

            p_res = sess.run(p, feed_dict={p_: p__val,
                                                 x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})

            print(Wx_row)
            print(Wy_row)
            print(Wr_row)
            print(p_res)
            raw_input()
            '''

            sess.run(train_step, feed_dict={p_: p__val,
                                            x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})

            train_accuracy = sess.run(accuracy, feed_dict={p_: p__val,
                                            x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})

            print('step %d, training accuracy %f' % (i, train_accuracy))


            # Snapshot the node embeddings (Wx is a variable, so no feed is needed).
            Wxx = sess.run(Wx)
            np.savetxt('node_vec_temp.txt', Wxx, delimiter=' ')


    # Output vectors to file: a '<node count> <dim>' header line, then one
    # 'name v1 v2 ...' row per node.
    lines = []
    lines.append(str(NUM_NODES) + " " + str(options.dim) + "\n")

    tmpLines = []
    with open("node_vec_temp.txt", "r") as f:
        for l in f:
            tmpLines.append(l)

    # Map ids back to node names for the final output.
    id2name = dict([(id_, name) for name, id_ in g.node2id.items()])
    for i in xrange(0, NUM_NODES):
        lines.append(str(id2name[i]) + " " + tmpLines[i])

    with open("node_vec.txt", "w+") as out:
        for l in lines:
            out.write(l)
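
Incidentally, the embedding lookups in Example #6 are the sparse form of the commented-out one-hot matmuls; a quick numpy check of that equivalence (hypothetical sizes):

import numpy as np

W = np.random.rand(5, 3)           # 5 "nodes", 3-dimensional vectors
ids = np.array([0, 2, 4])
onehot = np.zeros((3, 5))
onehot[np.arange(3), ids] = 1
assert np.allclose(onehot.dot(W), W[ids])  # matmul(one-hot, W) == lookup(W, ids)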