def main(graph_fname, node_vec_fname, role_vec_fname, options):
    '''\
    %prog [options] <graph_fname> <node_vec_fname> <path_vec_fname>

    graph_fname: the graph file
        It can be a file contained edges per line (e.g., res/karate_club_edges.txt)
        or a pickled graph file.
    node_vec_fname: the output file for nodes' vectors
    path_vec_fname: the output file for meta-paths' vectors
    '''
    # The text above is left untouched: it starts with %prog, so it is
    # presumably reused as the command-line usage message. Note that it calls
    # the third argument path_vec_fname while the parameter is role_vec_fname.
    #
    # Flow: load the graph, train an MP2Vec model on it, then write the
    # 'node' vectors to node_vec_fname and the 'role' vectors to
    # role_vec_fname. Nothing is returned.
    print('Load a HIN...')
    network = loader.load_a_HIN(graph_fname)

    # Hyper-parameters are taken straight from the parsed options.
    hyper_params = dict(
        size=options.dim,
        window=options.window,
        neg=options.neg,
        num_processes=options.num_processes,
        alpha=options.alpha,
        is_no_circle_path=False,
    )
    embedder = MP2Vec(**hyper_params)
    embedder.train(network, options.walk_num, options.walk_length)

    # Node vectors first, role vectors second.
    outputs = ((node_vec_fname, 'node'), (role_vec_fname, 'role'))
    for out_fname, vec_type in outputs:
        embedder.dump_to_file(out_fname, type_=vec_type)
def main(graph_fname, node_vec_fname, path_vec_fname, options):
    '''\
    %prog [options] <graph_fname> <node_vec_fname> <path_vec_fname>

    graph_fname: the graph file
        It can be a file contained edges per line (e.g., res/karate_club_edges.txt)
        or a pickled graph file.
    node_vec_fname: the output file for nodes' vectors
    path_vec_fname: the output file for meta-paths' vectors
    '''
    # The text above is kept verbatim; it starts with %prog and is presumably
    # reused as the command-line usage message.
    #
    # Flow: load the graph, write random walks to a scratch file, train MP2Vec
    # from that file, dump raw vectors to scratch files, then convert them to
    # the requested output files. Returns 0.
    print('Load a HIN...')
    hin = loader.load_a_HIN(graph_fname)

    print('Generate random walks...')
    # Fixed file name in the working directory (not a tempfile).
    walk_path = "tmp_walk_fname.txt"
    print(walk_path)
    with open(walk_path, 'w') as walk_file:
        for walk in hin.random_walks(options.walk_num, options.walk_length):
            # One walk per line, entries separated by single spaces.
            walk_file.write(' '.join([str(step) for step in walk]) + '\n')

    # Scratch files for the raw vectors, also fixed names in the working
    # directory.
    raw_node_vec_path = "tmp_node_vec_fname.txt"
    raw_path_vec_path = "tmp_path_vec_fname.txt"
    print(raw_node_vec_path)
    print(raw_path_vec_path)

    trainer = MP2Vec(
        size=options.dim,
        window=options.window,
        neg=options.neg,
        num_processes=options.num_processes,
        alpha=options.alpha,
        same_w=True,
        normed=False,
        is_no_circle_path=False,
    )

    if options.correct_neg:
        # Fill the graph's k-hop neighbourhood table for every node, then hand
        # the entry for the window size to the trainer.
        for node_id in hin.graph:
            hin._get_k_hop_neighborhood(node_id, options.window)
        neighbors = hin.k_hop_neighbors[options.window]
    else:
        neighbors = None

    trainer.train(hin, walk_path, hin.class_nodes, k_hop_neighbors=neighbors)

    trainer.dump_to_file(raw_node_vec_path, type_='node')
    trainer.dump_to_file(raw_path_vec_path, type_='path')

    print('Dump vectors...')
    output_node2vec(hin, raw_node_vec_path, node_vec_fname)
    output_path2vec(hin, raw_path_vec_path, path_vec_fname)
    return 0
def _new_temp_path():
    '''Create an empty temporary file and return its path.

    tempfile.mkstemp() returns an already-open OS-level descriptor together
    with the path. The descriptor is closed here so it is not leaked and so
    that other code (including an external process) can open the file freely.
    '''
    fd, path = tempfile.mkstemp()
    os.close(fd)
    return path


def main(graph_fname, node_vec_fname, path_vec_fname, options):
    '''\
    %prog [options] <graph_fname> <node_vec_fname> <path_vec_fname>

    graph_fname: the graph file
        It can be a file contained edges per line (e.g., res/karate_club_edges.txt)
        or a pickled graph file.
    node_vec_fname: the output file for nodes' vectors
    path_vec_fname: the output file for meta-paths' vectors
    '''
    # The text above is kept verbatim; it starts with %prog and is presumably
    # reused as the command-line usage message.
    #
    # Flow:
    #   1. load the graph from graph_fname;
    #   2. write random walks to a temporary file (and a second, independently
    #      sampled set to "unDiAPwalk.txt" in the working directory);
    #   3. run the external trainer model_c/bin/hin2vec.exe on the walk file;
    #   4. convert the trainer's raw output into node_vec_fname and
    #      path_vec_fname.
    #
    # Parameters read from options: walk_num, walk_length, dim, alpha, window,
    # neg, num_processes, allow_circle, sigmoid_reg.
    #
    # Returns 0 on success. If the external trainer exits with a non-zero
    # status, that status is returned and no vectors are written.

    # Imported here so this function brings its own dependency into scope.
    import subprocess

    print('Load a HIN...')
    g = loader.load_a_HIN(graph_fname)

    tmp_fnames = []
    try:
        print('Generate random walks...')
        tmp_walk_fname = _new_temp_path()
        tmp_fnames.append(tmp_walk_fname)
        with open(tmp_walk_fname, 'w') as f:
            for walk in g.random_walks(options.walk_num, options.walk_length):
                f.write('%s\n' % ' '.join(map(str, walk)))

        # A second call to random_walks(); this file is not used by the
        # trainer below.
        with open("unDiAPwalk.txt", 'w') as f:
            for walk in g.random_walks(options.walk_num, options.walk_length):
                f.write('%s\n' % ' '.join(map(str, walk)))

        tmp_node_vec_fname = _new_temp_path()
        tmp_fnames.append(tmp_node_vec_fname)
        tmp_path_vec_fname = _new_temp_path()
        tmp_fnames.append(tmp_path_vec_fname)

        print('Learn representations...')
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through unmodified.
        command = [
            'model_c/bin/hin2vec.exe',
            '-size', '%d' % options.dim,
            '-train', tmp_walk_fname,
            '-alpha', '%f' % options.alpha,
            '-output', tmp_node_vec_fname,
            '-output_mp', tmp_path_vec_fname,
            '-window', '%d' % options.window,
            '-negative', '%d' % options.neg,
            '-threads', '%d' % options.num_processes,
            # The trainer takes the inverse flag of allow_circle.
            '-no_circle', '%d' % (1 - (options.allow_circle * 1)),
            '-sigmoid_reg', '%d' % (options.sigmoid_reg * 1),
        ]
        print(' '.join(command))
        status = subprocess.call(command)
        if status != 0:
            # Without trained vectors there is nothing meaningful to convert.
            print('hin2vec.exe failed with exit status %d' % status)
            return status

        print('Dump vectors...')
        output_node2vec(g, tmp_node_vec_fname, node_vec_fname)
        output_path2vec(g, tmp_path_vec_fname, path_vec_fname)
        return 0
    finally:
        # Best-effort removal of the scratch files; a failure to delete must
        # not hide the real outcome of the run.
        for fname in tmp_fnames:
            try:
                os.remove(fname)
            except OSError:
                pass
def main(graph_fname, vec_fname, output_fname):
    '''\
    %prog [options]
    '''
    # The text above is kept verbatim; it starts with %prog and is presumably
    # reused as the command-line usage message.
    #
    # Copies vec_fname to output_fname, replacing the leading integer id of
    # every line after the first with the matching node name from the graph.
    # Returns 0.
    hin = loader.load_a_HIN(graph_fname)

    # node2id maps name -> id; invert it for id -> name lookups.
    name_of = dict((node_id, node_name)
                   for node_name, node_id in hin.node2id.items())

    with open(vec_fname) as src:
        with open(output_fname, 'w') as dst:
            for row_no, row in enumerate(src):
                if row_no == 0:
                    # The first line is copied through unchanged.
                    dst.write(row)
                    continue

                # Split off the id only; the rest of the line (including its
                # line ending) is written back as-is.
                fields = row.split(' ', 1)
                dst.write('%s %s' % (name_of[int(fields[0])], fields[1]))
    return 0
def main(graph_fname, node_vec_fname, role_vec_fname, graphlet_vec_fname, options):
    '''\
    %prog [options] <graph_fname> <node_vec_fname> <path_vec_fname> <graphlet_vec_fname>

    graph_fname: the graph file
        It can be a file contained edges per line (e.g., res/karate_club_edges.txt)
        or a pickled graph file.
    node_vec_fname: the output file for nodes' vectors
    path_vec_fname: the output file for meta-paths' vectors
    '''
    # The text above is kept verbatim; it starts with %prog and is presumably
    # reused as the command-line usage message. It does not describe
    # role_vec_fname (passed to output_role2vec below) or graphlet_vec_fname
    # (passed to the external trainer as -output_graphlet).
    #
    # Flow: load the graph, build or load a node-frequency table and a
    # graphlet matcher, write the frequencies and a training set to files, run
    # the external ms2vec trainer, then convert its node and role output.
    # Nothing is returned.
    #
    # NOTE(review): block nesting below was reconstructed from a copy whose
    # indentation had been lost - confirm against the upstream file.
    print 'Load a HIN...'
    g = loader.load_a_HIN(graph_fname)

    # Diagnostic output: graph size, per-class node counts, edge-class ids.
    print len(g.graph)
    for c in g.class_nodes:
        print c, len(g.class_nodes[c])
    print g.edge_class2id
    g.create_node_choices()

    # Invert class -> node ids into node id -> class.
    id2classes = {}
    for class_, ids in g.class_nodes.items():
        for id_ in ids:
            id2classes[id_] = class_

    print 'Preprocess graphlet matcher...'
    # id2freq: start from zero for every node unless a cached pickle exists.
    to_freq = False
    if options.freq_fname is None or not os.path.exists(options.freq_fname):
        to_freq = True
        id2freq = dict(zip(g.graph.keys(), [0] * len(g.graph)))
    else:
        print 'load id2freq', options.freq_fname
        # NOTE(review): the file object is never closed explicitly, and
        # unpickling is only safe for files this tool wrote itself.
        id2freq = cPickle.load(open(options.freq_fname))

    # matcher: fresh GraphletMatcher unless a cached pickle exists.
    to_matcher = False
    if options.matcher_fname is None or not os.path.exists(
            options.matcher_fname):
        to_matcher = True
        matcher = graphlet.GraphletMatcher()
    else:
        print 'load matcher ', options.matcher_fname
        # NOTE(review): same unclosed-file / pickle caveat as above.
        matcher = cPickle.load(open(options.matcher_fname))

    if to_freq or to_matcher:
        # One pass over random_walks(1, 100) feeds whichever of the two
        # structures still has to be built.
        for walk in g.random_walks(1, 100):
            if to_matcher:
                for id2degrees in graphlet.complete_and_count_degrees(
                        g, options.window, walk):
                    matcher.get_graphlet(id2classes, id2degrees)
            if to_freq:
                # Count how often each node id occurs in the walks.
                for id_ in walk:
                    id2freq[id_] += 1

        # Persist freshly built structures when a cache path was given.
        if to_freq and options.freq_fname is not None:
            print 'dump id2freq', options.freq_fname
            # NOTE(review): file object is not closed explicitly.
            cPickle.dump(id2freq, open(options.freq_fname, 'w'))
        if to_matcher and options.matcher_fname is not None:
            print 'dump matcher', options.matcher_fname
            # NOTE(review): file object is not closed explicitly.
            cPickle.dump(matcher, open(options.matcher_fname, 'w'))

    print matcher.graphlets
    print 'graphlet:', len(matcher.graphlets)
    print 'roles:', matcher.rid_offset

    # Frequencies are written one per line in ascending node-id order; the
    # ids themselves are not written.
    # NOTE(review): fixed path under /tmp - concurrent runs would overwrite
    # each other's file.
    tmp_freq_fname = '/tmp/ms_freq.txt'
    with open(tmp_freq_fname, 'w') as f:
        for id_, freq in sorted(id2freq.items()):
            f.write('%d\n' % (freq))

    print 'Generate training set'
    # NOTE(review): mkstemp() also returns an open file descriptor, which is
    # discarded here (and again below) without being closed.
    _, tmp_node_vec_fname = tempfile.mkstemp()
    _, tmp_role_vec_fname = tempfile.mkstemp()

    # Use options.training_fname when given; generate the training set if no
    # name was given or the named file does not exist yet.
    tmp_data_fname = options.training_fname
    to_generate = False
    if options.training_fname is None:
        _, tmp_data_fname = tempfile.mkstemp()
        print tmp_data_fname
        to_generate = True
    elif not os.path.exists(options.training_fname):
        to_generate = True

    if to_generate:
        graphlet.generate_training_set_to_file(
            g, matcher, id2classes, options.walk_length, options.window,
            tmp_data_fname, num_processes=options.num_processes)

    print 'Learn representations...'
    # options.mode selects which trainer binary under model_c/bin/ is run.
    model = 'ms2vec'
    if options.mode:
        model = 'ms2vec_c2t'
    # node_count is max node id + 1; the "* 1" turns the boolean options into
    # integers for %d.
    statement = ("model_c/bin/%s -size %d -node_count %d "
                 "-role_count %d -graphlet_count %d -role_ratio %f "
                 "-train %s -freq %s -alpha %f "
                 "-output %s -output_role %s -output_graphlet %s "
                 "-window %d -negative %d "
                 "-threads %d -sigmoid_reg %d -iteration %d -equal %d"
                 "" % (model, options.dim, max(g.graph) + 1,
                       matcher.rid_offset, len(matcher.graphlets),
                       options.role_ratio, tmp_data_fname, tmp_freq_fname,
                       options.alpha, tmp_node_vec_fname, tmp_role_vec_fname,
                       graphlet_vec_fname, options.window, options.neg,
                       options.num_processes, options.sigmoid_reg * 1,
                       options.iter, options.equal * 1))
    print statement
    # NOTE(review): the command goes through the shell as one string (paths
    # with spaces would break it) and its exit status is ignored.
    os.system(statement)

    # Convert the trainer's raw node and role output into the requested
    # files; the graphlet vectors were written directly by the trainer.
    output_node2vec(g, tmp_node_vec_fname, node_vec_fname)
    output_role2vec(matcher, tmp_role_vec_fname, role_vec_fname)
def main(graph_fname, node_vec_fname, path_vec_fname, options):
    # Experimental TensorFlow training path. Steps performed here:
    #   1. load the graph and write random walks to a temp file (plus a debug
    #      copy in randWalks.txt);
    #   2. run MP2Vec.train() on the walk file;
    #   3. read (x, y, r) samples from pos_data.txt / neg_data.txt;
    #   4. train embeddings with gradient descent in a TensorFlow session;
    #   5. write the learned Wx matrix, keyed by node name, to node_vec.txt.
    #
    # NOTE(review): node_vec_fname and path_vec_fname are never referenced in
    # this body; all output goes to fixed file names in the working directory.
    # NOTE(review): block nesting below was reconstructed from a copy whose
    # indentation had been lost - confirm against the upstream file,
    # especially the sample-loading loop and the end of the training loop.
    #
    #DONE: Get init parameters as arguments to program
    #DONE: num nodes, get length of graph
    #DONE: num rel get from len node_vocab/path_vocab, didn't work, just use node# for now
    #TODO(NEWEST) get working with batches and fix shape of objective function to work with them, pick a good cross entropy function, sigmoid?
    #TODO(NEWEST) get p_ into same format as mnist labels [0,1],[1,0] change p_ shape to [None, 2]
    print('Load a HIN...')
    g = loader.load_a_HIN(graph_fname)
    NUM_NODES = g.node_count()
    # Relation count is set to the node count (see the DONE note above);
    # NUM_REL is only referenced in commented-out code below.
    NUM_REL = g.node_count()

    print('Generate random walks...')
    # NOTE(review): the descriptor returned by mkstemp() is discarded unclosed
    # here and for the two temp files created further down.
    _, tmp_walk_fname = tempfile.mkstemp()
    print('DEBUG: Saving random walks to file...')
    # NOTE(review): randomWalkFile is never closed in this function.
    randomWalkFile = open('randWalks.txt', 'w')
    print(tmp_walk_fname)
    with open(tmp_walk_fname, 'w') as f:
        for walk in g.random_walks(options.walk_num, options.walk_length):
            # The same walk goes to the temp file and to the debug copy.
            f.write('%s\n' % ' '.join(map(str, walk)))
            randomWalkFile.write('%s\n' % ' '.join(map(str, walk)))

    # These two paths are created but not used anywhere below.
    _, tmp_node_vec_fname = tempfile.mkstemp()
    _, tmp_path_vec_fname = tempfile.mkstemp()

    # node_vocab = mp.NodeVocab.load_from_file(tmp_walk_fname)
    # path_vocab = mp.PathVocab.load_from_file(tmp_walk_fname,options.window)
    model = MP2Vec(size=options.dim,
                   window=options.window,
                   neg=options.neg,
                   num_processes=options.num_processes,
                   # iterations=i,
                   alpha=options.alpha,
                   same_w=True,
                   normed=False,
                   is_no_circle_path=False,
                   )

    neighbors = None
    if options.correct_neg:
        # Fill the k-hop neighbourhood table for every node, then pass the
        # entry for the window size to the trainer.
        for id_ in g.graph:
            g._get_k_hop_neighborhood(id_, options.window)
        neighbors = g.k_hop_neighbors[options.window]

    model.train(g,
                tmp_walk_fname,
                g.class_nodes,
                k_hop_neighbors=neighbors,
                )

    #TODO: store neg/pos data without using file
    #DONE: format neg data for tensorflow
    # Load training samples from file into list
    # x_data / y_data / r_data hold the three integer fields of each sample;
    # p_data holds the label (1 for a positive sample, 0 for a negative one).
    x_data = []
    y_data = []
    r_data = []
    p_data = []

    #TODO: Fix the append causes errors randomly, there is problem with generation of test data to file
    # pos_data.txt and neg_data.txt are not written by this function;
    # presumably model.train() produces them - confirm.
    with open("pos_data.txt", "r") as pos:
        with open("neg_data.txt", "r") as neg:
            for l in pos:
                temp_pos = l.strip().split(",")
                # Read pos training data
                if (len(temp_pos) == 3):
                    if temp_pos[0] is not None and temp_pos[1] is not None and temp_pos[2] is not None:
                        x_data.append(int(temp_pos[0]))
                        y_data.append(int(temp_pos[1]))
                        r_data.append(int(temp_pos[2]))
                        p_data.append(int(1))
                # Read neg training data
                # One line of neg_data.txt is consumed per line of
                # pos_data.txt; each non-empty field becomes a negative sample
                # that reuses the positive sample's first and third fields.
                # NOTE(review): temp_pos[0] / temp_pos[2] are used here without
                # the length check applied above - confirm intended nesting.
                temp_neg = neg.readline().strip().split(",")
                for j in temp_neg:
                    if (j is not None and j != ""):
                        x_data.append(int(temp_pos[0]))
                        y_data.append(int(j))
                        r_data.append(int(temp_pos[2]))
                        p_data.append(int(0))

    # Debugging
    # Writes every loaded sample back out as "x,y,r,p".
    # NOTE(review): this file handle is never closed in this function.
    test = open("read_data.txt", "w+")
    for i in range(len(x_data)):
        test.write(str(x_data[i]) + "," + str(y_data[i]) + "," + str(r_data[i]) + "," + str(p_data[i]) + "\n")

    #for i in range(0, len(x_data)):
    #    print (x_data[i], y_data[i], r_data[i])

    # Convert the data to numpy array
    x_np = np.array(x_data)
    y_np = np.array(y_data)
    r_np = np.array(r_data)
    p_np = np.array(p_data)

    # Convert the x,y,r into 1-hot numpy array
    # NOTE(review): the three one-hot arrays are only referenced from
    # commented-out code below; each allocates len(data) x NUM_NODES floats.
    x_onehot = np.zeros((len(x_data), NUM_NODES))
    x_onehot[np.arange(len(x_data)), x_np] = 1
    y_onehot = np.zeros((len(y_data), NUM_NODES))
    y_onehot[np.arange(len(y_data)), y_np] = 1
    r_onehot = np.zeros((len(r_data), NUM_NODES))
    r_onehot[np.arange(len(r_data)), r_np] = 1

    #DONE: Prepare positive training data to get form (x,y,r,L(x,y,r))
    #DONE: Prepare negative samples (x'',y'',r'') for each positive entry
    #DONE: Generate batches from training set
    #DONE: Get g_ val from training data
    # Input
    #x = tf.placeholder(tf.float32, [None, NUM_NODES])
    #y = tf.placeholder(tf.float32, [None, NUM_NODES])
    #r = tf.placeholder(tf.float32, [None, NUM_REL])
    # p_ receives the labels; x_id / y_id / r_id receive integer ids that
    # index rows of the weight matrices below.
    p_ = tf.placeholder(tf.float32, [None])
    x_id = tf.placeholder(tf.int32, [None])
    y_id = tf.placeholder(tf.int32, [None])
    r_id = tf.placeholder(tf.int32, [None])

    #DONE: instantiate learned weights with rand uniform dist (mp2vec_s.py)
    #TODO: Case where Wx = Wy
    # Learned weights
    # All three matrices are NUM_NODES x options.dim. Wx and Wy start uniform
    # in [-0.5/dim, 0.5/dim); Wr starts uniform in [0, 1/dim).
    Wx = tf.Variable(np.random.uniform(low=-0.5/options.dim, high=0.5/options.dim, size=(NUM_NODES, options.dim)).astype(np.float32))
    Wy = tf.Variable(np.random.uniform(low=-0.5 / options.dim, high=0.5 / options.dim, size=(NUM_NODES, options.dim)).astype(np.float32))
    Wr = tf.Variable(np.random.uniform(low=0.0/options.dim, high=1.0/options.dim, size=(NUM_NODES, options.dim)).astype(np.float32))

    # Aggregate Vectors
    #Wx_x = tf.matmul(x, Wx)
    #Wy_y = tf.matmul(y, Wy)
    Wx_x_id = tf.nn.embedding_lookup(Wx, x_id)
    # NOTE(review): the y lookup reads from Wx, so Wy is never used in the
    # graph - confirm this is the intended "Wx = Wy" case from the TODO above.
    Wy_y_id = tf.nn.embedding_lookup(Wx, y_id)
    # NOTE(review): the sigmoid output is rounded before it enters the
    # product - confirm gradients still reach Wr as intended.
    Wr_r_id = tf.round(tf.nn.sigmoid(tf.nn.embedding_lookup(Wr, r_id)))

    #TODO: Make binary step default and option to use sigmoid
    # Regularization of Wr (Binary step by rounding sigmoid)
    #Wr_r = tf.round(tf.nn.sigmoid(tf.matmul(r, Wr)))
    # Regularization of Wr (just sigmoid)
    #Wr_r = tf.nn.sigmoid(tf.multiply(tWr, r))

    #DONE: Find a way to do element-wise mult with diff size tensors (Wr_r is diff size), make one-hot
    # Hidden Layer (element-wise mult)
    #h = tf.multiply(tf.multiply(Wx_x, Wy_y), Wr_r)
    h = tf.multiply(tf.multiply(Wx_x_id, Wy_y_id), Wr_r_id)

    # Output (sigmoid of element summation), reduce the sum along dim and keep [None] dim
    # The sum is used un-squashed as the logit; the sigmoid is applied inside
    # the loss and again in the accuracy computation.
    #p = tf.nn.sigmoid(tf.reduce_sum(h, 1))
    p = tf.reduce_sum(h, 1, keep_dims=False)

    #TODO: Implement obj version for binary step
    # Objective Function
    #objective = tf.cond(tf.equal(p_, 1), lambda: p, lambda: 1-p)
    objective = tf.nn.sigmoid_cross_entropy_with_logits(labels=p_, logits=p)
    #objective = tf.nn.softmax_cross_entropy_with_logits(labels=p_, logits=p)
    #objective = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=p_, logits=p)

    #DONE: Take negative of obj or not?
    # Train step (want to minimize negative of objective)
    # options.alpha is used as the learning rate.
    train_step = tf.train.GradientDescentOptimizer(options.alpha).minimize(objective)

    #DONE: calculate accuracy correctly
    # Accuracy
    # Fraction of samples whose rounded sigmoid(p) equals the label.
    correct_prediction = tf.equal(tf.round(tf.sigmoid(p)), p_)
    u = tf.cast(correct_prediction,tf.float32)
    accuracy = tf.reduce_mean(u)

    # Training
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    #TODO: integrate -i iterations option
    #TODO: train all data one at a time? or train random data points at a time?
    # step is the batch size; the outer loop runs exactly one epoch.
    step = 50
    for j in xrange(0,1):
        print ("epoch " + str(j) + " -----------------")
        for i in xrange(0, len(x_data), step):
            #x_val = x_onehot[i:i+step]
            #y_val = y_onehot[i:i+step]
            #r_val = r_onehot[i:i+step]
            # Consecutive slices of the loaded samples, in file order.
            p__val = p_np[i:i+step]
            x_id_val = x_data[i:i+step]
            y_id_val = y_data[i:i+step]
            r_id_val = r_data[i:i+step]
            #print(str(x_id_val) + str(y_id_val) + str(r_id_val) + str(p__val))
            #raw_input()
            '''
            Wx_row = sess.run(Wx_x_id, feed_dict={p_: p__val, x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})
            Wy_row = sess.run(Wy_y_id, feed_dict={p_: p__val, x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})
            Wr_row = sess.run(Wr_r_id, feed_dict={p_: p__val, x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})
            p_res = sess.run(p, feed_dict={p_: p__val, x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})
            print(Wx_row)
            print(Wy_row)
            print(Wr_row)
            print(p_res)
            raw_input()
            '''
            sess.run(train_step, feed_dict={p_: p__val, x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})
            # Accuracy is measured on the batch that was just trained on.
            train_accuracy = sess.run(accuracy, feed_dict={p_: p__val, x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})
            print('step %d, training accuracy %f' % (i, train_accuracy))

    # Fetch the learned Wx matrix and save it as space-separated text.
    # NOTE(review): this reuses the last batch's variables, which are
    # undefined when no samples were loaded.
    Wxx = sess.run(Wx, feed_dict={p_: p__val, x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})
    np.savetxt('node_vec_temp.txt', Wxx, delimiter=' ')
    '''
    Wx_row = sess.run(Wx_x_id, feed_dict={p_: p__val, x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})
    Wy_row = sess.run(Wy_y_id, feed_dict={p_: p__val, x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})
    Wr_row = sess.run(Wr_r_id, feed_dict={p_: p__val, x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})
    p_res = sess.run(p, feed_dict={p_: p__val, x_id: x_id_val, y_id: y_id_val, r_id: r_id_val})
    print(Wx_row)
    print(Wy_row)
    print(Wr_row)
    print(p_res)
    raw_input()
    '''

    #Output vectors to file
    # First line is "<NUM_NODES> <dim>"; then one line per node id in
    # ascending order, with the node's name prepended to its saved row.
    lines = []
    lines.append(str(NUM_NODES) + " " + str(options.dim) + "\n")
    tmpLines = []
    with open("node_vec_temp.txt", "r") as f:
        for l in f:
            tmpLines.append(l)
    # Invert name -> id into id -> name.
    id2name = dict([(id_, name) for name, id_ in g.node2id.items()])
    # Assumes node ids are exactly 0 .. NUM_NODES-1 - TODO confirm.
    for i in xrange(0, NUM_NODES):
        lines.append(str(id2name[i]) + " " + tmpLines[i])
    with open("node_vec.txt", "w+") as out:
        for l in lines:
            out.write(l)