def new_infer_vector(self, graph_wl_path, batch_size=1024, epochs=10):
    """
    Slightly newer, faster version of infer_vector.
    :param graph_wl_path: a .g2v<depth> graph document
    :param batch_size: the batch size for the training (default 1024)
    :param epochs: number of passes used to train the network to get the
        inferred vector; the paper suggests a small value such as 5
    """
    # make a new corpus that also contains the supplied file
    newCorpus = Corpus(corpus_dir=self.corpus.corpus_dir,
                       extension=self.corpus.extension)
    newCorpus.scan_and_load_corpus()
    newCorpus.add_file(graph_wl_path)
    new_graph_id = newCorpus._graph_name_to_id_map[graph_wl_path]

    (infer_graph, batch_inputs, batch_labels, inf_normalized_embeddings,
     inf_loss, optimizer, inf_graph_embeddings) = self.infer_initial(newCorpus)

    # use the new corpus to train the network for a bit and spit out the embeddings
    with tf.Session(graph=infer_graph,
                    config=tf.ConfigProto(allow_soft_placement=False)) as inf_sess:
        init = tf.global_variables_initializer()
        inf_sess.run(init)
        loss = 0
        for i in range(epochs):
            while not newCorpus.epoch_flag:
                # get the (target, context) wordID pairs
                batch_data, batch_labels_data = newCorpus.generate_batch_from_file(
                    batch_size)
                feed_dict = {
                    batch_inputs: batch_data,
                    batch_labels: batch_labels_data
                }
                _, loss_val = inf_sess.run([optimizer, inf_loss],
                                           feed_dict=feed_dict)
                loss += loss_val
            newCorpus.epoch_flag = False
            loss = 0
        final_embeddings = inf_normalized_embeddings.eval()
    return final_embeddings[new_graph_id]
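
# Illustrative sketch (not part of the original module): new_infer_vector above
# assumes that self.infer_initial(newCorpus) builds a fresh inference graph and
# returns the handles unpacked there. A standalone version of that contract,
# mirroring the ops used inline in infer_vector below, could look like the
# function here. The name and explicit arguments are assumptions made for
# illustration; it relies on the same module-level `tf` (TensorFlow 1.x) and
# `math` imports as the rest of the file.
def build_inference_graph(trained_embeddings, embedding_size, num_negsample,
                          learning_rate, num_subgraphs, subgraph_id_freqs):
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        batch_inputs = tf.placeholder(tf.int32, shape=[None])
        batch_labels = tf.placeholder(tf.int64, shape=[None, 1])

        # frozen embeddings for the known graphs plus one trainable row for the new graph
        old_graph_embeddings = tf.constant(trained_embeddings)
        new_embedding = tf.Variable(
            tf.random_uniform([1, embedding_size],
                              -0.5 / embedding_size, 0.5 / embedding_size))
        inf_graph_embeddings = tf.concat([old_graph_embeddings, new_embedding], 0)
        batch_graph_embeddings = tf.nn.embedding_lookup(inf_graph_embeddings,
                                                        batch_inputs)

        weights = tf.Variable(
            tf.truncated_normal([num_subgraphs, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        biases = tf.Variable(tf.zeros(num_subgraphs))

        # NCE loss with negative sampling from the subgraph frequency distribution
        inf_loss = tf.reduce_mean(
            tf.nn.nce_loss(
                weights=weights, biases=biases,
                labels=batch_labels, inputs=batch_graph_embeddings,
                num_sampled=num_negsample, num_classes=num_subgraphs,
                sampled_values=tf.nn.fixed_unigram_candidate_sampler(
                    true_classes=batch_labels, num_true=1,
                    num_sampled=num_negsample, unique=True,
                    range_max=num_subgraphs, distortion=0.75,
                    unigrams=subgraph_id_freqs)))

        global_step = tf.Variable(0, trainable=False)
        lr = tf.maximum(
            tf.train.exponential_decay(learning_rate, global_step,
                                       100000, 0.96, staircase=True),
            0.001)
        optimizer = tf.train.GradientDescentOptimizer(lr).minimize(
            inf_loss, global_step=global_step)

        norm = tf.sqrt(
            tf.reduce_mean(tf.square(inf_graph_embeddings), 1, keep_dims=True))
        inf_normalized_embeddings = inf_graph_embeddings / norm

    return (infer_graph, batch_inputs, batch_labels, inf_normalized_embeddings,
            inf_loss, optimizer, inf_graph_embeddings)
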
def infer_vector(self, graph_wl_path, batch_size=256, epochs=10):
    """
    Given a graph document file, infer its embedding based on the technique of
    Le and Mikolov (2014). We assume that this truly is an unseen example and
    not one already considered in the corpus.
    :param graph_wl_path: a .g2v<depth> graph document
    :param batch_size: the batch size for the training (default 256)
    :param epochs: number of passes used to train the network to get the
        inferred vector; the paper suggests a small value such as 5
    """
    # make a new corpus that also contains the supplied file
    newCorpus = Corpus(corpus_dir=self.corpus.corpus_dir,
                       extension=self.corpus.extension)
    newCorpus.scan_and_load_corpus()
    newCorpus.add_file(graph_wl_path)
    new_graph_id = newCorpus._graph_name_to_id_map[graph_wl_path]

    # infer_graph = tf.Graph()
    # with infer_graph.as_default():
    batch_inputs = tf.placeholder(tf.int32, shape=([None, ]))
    batch_labels = tf.placeholder(tf.int64, shape=([None, 1]))

    # the trained embeddings from earlier, kept fixed during inference
    old_graph_embeddings = tf.constant(self.graph_embeddings_for_normals)
    # a single new, randomly initialised row for the unseen graph
    new_embedding = tf.Variable(
        tf.random_uniform([1, self.embedding_size],
                          -0.5 / self.embedding_size,
                          0.5 / self.embedding_size))
    inf_graph_embeddings = tf.concat([old_graph_embeddings, new_embedding], 0)

    batch_graph_embeddings = tf.nn.embedding_lookup(inf_graph_embeddings,
                                                    batch_inputs)

    # hidden layer
    weights = tf.Variable(
        tf.truncated_normal([newCorpus.num_subgraphs, self.embedding_size],
                            stddev=1.0 / math.sqrt(self.embedding_size)))
    biases = tf.Variable(tf.zeros(newCorpus.num_subgraphs))

    # negative sampling
    inf_loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=weights,
            biases=biases,
            labels=batch_labels,
            inputs=batch_graph_embeddings,
            num_sampled=self.num_negsample,
            num_classes=newCorpus.num_subgraphs,
            sampled_values=tf.nn.fixed_unigram_candidate_sampler(
                true_classes=batch_labels,
                num_true=1,
                num_sampled=self.num_negsample,
                unique=True,
                range_max=newCorpus.num_subgraphs,
                distortion=0.75,
                unigrams=newCorpus.subgraph_id_freq_map_as_list)
            # unigrams=self.corpus.subgraph_id_freq_map_as_list)  # original
        ))

    # the number of steps we have performed; we make sure not to train this
    global_step = tf.Variable(0, trainable=False)
    # exponential decay of the learning rate
    learning_rate = tf.train.exponential_decay(self.learning_rate,
                                               global_step,
                                               100000,
                                               0.96,
                                               staircase=True)
    # keep the learning rate at or above 0.001 to ensure at least minimal learning
    learning_rate = tf.maximum(learning_rate, 0.001)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(
        inf_loss, global_step=global_step)

    norm = tf.sqrt(
        tf.reduce_mean(tf.square(inf_graph_embeddings), 1, keep_dims=True))
    inf_normalized_embeddings = inf_graph_embeddings / norm

    # use the new corpus to train the network for a bit and spit out the embeddings
    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=False)) as inf_sess:
        init = tf.global_variables_initializer()
        inf_sess.run(init)
        loss = 0
        for i in range(epochs):
            while not newCorpus.epoch_flag:
                # get the (target, context) wordID pairs
                batch_data, batch_labels_data = newCorpus.generate_batch_from_file(
                    batch_size)
                feed_dict = {
                    batch_inputs: batch_data,
                    batch_labels: batch_labels_data
                }
                _, loss_val = inf_sess.run([optimizer, inf_loss],
                                           feed_dict=feed_dict)
                loss += loss_val
            newCorpus.epoch_flag = False
            loss = 0
        final_embeddings = inf_normalized_embeddings.eval()
    return final_embeddings[new_graph_id]
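
# Illustrative usage (not part of the original module). The enclosing class is
# assumed to be the already-trained graph2vec-style skipgram model; `model`,
# the file path, and the .g2v depth below are placeholders for illustration:
#
#   model = ...  # a trained instance of the enclosing class
#   vec = model.infer_vector("path/to/new_graph.g2v3",
#                            batch_size=256, epochs=10)
#   # `vec` is the row of the normalized embedding matrix that belongs to the
#   # newly added graph, with shape (embedding_size,).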