Example #1
    def new_infer_vector(self, graph_wl_path, batch_size=1024, epochs=10):
        """
		Slightly new version faster version

		:param: graph_wl_path: a .g2v<depth> graph document
		:param: batch_size = 256: the batch size for the training
		:param: epochs=5: number of steps to train the network to get the inferred vector 5 is the default based on the paper
		"""

        # make a new corpus with the supplied file
        newCorpus = Corpus(corpus_dir=self.corpus.corpus_dir,
                           extension=self.corpus.extension)
        newCorpus.scan_and_load_corpus()
        newCorpus.add_file(graph_wl_path)
        new_graph_id = newCorpus._graph_name_to_id_map[graph_wl_path]

        infer_graph, batch_inputs, batch_labels, inf_normalized_embeddings, inf_loss, optimizer, inf_graph_embeddings = self.infer_initial(
            newCorpus)

        # use the new corpus to train the network for a bit and spit out the embeddings
        with tf.Session(
                graph=infer_graph,
                config=tf.ConfigProto(allow_soft_placement=False)) as inf_sess:
            init = tf.global_variables_initializer()
            inf_sess.run(init)

            loss = 0
            for _ in range(epochs):
                while not newCorpus.epoch_flag:
                    # get the (target, context) ID pairs for this batch
                    batch_data, batch_label_data = newCorpus.generate_batch_from_file(
                        batch_size)

                    feed_dict = {
                        batch_inputs: batch_data,
                        batch_labels: batch_label_data
                    }
                    _, loss_val = inf_sess.run([optimizer, inf_loss],
                                               feed_dict=feed_dict)
                    loss += loss_val

                newCorpus.epoch_flag = False
                loss = 0

            final_embeddings = inf_normalized_embeddings.eval()
        return final_embeddings[new_graph_id]
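Both examples end the same way: the full embedding matrix is normalised row by row and the row belonging to the newly added graph is returned. Below is a minimal NumPy sketch of just that last step, using toy dimensions rather than anything from the code above; the normalisation mirrors inf_normalized_embeddings, which divides each row by its root mean square.

import numpy as np

embeddings = np.random.rand(6, 8).astype(np.float32)  # toy data: 5 previously trained graphs + 1 new one
new_graph_id = 5                                       # the ID the new file receives in the corpus

# row-wise normalisation, matching inf_normalized_embeddings (mean of squares per row, then sqrt)
norm = np.sqrt(np.mean(np.square(embeddings), axis=1, keepdims=True))
final_embeddings = embeddings / norm

inferred = final_embeddings[new_graph_id]              # the vector handed back to the caller
print(inferred.shape)                                  # (8,)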
Example #2
    def infer_vector(self, graph_wl_path, batch_size=256, epochs=10):
        """
		Given a graph_document file, we infer its embedding based on the techniques in Le and Mikolov 2014, 

		We will assume that this truly is an unseen example and not one already considered in the corpus

		:param: graph_wl_path: a .g2v<depth> graph document
		:param: batch_size = 256: the batch size for the training
		:param: epochs=5: number of steps to train the network to get the inferred vector 5 is the default based on the paper
		"""

        # make a new corpus with the supplied file
        newCorpus = Corpus(corpus_dir=self.corpus.corpus_dir,
                           extension=self.corpus.extension)
        newCorpus.scan_and_load_corpus()
        newCorpus.add_file(graph_wl_path)
        new_graph_id = newCorpus._graph_name_to_id_map[graph_wl_path]

        # infer_graph = tf.Graph()
        # with infer_graph.as_default():
        batch_inputs = tf.placeholder(tf.int32, shape=[None])
        batch_labels = tf.placeholder(tf.int64, shape=[None, 1])

        old_graph_embeddings = tf.constant(
            self.graph_embeddings_for_normals
        )  # the trained embeddings from earlier
        new_embedding = tf.Variable(
            tf.random_uniform([1, self.embedding_size],
                              -0.5 / self.embedding_size,
                              0.5 / self.embedding_size))
        inf_graph_embeddings = tf.concat([old_graph_embeddings, new_embedding],
                                         0)

        batch_graph_embeddings = tf.nn.embedding_lookup(
            inf_graph_embeddings, batch_inputs)  # hidden layer

        weights = tf.Variable(
            tf.truncated_normal([newCorpus.num_subgraphs, self.embedding_size],
                                stddev=1.0 / math.sqrt(self.embedding_size)))
        biases = tf.Variable(tf.zeros(newCorpus.num_subgraphs))

        # negative sampling: NCE loss with negatives drawn from the corpus-wide subgraph
        # frequency distribution (unigram counts raised to the 0.75 power)
        inf_loss = tf.reduce_mean(
            tf.nn.nce_loss(
                weights=weights,
                biases=biases,
                labels=batch_labels,
                inputs=batch_graph_embeddings,
                num_sampled=self.num_negsample,
                num_classes=newCorpus.num_subgraphs,
                sampled_values=tf.nn.fixed_unigram_candidate_sampler(
                    true_classes=batch_labels,
                    num_true=1,
                    num_sampled=self.num_negsample,
                    unique=True,
                    range_max=newCorpus.num_subgraphs,
                    distortion=0.75,
                    unigrams=newCorpus.subgraph_id_freq_map_as_list)
                # unigrams = self.corpus.subgraph_id_freq_map_as_list) # original
            ))

        global_step = tf.Variable(
            0, trainable=False
        )  # the number of optimization steps performed so far; we never update this directly
        learning_rate = tf.train.exponential_decay(
            self.learning_rate, global_step, 100000, 0.96, staircase=True
        )  # exponential (staircase) decay of the learning rate
        learning_rate = tf.maximum(
            learning_rate, 0.001
        )  # clamp the learning rate at 0.001 so at least some learning always happens

        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(
            inf_loss, global_step=global_step)

        norm = tf.sqrt(
            tf.reduce_mean(tf.square(inf_graph_embeddings), 1, keep_dims=True))
        inf_normalized_embeddings = inf_graph_embeddings / norm

        # use the new corpus to train the network for a bit and spit out the embeddings
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=False)) as inf_sess:
            init = tf.global_variables_initializer()
            inf_sess.run(init)

            loss = 0
            for _ in range(epochs):
                while not newCorpus.epoch_flag:
                    # get the (target, context) ID pairs for this batch
                    batch_data, batch_label_data = newCorpus.generate_batch_from_file(
                        batch_size)

                    feed_dict = {
                        batch_inputs: batch_data,
                        batch_labels: batch_label_data
                    }
                    _, loss_val = inf_sess.run([optimizer, inf_loss],
                                               feed_dict=feed_dict)
                    loss += loss_val

                newCorpus.epoch_flag = False
                loss = 0

            final_embeddings = inf_normalized_embeddings.eval()
        return final_embeddings[new_graph_id]
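The essential trick in Example #2 is that the previously trained embeddings are wrapped in a tf.constant while only the single new row is a tf.Variable, so gradient descent during inference can only move the unseen graph's vector (the output-layer weights and biases are freshly created tf.Variables here, so they are retrained at the same time). Here is a small self-contained TensorFlow 1.x sketch of that idea, with toy dimensions and a plain squared-error loss standing in for the NCE loss, just to show that the frozen rows stay fixed:

import numpy as np
import tensorflow as tf  # TensorFlow 1.x, as in the examples above

old = tf.constant(np.random.rand(5, 8).astype(np.float32))    # 5 trained graph embeddings, frozen
new_row = tf.Variable(tf.random_uniform([1, 8], -0.1, 0.1))   # only this row is trainable
all_emb = tf.concat([old, new_row], 0)                         # row 5 belongs to the unseen graph

target = tf.constant(np.random.rand(1, 8).astype(np.float32))  # stand-in training signal
loss = tf.reduce_mean(tf.square(all_emb[5] - target))          # toy loss; the real code uses NCE
train_op = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    frozen_before = sess.run(old)
    for _ in range(100):
        sess.run(train_op)
    frozen_after, inferred = sess.run([old, all_emb[5]])
    assert np.allclose(frozen_before, frozen_after)  # old rows untouched; only the new row moved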