Ejemplo n.º 1
0
    def scores(self, data_dir, fquery, freply, fqvocab, frvocab, init=False):
        """Score every (query, reply) pair read from the given files.

        Args:
            data_dir: directory containing all four input files.
            fquery: file name with one query per line.
            freply: file name with one reply per line.
            fqvocab: vocabulary file name for queries.
            frvocab: vocabulary file name for replies.
            init: if False (the default) the model is (re)initialized first.
                NOTE(review): the polarity looks inverted — confirm callers
                pass init=True to mean "already initialized".

        Returns:
            List with one score (first element of the model output) per pair.
        """
        if not init:
            self.init_model()

        queries = data_helpers.load_file(data_dir, fquery)
        replies = data_helpers.load_file(data_dir, freply)

        qvocab = data_helpers.load_vocab(data_dir, fqvocab)
        rvocab = data_helpers.load_vocab(data_dir, frvocab)

        scores = []
        with self.session.as_default():
            for query, reply in zip(queries, replies):
                # Map tokens to vocabulary ids, bounded by the model's
                # maximum query/reply lengths.
                ql, qids = data_helpers.transform_to_id(qvocab, query,
                        self.qmax_length)
                rl, rids = data_helpers.transform_to_id(rvocab, reply,
                        self.rmax_length)
                # Each pair is run as a batch of size 1.
                feed_dict = self.make_input_feed([qids], [ql], [rids], [rl],
                                                 training=False)
                score = self.session.run(self.pos_score, feed_dict)
                scores.append(score[0])
        return scores
 def get_scores(self, reference_file, response_file):
     """Return one score per reference/response line pair."""
     refs = data_helpers.load_file(reference_file)
     hyps = data_helpers.load_file(response_file)
     return [self._score(ref, hyp) for ref, hyp in zip(refs, hyps)]
Ejemplo n.º 3
0
 def scores(self, data_dir, fgroundtruth, fgenerated):
     """Score every ground-truth/generated line pair from data_dir."""
     truths = data_helpers.load_file(data_dir, fgroundtruth)
     hypotheses = data_helpers.load_file(data_dir, fgenerated)
     return [self.score(truth, hyp) for truth, hyp in zip(truths, hypotheses)]
    def get_scores(self, query_file, reply_file, query_vocab_file, reply_vocab_file, init=False):
        """Compute a float model score for each query/reply line pair.

        The model is (re)initialized first unless init is truthy; scoring runs
        one pair at a time as a batch of size 1 with training=False.
        """
        if not init:
            self.init_model()

        query_lines = data_helpers.load_file(query_file)
        reply_lines = data_helpers.load_file(reply_file)

        q_vocab = data_helpers.load_vocab(query_vocab_file)
        r_vocab = data_helpers.load_vocab(reply_vocab_file)

        results = []
        logger.info('looping over query-reply pairs')
        with self.session.as_default():
            for q_line, r_line in zip(query_lines, reply_lines):
                q_size, q_token_ids = data_helpers.transform_to_id(
                    q_vocab, q_line, self.query_max_length)
                r_size, r_token_ids = data_helpers.transform_to_id(
                    r_vocab, r_line, self.reply_max_length)
                feed = self.make_input_feed(
                    [q_token_ids], [q_size], [r_token_ids], [r_size],
                    training=False)
                # With training=False there is no neg_score (nor pos_score);
                # the graph exposes only self.score.
                raw = self.session.run(self.score, feed)
                results.append(float(raw[0]))
        return results
Ejemplo n.º 5
0
    def train(self,
              data_dir,
              fquery,
              freply,
              batch_size=128,
              steps_per_checkpoint=100):
        queries = data_helpers.load_data(data_dir, fquery, self.qmax_length)
        replies = data_helpers.load_data(data_dir, freply, self.rmax_length)

        validation_queries = data_helpers.load_data("data/validation_ADEM",
                                                    "queries.txt",
                                                    self.qmax_length)
        validation_replies = data_helpers.load_data("data/validation_ADEM",
                                                    "hred_replies.txt",
                                                    self.rmax_length)
        scores = data_helpers.load_file("data/validation_ADEM",
                                        "hred_scores.txt")
        scores = [float(score) for score in scores]
        #TODO - calculate MSE against these scores?

        data_size = len(queries)
        print_score = tf.print(self.score)
        with self.session.as_default():
            self.init_model()

            checkpoint_path = os.path.join(self.train_dir, "unref.model")
            loss = 0.0
            validation_loss = 0.0
            best_validation_loss = 1000
            prev_losses = [1.0]
            impatience = 0.0
            while True:
                step, l = self.train_step(queries, replies, data_size,
                                          batch_size)
                # KEVIN DOES THIS TRAIN THE MODEL ON THE VALIDATION SET :(
                _, validation_l = self.get_validation_loss(
                    validation_queries, validation_replies,
                    len(validation_queries), batch_size)

                loss += l
                validation_loss += validation_l
                print(validation_loss)
                # save checkpoint
                if step % steps_per_checkpoint == 0:
                    loss /= steps_per_checkpoint
                    validation_loss /= steps_per_checkpoint
                    print ("global_step %d, loss %f, learning rate %f"  \
                            %(step, loss, self.learning_rate.eval()))

                    if validation_loss < best_validation_loss:
                        best_validation_loss = validation_loss
                        impatience = 0.0
                        self.saver.save(self.session,
                                        checkpoint_path,
                                        global_step=self.global_step)
                    else:
                        impatience += 1

                    print("Validation loss is %f. The best loss thus far has been %f. Impatience: %f" \
                        %(validation_loss, best_validation_loss, impatience))

                    if loss > max(prev_losses):
                        self.session.run(self.learning_rate_decay_op)
                    prev_losses = (prev_losses + [loss])[-5:]
                    loss = 0.0

                    self.log_writer.add_summary(self.summary, step)

                    #                    """ Debug
                    query_batch, query_sizes, idx = self.get_batch(
                        queries, data_size, 10)
                    reply_batch, reply_sizes, idx = self.get_batch(
                        replies, data_size, 10, idx)
                    input_feed = self.make_input_feed(query_batch,
                                                      query_sizes,
                                                      reply_batch,
                                                      reply_sizes,
                                                      training=False)
                    score, tests = self.session.run(
                        [self.pos_score, self.test], input_feed)
                    print('-------------')
                    for s, t in zip(score[:10], tests[:10]):
                        print(s, t)