def train():
    """Train the summarization model with periodic checkpoints and validation.

    Loads the training and validation corpora from ``FLAGS.data_dir``, groups
    them with ``create_bucket`` and then runs training steps while the step
    counter is ``<= FLAGS.max_iter``.  Each step picks a bucket at random with
    probability proportional to the number of training samples in it.

    Every ``FLAGS.steps_per_checkpoint`` steps a checkpoint is saved under
    ``FLAGS.train_dir``; every ``FLAGS.steps_per_validation`` steps the
    averaged training perplexity is logged and one batch per non-empty
    validation bucket is evaluated.

    Raises:
        ValueError: If no training sample was placed in any bucket (the
            bucket sampling distribution would be undefined).
    """
    logging.info("Preparing summarization data.")
    docid, sumid, doc_dict, sum_dict = data_util.load_data(
        FLAGS.data_dir + "/train/train.article.txt",
        FLAGS.data_dir + "/train/train.title.txt",
        FLAGS.data_dir + "/doc_dict.txt",
        FLAGS.data_dir + "/sum_dict.txt",
        FLAGS.doc_vocab_size, FLAGS.sum_vocab_size)

    val_docid, val_sumid = data_util.load_valid_data(
        FLAGS.data_dir + "/train/valid.article.filter.txt",
        FLAGS.data_dir + "/train/valid.title.filter.txt",
        doc_dict, sum_dict)

    with tf.Session() as sess:
        # Create model.
        logging.info("Creating %d layers of %d units.",
                     FLAGS.num_layers, FLAGS.size)
        train_writer = tf.summary.FileWriter(FLAGS.tfboard, sess.graph)
        try:
            model = create_model(sess, False)

            # Read data into buckets and compute their sizes.
            logging.info("Create buckets.")
            dev_set = create_bucket(val_docid, val_sumid)
            train_set = create_bucket(docid, sumid)

            train_bucket_sizes = [
                len(train_set[b]) for b in range(len(_buckets))
            ]
            train_total_size = float(sum(train_bucket_sizes))
            if train_total_size == 0:
                raise ValueError(
                    "No training samples fell into any bucket; "
                    "cannot sample training batches.")

            # Cumulative share of samples up to and including each bucket;
            # the last entry is 1.0, so a draw from [0, 1) always selects
            # some bucket.
            train_buckets_scale = [
                sum(train_bucket_sizes[:i + 1]) / train_total_size
                for i in range(len(train_bucket_sizes))
            ]

            for (s_size, t_size), nsample in zip(_buckets,
                                                 train_bucket_sizes):
                logging.info(
                    "Train set bucket ({}, {}) has {} samples.".format(
                        s_size, t_size, nsample))

            # This is the training loop.
            step_time, loss = 0.0, 0.0
            current_step = sess.run(model.global_step)

            while current_step <= FLAGS.max_iter:
                # First bucket whose cumulative share exceeds the draw.
                random_number_01 = np.random.random_sample()
                bucket_id = next(
                    i for i, scale in enumerate(train_buckets_scale)
                    if scale > random_number_01)

                # Get a batch and make a step.
                start_time = time.time()
                encoder_inputs, decoder_inputs, encoder_len, decoder_len = \
                    model.get_batch(train_set, bucket_id)
                step_loss, _ = model.step(sess, encoder_inputs,
                                          decoder_inputs, encoder_len,
                                          decoder_len, False, train_writer)

                # Both accumulators are running means over one validation
                # interval; they are reset after each report below.
                step_time += (time.time() - start_time) / \
                    FLAGS.steps_per_validation
                loss += step_loss * FLAGS.batch_size / np.sum(decoder_len) \
                    / FLAGS.steps_per_validation
                current_step += 1

                # Once in a while, we save checkpoint.
                if current_step % FLAGS.steps_per_checkpoint == 0:
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   "model.ckpt")
                    model.saver.save(sess,
                                     checkpoint_path,
                                     global_step=model.global_step)

                # Once in a while, we print statistics and run evals.
                if current_step % FLAGS.steps_per_validation == 0:
                    # Print statistics for the interval that just ended.
                    perplexity = np.exp(float(loss))
                    logging.info("global step %d step-time %.2f ppl %.2f",
                                 model.global_step.eval(), step_time,
                                 perplexity)

                    step_time, loss = 0.0, 0.0
                    # Run evals on development set and print perplexity.
                    for dev_bucket_id in range(len(_buckets)):
                        if len(dev_set[dev_bucket_id]) == 0:
                            logging.info("  eval: empty bucket %d",
                                         dev_bucket_id)
                            continue
                        (encoder_inputs, decoder_inputs, encoder_len,
                         decoder_len) = model.get_batch(dev_set,
                                                        dev_bucket_id)
                        eval_loss, _ = model.step(sess, encoder_inputs,
                                                  decoder_inputs,
                                                  encoder_len, decoder_len,
                                                  True)
                        eval_loss = eval_loss * FLAGS.batch_size \
                            / np.sum(decoder_len)
                        eval_ppx = np.exp(float(eval_loss))
                        logging.info("  eval: bucket %d ppl %.2f",
                                     dev_bucket_id, eval_ppx)
                    sys.stdout.flush()
        finally:
            # Flush and release the TensorBoard event file even when training
            # is interrupted by an exception.
            train_writer.close()
Esempio n. 2
0
# Training summary (abstract) file and vocabulary file, both passed to
# load_data below.
sum_file = "data/modified_train_abstract.txt"
vocab_file = "data/vocab"
# Checkpoint path prefix: <checkpoint_dir>/baseline.
checkpoint_dir = "./save/quasi/checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "baseline")
# Validation article / abstract files.
dev_doc_file = "data/val_article.txt"
dev_sum_file = "data/val_abstract.txt"
# Load source and target training data together with the vocabulary.
# NOTE(review): doc_file, max_vocab_size, max_num_tokens and debug are not
# defined in this excerpt; they must be set earlier in the script.
docs, sums, vocab = load_data(doc_file,
                              sum_file,
                              vocab_file,
                              max_vocab_size,
                              debug=debug,
                              max_num_tokens=max_num_tokens)
# Load the validation pairs using the vocabulary returned by load_data.
dev_docs, dev_sums = load_valid_data(dev_doc_file,
                                     dev_sum_file,
                                     vocab,
                                     max_num_tokens,
                                     debug=debug)
# Vocabulary size as reported by the vocab object.
vocab_size = vocab.size()


# self, vocab_size, embedding_size, state_size, num_layers,
#                  decoder_vocab_size, attention_hidden_size, mode, beam_depth,
#                  learning_rate, max_iter=100, attention_mode="Bahdanau"):
def load_glove(glove_file, vocab, embedding_size):
    print("load pretrained glove from : {}".format(glove_file))
    f = open(glove_file, "r", encoding="utf-8")
    lines = f.readlines()
    embedding = np.random.uniform(-0.25, 0.25, (vocab_size, embedding_size))
    for line in lines:
        tokens = line.strip().split()
Esempio n. 3
0
def _collect_encoder_outputs(sess, model, dataset, num_samples, batch_size):
    """Run forward-only steps over bucket 0 and gather classifier data.

    Batches are requested at offsets ``0, batch_size, 2 * batch_size, ...``
    for as long as a full batch fits within ``num_samples``; a trailing
    partial batch is not requested.  Bucket 0 is used throughout, matching
    the ``_buckets[0][0]`` length the encoder outputs are padded to.

    Args:
        sess: Session used to run the model.
        model: Model whose forward-only ``step`` returns the encoder outputs
            as its third value.
        dataset: Bucketed data as produced by ``create_bucket``.
        num_samples: Number of samples to cover.
        batch_size: Number of samples requested per batch.

    Returns:
        Tuple ``(hidden, labels, lengths)``: the padded encoder outputs, the
        ``class_output`` values and the ``class_len`` values of all batches,
        each concatenated along axis 0.

    Raises:
        ValueError: If not even one full batch fits into ``num_samples``.
    """
    hidden, labels, lengths = [], [], []
    for offset in range(0, num_samples - batch_size + 1, batch_size):
        # NOTE(review): the trailing ``True, offset`` arguments presumably
        # select deterministic reading starting at ``offset`` -- confirm
        # against data_util.get_batch.
        (encoder_inputs, decoder_inputs, encoder_len, decoder_len,
         class_output, class_len) = data_util.get_batch(
             dataset, _buckets, 0, batch_size, True, offset)

        _, _, enc_outputs = model.step(sess, encoder_inputs, decoder_inputs,
                                       encoder_len, decoder_len, True)

        hidden.append(
            data_util.add_pad_for_hidden(enc_outputs, _buckets[0][0]))
        labels.append(class_output)
        lengths.append(class_len)

    if not hidden:
        raise ValueError(
            "Need at least one full batch of %d samples, got %d samples."
            % (batch_size, num_samples))

    return (np.concatenate(hidden, 0),
            np.concatenate(labels, 0),
            np.concatenate(lengths, 0))


def _train_seq2seq(sess, model, train_writer, train_set, dev_set,
                   train_buckets_scale):
    """Run the sequence-to-sequence training loop up to ``FLAGS.max_iter``.

    Each step samples a bucket according to ``train_buckets_scale`` (the
    cumulative share of training samples per bucket), trains on one batch,
    and periodically saves a checkpoint and logs training / validation
    perplexity.
    """
    step_time, loss = 0.0, 0.0
    current_step = sess.run(model.global_step)

    while current_step < FLAGS.max_iter:
        # First bucket whose cumulative share exceeds the draw; the last
        # share is 1.0, so a draw from [0, 1) always selects a bucket.
        random_number_01 = np.random.random_sample()
        bucket_id = next(i for i, scale in enumerate(train_buckets_scale)
                         if scale > random_number_01)

        # Get a batch and make a step.
        start_time = time.time()
        (encoder_inputs, decoder_inputs, encoder_len, decoder_len,
         _, _) = data_util.get_batch(train_set, _buckets, bucket_id,
                                     FLAGS.batch_size, False, 0)
        step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                  encoder_len, decoder_len, False,
                                  train_writer)

        # Running means over one validation interval; reset after reporting.
        step_time += (time.time() - start_time) / FLAGS.steps_per_validation
        loss += step_loss * FLAGS.batch_size / np.sum(decoder_len) \
            / FLAGS.steps_per_validation
        current_step += 1

        # Once in a while, we save checkpoint.
        if current_step % FLAGS.steps_per_checkpoint == 0:
            checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
            model.saver.save(sess,
                             checkpoint_path,
                             global_step=model.global_step)

        # Once in a while, we print statistics and run evals.
        if current_step % FLAGS.steps_per_validation == 0:
            perplexity = np.exp(float(loss))
            logging.info("global step %d step-time %.2f ppl %.2f",
                         sess.run(model.global_step), step_time, perplexity)

            step_time, loss = 0.0, 0.0
            # Run evals on development set and print their perplexity.
            for dev_bucket_id in range(len(_buckets)):
                if len(dev_set[dev_bucket_id]) == 0:
                    logging.info("  eval: empty bucket %d", dev_bucket_id)
                    continue
                (encoder_inputs, decoder_inputs, encoder_len, decoder_len,
                 _, _) = data_util.get_batch(dev_set, _buckets,
                                             dev_bucket_id, FLAGS.batch_size,
                                             False, 0)
                eval_loss, _, _ = model.step(sess, encoder_inputs,
                                             decoder_inputs, encoder_len,
                                             decoder_len, True)
                eval_loss = eval_loss * FLAGS.batch_size \
                    / np.sum(decoder_len)
                eval_ppx = np.exp(float(eval_loss))
                logging.info("  eval: bucket %d ppl %.2f",
                             dev_bucket_id, eval_ppx)
            sys.stdout.flush()


def _train_hidden_classifier(sess, class_model, inputs, labels, lengths,
                             batch_size=160, report_every=40):
    """Train the classifier on encoder outputs, cycling through the data.

    Consecutive slices of ``batch_size`` rows are fed to
    ``class_model.step`` while the step counter is
    ``<= FLAGS.class_max_iter``.  Train precision and accuracy (predictions
    thresholded at 0.5) are printed every ``report_every`` steps, and a
    checkpoint is saved under ``FLAGS.class_train_dir`` every
    ``FLAGS.steps_per_checkpoint`` steps.
    """
    step = sess.run(class_model.global_step)
    start = 0
    while step <= FLAGS.class_max_iter:
        stop = start + batch_size
        batch_labels = labels[start:stop]
        _, step_loss, output = class_model.step(sess,
                                                inputs[start:stop],
                                                batch_labels,
                                                lengths[start:stop],
                                                False)
        step += 1

        # The confusion matrix is only needed for the periodic report, so it
        # is not computed on the other steps.
        if step % report_every == 0:
            clipped = np.array(output > 0.5, dtype=int)
            tn, fp, fn, tp = confusion_matrix(batch_labels.flatten(),
                                              clipped.flatten()).ravel()
            # The +0.1 keeps the ratio defined when nothing is predicted
            # positive.
            print("Train Precision", tp / (tp + fp + 0.1))
            print("Train Accuracy", (tp + tn) / (tp + fp + tn + fn))

        # Wrap around once the next slice would start at or past the end.
        # ``>=`` (rather than ``==``) keeps the cursor in range when the data
        # length is not a multiple of ``batch_size``.
        start = 0 if stop >= len(lengths) else stop

        # Once in a while, we save checkpoint.
        if step % FLAGS.steps_per_checkpoint == 0:
            checkpoint_path = os.path.join(FLAGS.class_train_dir,
                                           "class_model.ckpt")
            class_model.saver.save(sess,
                                   checkpoint_path,
                                   global_step=class_model.global_step)


def train():
    """Train the seq2seq model, then a classifier on its encoder outputs.

    Pipeline:
      1. Load the training / validation data and train the seq2seq model
         until ``FLAGS.max_iter`` steps.
      2. Run the trained encoder over bucket 0 of the training data and train
         the classifier created by ``create_class_model`` on its outputs.
      3. Run the encoder and classifier over the test data, print precision
         and accuracy, and write the tokens predicted as positive to
         ``FLAGS.test_output``.

    Side effects: writes checkpoints and TensorBoard summaries, prints
    metrics to stdout, and overwrites ``FLAGS.batch_size`` for the test
    phase.

    Raises:
        ValueError: If no training sample was placed in any bucket, or if
            fewer samples than one full batch are available for encoder
            output extraction.
    """
    logging.info("Preparing summarization data.")
    docid, sumid, doc_dict, sum_dict, hidden_label = data_util.load_data(
        FLAGS.data_dir + "/train.48615.diff",
        FLAGS.data_dir + "/train.48615.msg",
        FLAGS.data_dir + "/doc_dict.txt",
        FLAGS.data_dir + "/sum_dict.txt",
        FLAGS.doc_vocab_size, FLAGS.sum_vocab_size)

    val_docid, val_sumid, val_hidd_label = data_util.load_valid_data(
        FLAGS.data_dir + "/valid.3000.diff",
        FLAGS.data_dir + "/valid.3000.msg",
        doc_dict, sum_dict)

    with tf.Session() as sess:
        # Create model.
        logging.info("Creating %d layers of %d units.",
                     FLAGS.num_layers, FLAGS.size)
        train_writer = tf.summary.FileWriter(FLAGS.tfboard, sess.graph)
        try:
            model = create_model(sess, False)

            # Read data into buckets and compute their sizes.
            logging.info("Create buckets.")
            dev_set = create_bucket(val_docid, val_sumid, val_hidd_label)
            train_set = create_bucket(docid, sumid, hidden_label)

            train_bucket_sizes = [
                len(train_set[b]) for b in range(len(_buckets))
            ]
            train_total_size = float(sum(train_bucket_sizes))
            if train_total_size == 0:
                raise ValueError(
                    "No training samples fell into any bucket; "
                    "cannot sample training batches.")
            train_buckets_scale = [
                sum(train_bucket_sizes[:i + 1]) / train_total_size
                for i in range(len(train_bucket_sizes))
            ]

            for (s_size, t_size, _), nsample in zip(_buckets,
                                                    train_bucket_sizes):
                logging.info(
                    "Train set bucket ({}, {}) has {} samples.".format(
                        s_size, t_size, nsample))

            _train_seq2seq(sess, model, train_writer, train_set, dev_set,
                           train_buckets_scale)
        finally:
            # The writer is only used by the seq2seq training steps; close it
            # so buffered summaries are flushed even on failure.
            train_writer.close()

        # Get encoder outputs for the training data.
        final_inputs, final_outputs, final_len = _collect_encoder_outputs(
            sess, model, train_set, train_bucket_sizes[0], FLAGS.batch_size)
        print(final_inputs.shape, final_outputs.shape, final_len.shape)

        # Hidden classifier.
        class_model = create_class_model(sess, False)
        _train_hidden_classifier(sess, class_model, final_inputs,
                                 final_outputs, final_len)

        print("test_file", FLAGS.test_file)
        docs, data = data_util.load_test_data(FLAGS.test_file, doc_dict)

        # Get encoder outputs for the test data.
        test_docid, test_sumid, _, _, test_hidden_label = \
            data_util.load_data(
                FLAGS.data_dir + "/test.1981.diff.txt",
                FLAGS.data_dir + "/test.1981.msg.txt",
                FLAGS.data_dir + "/doc_dict.txt",
                FLAGS.data_dir + "/sum_dict.txt",
                FLAGS.doc_vocab_size, FLAGS.sum_vocab_size)
        test_set = create_bucket(test_docid, test_sumid, test_hidden_label)

        # data.shape == (1, 158, 3) so I changed FLAGS.batch_size
        # NOTE(review): hard-coded to the size of one particular test set and
        # mutates the global flag -- confirm before reusing with other data.
        FLAGS.batch_size = 158
        final_inputs, final_outputs, final_len = _collect_encoder_outputs(
            sess, model, test_set, len(data), FLAGS.batch_size)
        print(final_inputs.shape, final_outputs.shape, final_len.shape)

        # Hidden classifier on the test data (forward only).
        step_loss, output = class_model.step(sess, final_inputs,
                                             final_outputs, final_len, True)

        clipped = np.array(output > 0.5, dtype=int)
        tn, fp, fn, tp = confusion_matrix(final_outputs.flatten(),
                                          clipped.flatten()).ravel()
        print("Test Precision : ", tp / (tp + fp + 0.1))
        print("Test Accuracy", (tp + tn) / (tp + fp + tn + fn))

        with open(FLAGS.test_output, "w") as f:
            # Pair documents with prediction rows, and tokens with per-token
            # predictions; zip stops at the shorter side, so padded
            # prediction rows or a differing document count cannot raise
            # IndexError.
            for doc, token_flags in zip(docs, clipped):
                for token, flag in zip(doc, token_flags):
                    if flag == 1:
                        print("Recommended identifier: " + token + " ",
                              file=f)
                print("\n", file=f)