Example #1
def decode():
    '''Load dictionaries'''
    # Load vocabularies.
    print(os.getcwd())
    doc_dict = data_util.load_dict(FLAGS.data_dir + "/doc_dict.txt")
    sum_dict = data_util.load_dict(FLAGS.data_dir + "/sum_dict.txt")
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")
        print("Loading testing data")
    data = data_util.load_test_data(FLAGS.test_file, doc_dict)

    with tf.Session() as sess:
        # Create model and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, True)

        result = []
        for idx, token_ids in enumerate(data):

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, encoder_len, decoder_len =\
                model.get_batch(
                    {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS])]}, 0)

            # Decode greedily (argmax over model.step outputs) for a single
            # example with FLAGS.geneos set; otherwise use beam search.
            if FLAGS.batch_size == 1 and FLAGS.geneos:
                print('ran code')
                loss, outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)

                outputs = [np.argmax(item) for item in outputs[0]]

            else:
                outputs = model.step_beam(sess,
                                          encoder_inputs,
                                          encoder_len,
                                          geneos=FLAGS.geneos)

            # If there is an EOS symbol in outputs, cut them at that point.
            if data_util.ID_EOS in outputs:
                outputs = outputs[:outputs.index(data_util.ID_EOS)]
            gen_sum = " ".join(data_util.sen_map2tok(outputs, sum_dict[1]))
            print(gen_sum)
            gen_sum = data_util.sen_postprocess(gen_sum)
            print(gen_sum)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))
        with open(FLAGS.test_output, "w") as f:
            for item in result:
                print(item, file=f)
Example #2
def decode():
    # Load vocabularies.
    doc_dict = data_util.load_dict(FLAGS.data_dir + "/doc_dict.txt")
    sum_dict = data_util.load_dict(FLAGS.data_dir + "/sum_dict.txt")
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")
    data = data_util.load_test_data(FLAGS.data_dir + "/" + FLAGS.test_file,
                                    doc_dict)

    with tf.Session() as sess:
        # Create model and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        # create reverse table
        reverse_table = tf.contrib.lookup.index_to_string_table_from_file(
            vocabulary_file=FLAGS.data_dir + "/sum_ordered_words.txt",
            default_value="<UNK>")
        reverse_table.init.run()
        model = create_model(sess, reverse_table, is_training=False)
        result = []
        for idx, token_ids in enumerate(data):
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, encoder_len, decoder_len = model.get_batch(
                {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS])]}, 0)
            # Repeat the single example along the batch axis so the inputs
            # match the model's expected batch size.
            if encoder_inputs.shape[0] == 1:
                encoder_inputs = np.repeat(encoder_inputs,
                                           FLAGS.batch_size,
                                           axis=0)
                encoder_len = np.repeat(encoder_len, FLAGS.batch_size, axis=0)
            # outputs = [batch_size,length]
            step, outputs = model.inference(sess, encoder_inputs, encoder_len)
            # If there is an EOS symbol in outputs, cut them at that point.
            target_output = [item[0].decode() for item in outputs]
            if data_util.MARK_EOS in target_output:
                target_output = target_output[:target_output.index(
                    data_util.MARK_EOS)]
            gen_sum = " ".join(target_output)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))
        with open(FLAGS.test_output, "w") as f:
            for item in result:
                print(item, file=f)
Example #3
zoneout = 0.0
filter_width = 3
embedding_size = 300
num_layers = 1
summary_len = 100
attention_hidden_size = 100
beam_depth = 5
state_size = 120
mode = "test"
doc_file = "data/test_article.txt"
sum_file = "data/test_abstract.txt"
vocab_file = "data/vocab"
checkpoint_dir = "./save/baseline/checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "baseline")
vocab = data_util.Vocab(vocab_file, max_vocab_size)
docs = data_util.load_test_data(doc_file, vocab, max_num_tokens)
summary_file = "result/summaries.txt"
with tf.Graph().as_default():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.9
    sess = tf.Session(config=config)
    log_writer = tf.summary.FileWriter(checkpoint_dir, graph=sess.graph)
    model = DenseQuasiGRU(vocab_size=max_vocab_size,
                          embedding_size=embedding_size,
                          num_layers=num_layers,
                          state_size=state_size,
                          decoder_vocab_size=max_vocab_size,
                          filter_width=filter_width,
                          zoneout=zoneout,
                          attention_hidden_size=attention_hidden_size,
Example #4
def decode():
    # Load vocabularies.
    doc_dict = data_util.load_dict(FLAGS.data_dir + "/doc_dict.txt")
    sum_dict = data_util.load_dict(FLAGS.data_dir + "/sum_dict.txt")
    en_dict = data_util.load_dict(FLAGS.data_dir + "/en_dict.txt")
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")
    data, en_data = data_util.load_test_data(
        FLAGS.test_file, doc_dict, FLAGS.data_dir + "/test.entity.txt",
        en_dict)

    with tf.Session() as sess:
        # Create model and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, True, None, None, None)

        result = []
        for idx, token_ids in enumerate(data):
            en_ids = en_data[idx]
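            # Fall back to a single PAD id when this example has no entities.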
            if len(en_ids) == 0:
                en_ids = [data_util.ID_PAD]


            # token_ids, en_ids = d
            # print(idx)
            # print(token_ids)

            # Get a 1-element batch to feed the sentence to the model.
            shiva = model.get_batch(
                {
                    0:
                    [(token_ids, [data_util.ID_GO, data_util.ID_EOS],
                      [data_util.ID_PAD, data_util.ID_PAD, data_util.ID_PAD] +
                      en_ids +
                      [data_util.ID_PAD, data_util.ID_PAD, data_util.ID_PAD])]
                }, 0)
            #print(shiva)
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, entity_inputs, entity_len = shiva
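            # entity_inputs was padded with three ID_PAD tokens on each side of
            # en_ids above; subtracting 6 caps K at the number of real entity ids.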
            K = min(FLAGS.K, np.amax(entity_len) - 6)
            #print("K", K)

            if FLAGS.batch_size == 1 and FLAGS.geneos:
                loss, outputs, att, t = model.step(sess, encoder_inputs,
                                                   decoder_inputs,
                                                   entity_inputs, encoder_len,
                                                   decoder_len, entity_len, K,
                                                   True)

                #outputs = [np.argmax(item) for item in outputs[0]]
            else:
                outputs = model.step_beam(sess,
                                          encoder_inputs,
                                          encoder_len,
                                          entity_inputs,
                                          entity_len,
                                          K,
                                          geneos=FLAGS.geneos)

            # If there is an EOS symbol in outputs, cut them at that point.
            #print(outputs)
            with open(FLAGS.test_output + '.disambig', 'a') as f2:
                f2.write(' '.join(
                    str(y) + ":" + str(x.mean())
                    for x, y in zip(t[0], entity_inputs[0][3:])) + '\n')
            with open(FLAGS.test_output + '.attention', 'a') as f2:
                f2.write(' '.join(
                    str(y) + ":" + str(x)
                    for x, y in zip(att[0], entity_inputs[0][3:])) + '\n')
            outputs = list(outputs[0])
            if data_util.ID_EOS in outputs:
                outputs = outputs[:outputs.index(data_util.ID_EOS)]
            #outputs = list(outputs)
            gen_sum = " ".join(data_util.sen_map2tok(
                outputs, sum_dict[1]))  #sum_dict[1])) #lvt_str
            gen_sum = data_util.sen_postprocess(gen_sum)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))

        with open(FLAGS.test_output, "w") as f:
            for item in result:
                print(item, file=f)
Example #5
def main(_):
    #1.load test data
    vocab_cn, vocab_en = load_vocab_as_dict(FLAGS.vocabulary_cn_path,
                                            FLAGS.vocabulary_en_path)
    flag_data_en_test_processed_path = os.path.exists(
        FLAGS.data_en_test_processed_path)
    print("processed of english source file exists or not:",
          flag_data_en_test_processed_path)
    if not flag_data_en_test_processed_path:
        preprocess_english_file(FLAGS.data_en_test_path,
                                FLAGS.data_en_test_processed_path)
    test = load_test_data(FLAGS.data_en_test_processed_path, vocab_en,
                          FLAGS.decoder_sent_length)
    print("test[0:10]:", test[0:10])
    test = pad_sequences(test, maxlen=FLAGS.sequence_length,
                         value=0.)  # padding to max length
    sequence_length_batch = [FLAGS.sequence_length] * FLAGS.batch_size

    #2.create session,model,feed data to make a prediction
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = seq2seq_attention_model(
            len(vocab_cn),
            FLAGS.learning_rate,
            FLAGS.batch_size,
            FLAGS.decay_steps,
            FLAGS.decay_rate,
            FLAGS.sequence_length,
            len(vocab_en),
            FLAGS.embed_size,
            FLAGS.hidden_size,
            sequence_length_batch,
            FLAGS.is_training,
            decoder_sent_length=FLAGS.decoder_sent_length,
            l2_lambda=FLAGS.l2_lambda,
            use_beam_search=FLAGS.use_beam_search)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        #feed data, to get logits
        number_of_test_data = len(test)
        print("number_of_test_data:", number_of_test_data)
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a',
                                            'utf8')
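        # At inference time the decoder input is a _GO token followed by _PAD
        # tokens, replicated once per example in the batch.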
        decoder_input = np.array([[vocab_cn[_GO]] + [vocab_cn[_PAD]] *
                                  (FLAGS.decoder_sent_length - 1)] *
                                 FLAGS.batch_size)
        print("decoder_input:", decoder_input.shape)
        decoder_input = np.reshape(decoder_input,
                                   [-1, FLAGS.decoder_sent_length])
        print("decoder_input:", decoder_input.shape)
        vocab_cn_index2word = dict([val, key] for key, val in vocab_cn.items())

        for start, end in zip(
                range(0, number_of_test_data, FLAGS.batch_size),
                range(FLAGS.batch_size, number_of_test_data + 1,
                      FLAGS.batch_size)):
            predictions = sess.run(
                model.predictions,  # [batch_size, decoder_sent_length]
                feed_dict={
                    model.input_x: test[start:end],
                    model.decoder_input: decoder_input,
                    model.dropout_keep_prob: 1
                })  # 'shape of logits:', ( 1, 1999)
            # 6. get labels using logits
            output_sentence_list = get_label_using_logits(
                predictions, vocab_cn_index2word, vocab_cn)
            # 7. write the predicted sentences to the target file.
            for sentence in output_sentence_list:
                predict_target_file_f.write(sentence + "\n")
        predict_target_file_f.close()
Example #6
def decode():
    # Load vocabularies.
    doc_dict = data_util.load_dict(FLAGS.data_dir + "/doc_dict.txt")
    sum_dict = data_util.load_dict(FLAGS.data_dir + "/sum_dict.txt")
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")
    docs, data = data_util.load_test_data(FLAGS.test_file, doc_dict)

    with tf.Session() as sess:
        # Create model and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, True)
        class_model = create_class_model(sess, True)

        result = []
        for idx, token_ids in enumerate(data):

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, class_output, class_len =\
                data_util.get_batch(
                    {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS],[0,0])]}, _buckets, 0, FLAGS.batch_size, False, 0)

            if FLAGS.batch_size == 1 and FLAGS.geneos:
                loss, outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)

                outputs = [np.argmax(item) for item in outputs[0]]
            else:
                outputs = model.step_beam(sess,
                                          encoder_inputs,
                                          encoder_len,
                                          geneos=FLAGS.geneos)

            # If there is an EOS symbol in outputs, cut them at that point.
            if data_util.ID_EOS in outputs:
                outputs = outputs[:outputs.index(data_util.ID_EOS)]
            gen_sum = " ".join(data_util.sen_map2tok(outputs, sum_dict[1]))
            gen_sum = data_util.sen_postprocess(gen_sum)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))

        #Get Encoder outputs
        batchidx = 0
        final_inputs = []
        final_outputs = []
        final_len = []
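        # Run the encoder batch by batch and collect its outputs (padded to the
        # bucket length) as inputs for the hidden classifier.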
        while batchidx + FLAGS.batch_size <= len(data):
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, class_output, class_len =\
            data_util.get_batch(
                {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS],[0,0])]}, _buckets, 0, FLAGS.batch_size, False, 0)

            _, _, enc_outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)

            enc_outputs = data_util.add_pad_for_hidden(enc_outputs,
                                                       _buckets[0][0])

            final_inputs.append(enc_outputs)
            final_outputs.append(class_output)
            final_len.append(class_len)

            batchidx += FLAGS.batch_size

        final_inputs = np.asarray(final_inputs)
        final_inputs = np.concatenate(final_inputs, 0)
        final_outputs = np.asarray(final_outputs)
        final_outputs = np.concatenate(final_outputs, 0)
        final_len = np.asarray(final_len)
        final_len = np.concatenate(final_len, 0)
        print(final_inputs.shape, final_outputs.shape, final_len.shape)

        #Hidden classifier
        step_loss, output = class_model.step(sess, final_inputs[:],
                                             final_outputs[:], final_len[:],
                                             True)

        # Threshold classifier outputs at 0.5 into a binary mask; positions
        # marked 1 are reported below as recommended identifiers.
        clipped = np.array(output > 0.5, dtype=int)
        #label = data_util.hidden_label_gen(FLAGS.test_file, "data/test.1981.msg.txt")
        #make confusion matrix to get precision
        #tn, fp, fn, tp = confusion_matrix(label.flatten(), clipped.flatten()).ravel()
        #print("Test precision : ", tp/(tp+fp))

        with open(FLAGS.test_output, "w") as f:
            for idx, item in enumerate(result):
                print(item, file=f)
                for j in range(len(docs[idx])):
                    if clipped[idx][j] == 1:
                        print("Recommended identifier: " + docs[idx][j] + " ",
                              file=f)
                print("\n", file=f)
Example #7
def train():
    logging.info("Preparing summarization data.")
    docid, sumid, doc_dict, sum_dict, hidden_label= \
        data_util.load_data(
            FLAGS.data_dir + "/train.48615.diff",
            FLAGS.data_dir + "/train.48615.msg",
            FLAGS.data_dir + "/doc_dict.txt",
            FLAGS.data_dir + "/sum_dict.txt",
            FLAGS.doc_vocab_size, FLAGS.sum_vocab_size)

    val_docid, val_sumid, val_hidd_label = \
        data_util.load_valid_data(
            FLAGS.data_dir + "/valid.3000.diff",
            FLAGS.data_dir + "/valid.3000.msg",
            doc_dict, sum_dict)

    with tf.Session() as sess:
        # Create model.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        train_writer = tf.summary.FileWriter(FLAGS.tfboard, sess.graph)
        model = create_model(sess, False)

        # Read data into buckets and compute their sizes.
        logging.info("Create buckets.")
        dev_set = create_bucket(val_docid, val_sumid, val_hidd_label)
        train_set = create_bucket(docid, sumid, hidden_label)

        train_bucket_sizes = [len(train_set[b]) for b in range(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))
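        # Cumulative fraction of training samples per bucket; the training loop
        # below samples a bucket id in proportion to its size.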
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in range(len(train_bucket_sizes))
        ]

        for (s_size, t_size, _), nsample in zip(_buckets, train_bucket_sizes):
            logging.info("Train set bucket ({}, {}) has {} samples.".format(
                s_size, t_size, nsample))

        # This is the training loop.
        step_time, loss = 0.0, 0.0

        current_step = sess.run(model.global_step)

        while current_step < FLAGS.max_iter:
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, class_output, class_len = \
                data_util.get_batch(train_set, _buckets, bucket_id, FLAGS.batch_size, False,0)
            step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                      encoder_len, decoder_len, False,
                                      train_writer)

            step_time += (time.time() - start_time) / \
                FLAGS.steps_per_validation
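            # Accumulate an estimate of the per-token loss, averaged over the
            # validation interval; np.exp(loss) is reported as perplexity below.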
            loss += step_loss * FLAGS.batch_size / np.sum(decoder_len) \
                / FLAGS.steps_per_validation
            current_step += 1

            # Once in a while, we save checkpoint.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)

            # Once in a while, we print statistics and run evals.
            if current_step % FLAGS.steps_per_validation == 0:
                # Print statistics for the previous epoch.
                perplexity = np.exp(float(loss))
                logging.info("global step %d step-time %.2f ppl %.2f" %
                             (model.global_step.eval(), step_time, perplexity))

                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                for bucket_id in range(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        logging.info("  eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs,  encoder_len, decoder_len, class_output, class_len  =\
                        data_util.get_batch(dev_set, _buckets, bucket_id, FLAGS.batch_size, False, 0)
                    #cl_eval_loss, _ = class_model.step(sess, class_input, class_output, class_len, True)
                    eval_loss, _, _ = model.step(sess, encoder_inputs,
                                                 decoder_inputs, encoder_len,
                                                 decoder_len, True)
                    eval_loss = eval_loss * FLAGS.batch_size \
                        / np.sum(decoder_len)
                    eval_ppx = np.exp(float(eval_loss))
                    logging.info("  eval: bucket %d ppl %.2f" %
                                 (bucket_id, eval_ppx))
                sys.stdout.flush()

        #Get Encoder outputs
        batchidx = 0
        final_inputs = []
        final_outputs = []
        final_len = []
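        # Collect padded encoder outputs over the first training bucket; these
        # become the inputs for training the hidden classifier below.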
        while batchidx + FLAGS.batch_size <= train_bucket_sizes[0]:
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, class_output, class_len = \
                data_util.get_batch(train_set, _buckets, bucket_id, FLAGS.batch_size, True, batchidx)

            _, _, enc_outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)

            enc_outputs = data_util.add_pad_for_hidden(enc_outputs,
                                                       _buckets[0][0])

            final_inputs.append(enc_outputs)
            final_outputs.append(class_output)
            final_len.append(class_len)

            batchidx += FLAGS.batch_size

        final_inputs = np.asarray(final_inputs)
        final_inputs = np.concatenate(final_inputs, 0)
        final_outputs = np.asarray(final_outputs)
        final_outputs = np.concatenate(final_outputs, 0)
        final_len = np.asarray(final_len)
        final_len = np.concatenate(final_len, 0)
        print(final_inputs.shape, final_outputs.shape, final_len.shape)

        #Hidden classifier
        class_model = create_class_model(sess, False)
        classification_curr_step = sess.run(class_model.global_step)
        i = 0
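        # Train the hidden classifier on slices of 160 samples at a time,
        # wrapping back to the start once the end of the data is reached.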
        while classification_curr_step <= FLAGS.class_max_iter:

            _, step_loss, output = class_model.step(sess,
                                                    final_inputs[i:(i + 160)],
                                                    final_outputs[i:(i + 160)],
                                                    final_len[i:(i + 160)],
                                                    False)
            classification_curr_step += 1

            clipped = np.array(output > 0.5, dtype=int)
            #print("i", i)
            #print("clfcurrstep",classification_curr_step)
            #print("clipped", clipped.flatten())
            #print("final_outputs", final_outputs[i:(i+160)].flatten())
            tn, fp, fn, tp = confusion_matrix(
                final_outputs[i:(i + 160)].flatten(),
                clipped.flatten()).ravel()
            if (classification_curr_step % 40 == 0):
                print("Train Precision", tp / (tp + fp + 0.1))
                print("Train Accuracy", (tp + tn) / (tp + fp + tn + fn))
            if (i + 160 == len(final_len)):
                i = 0
            else:
                i += 160

            # Once in a while, we save checkpoint.
            if classification_curr_step % FLAGS.steps_per_checkpoint == 0:
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.class_train_dir,
                                               "class_model.ckpt")
                class_model.saver.save(sess,
                                       checkpoint_path,
                                       global_step=class_model.global_step)

        print("test_file", FLAGS.test_file)
        docs, data = data_util.load_test_data(FLAGS.test_file, doc_dict)

        #test

        # Create model and load parameters.
        '''
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))

        result = []
        for idx, token_ids in enumerate(data):

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, class_output, class_len =\
                data_util.get_batch(
                    {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS],[0,0])]}, _buckets, 0, FLAGS.batch_size, False, 0)

            if FLAGS.batch_size == 1 and FLAGS.geneos:
                loss, outputs = model.step(sess,
                    encoder_inputs, decoder_inputs,
                    encoder_len, decoder_len, False)

                outputs = [np.argmax(item) for item in outputs[0]]
            else:
                outputs = model.step_beam(
                    sess, encoder_inputs, encoder_len, geneos=FLAGS.geneos)

            # If there is an EOS symbol in outputs, cut them at that point.
            if data_util.ID_EOS in outputs:
                outputs = outputs[:outputs.index(data_util.ID_EOS)]
            gen_sum = " ".join(data_util.sen_map2tok(outputs, sum_dict[1]))
            gen_sum = data_util.sen_postprocess(gen_sum)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))
        '''

        #Get Encoder outputs
        docid, sumid, doc_dict, sum_dict, hidden_label= \
        data_util.load_data(
            FLAGS.data_dir + "/test.1981.diff.txt",
            FLAGS.data_dir + "/test.1981.msg.txt",
            FLAGS.data_dir + "/doc_dict.txt",
            FLAGS.data_dir + "/sum_dict.txt",
            FLAGS.doc_vocab_size, FLAGS.sum_vocab_size)
        test_set = create_bucket(docid, sumid, hidden_label)
        test_bucket_sizes = [len(test_set[b]) for b in range(len(_buckets))]
        test_total_size = float(sum(test_bucket_sizes))
        test_buckets_scale = [
            sum(test_bucket_sizes[:i + 1]) / test_total_size
            for i in range(len(test_bucket_sizes))
        ]
        batchidx = 0
        final_inputs = []
        final_outputs = []
        final_len = []
        # data.shape == (1, 158, 3), so set FLAGS.batch_size to cover all 158 test samples in one batch
        FLAGS.batch_size = 158
        while batchidx + FLAGS.batch_size <= len(data):
            #bucket_id = (i for i in range(len(test_buckets_scale))
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, class_output, class_len = \
                data_util.get_batch(test_set, _buckets, bucket_id, FLAGS.batch_size, True, batchidx)

            _, _, enc_outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)

            enc_outputs = data_util.add_pad_for_hidden(enc_outputs,
                                                       _buckets[0][0])

            final_inputs.append(enc_outputs)
            final_outputs.append(class_output)
            final_len.append(class_len)

            batchidx += 1

        final_inputs = np.asarray(final_inputs)
        final_inputs = np.concatenate(final_inputs, 0)
        final_outputs = np.asarray(final_outputs)
        final_outputs = np.concatenate(final_outputs, 0)
        final_len = np.asarray(final_len)
        final_len = np.concatenate(final_len, 0)
        print(final_inputs.shape, final_outputs.shape, final_len.shape)

        #Hidden classifier
        step_loss, output = class_model.step(sess, final_inputs[:],
                                             final_outputs[:], final_len[:],
                                             True)

        clipped = np.array(output > 0.5, dtype=int)
        tn, fp, fn, tp = confusion_matrix(final_outputs[:].flatten(),
                                          clipped.flatten()).ravel()
        #with open('data/test.1981.msg.txt')as reader:
        #    testmsg=[]
        #    for i in range(1981):
        #        testmsg.append(reader.readline())
        #sums  = list(map(lambda x: x.split(), testmsg))
        #labels = data_util.hidden_label_gen(FLAGS.test_file, sums)
        #tn, fp, fn, tp = confusion_matrix(labels.flatten(), clipped.flatten())
        print("Test Precision : ", tp / (tp + fp + 0.1))
        print("Test Accuracy", (tp + tn) / (tp + fp + tn + fn))
        with open(FLAGS.test_output, "w") as f:
            for idx in range(1981):
                for j in range(len(docs[idx])):
                    if clipped[idx][j] == 1:
                        print("Recommended identifier: " + docs[idx][j] + " ",
                              file=f)
                print("\n", file=f)
Example #8
def ATL_DGP():

    num_inducing_points = 10  # Number of inducing points
    num_hidden_units_source = 2  # Number of hidden units per class for the source task
    num_hidden_units_target = 2  # Number of hidden units per class for the target task
    max_iteration_count = 1  # Maximum number of iterations allowed
    learning_rate_start = 0.001  # Starting learning rate
    inducing_kernel = RBFKernel(np.sqrt(num_hidden_units_source))

    Xtrain_source, ytrain_source, Xtrain_target, ytrain_target = load_train_data(
    )

    Xtrain_source = Xtrain_source[:1000, :]
    ytrain_source = ytrain_source[:1000]
    Xtrain_target = Xtrain_target[:1000, :]
    ytrain_target = ytrain_target[:1000]

    print('Xtrain_source.shape:', Xtrain_source.shape)
    print('ytrain_source.shape:', ytrain_source.shape)

    Nsrc = Xtrain_source.shape[0]

    print('Xtrain_target.shape:', Xtrain_target.shape)
    print('ytrain_target.shape:', ytrain_target.shape)

    # Concatenate source and target data sets
    Data = np.concatenate((Xtrain_source, Xtrain_target))
    labels = np.concatenate((ytrain_source, ytrain_target))

    print('Data.shape:', Data.shape)
    print('labels.shape:', labels.shape)

    # Construct the source-target info map.
    # 0: data point is on the source task
    # 1: data point is on the target task
    source_target_info = np.ones([Data.shape[0], 1])
    source_target_info[0:Nsrc] = 0

    # Construct kernel lists
    kernels_source = list()
    for rr in range(num_hidden_units_source):
        length_scale = Data.shape[1]
        kernel = RBFKernel(length_scale)
        kernels_source.append(kernel)

    kernels_target = list()
    for rr in range(num_hidden_units_target):
        length_scale = Data.shape[1]
        kernel = RBFKernel(length_scale)
        kernels_target.append(kernel)


    # Uncomment the lines below to try out the symmetric classifier:
    # common_dimensions = 2
    # model = DSGPSymmetricTransferClassifier(inducing_kernel, kernels_source, kernels_target, 2,
    #                                         num_inducing=num_inducing_points,
    #                                         max_iter=max_iteration_count,
    #                                         learning_rate_start=learning_rate_start)

    # Create the class object for the asymmetric classifier
    model = DSGPAsymmetricTransferClassifier(inducing_kernel, kernels_source, kernels_target, num_inducing_points, \
            max_iteration_count, learning_rate_start=learning_rate_start)

    print('model training ...')

    # Train the model
    model.train(Data, labels, source_target_info)

    print('model train done.')

    Xtest_target, ytest_target = load_test_data()

    print('Xtest_target.shape:', Xtest_target.shape)
    print('ytest_target.shape:', ytest_target.shape)

    print('predicting ...')

    # Predict on test data and report accuracy
    predictions = model.predict(Xtest_target)
    print "Accuracy: %.2f %% " % (
        np.mean(predictions.predictions == ytest_target) * 100)

    calculate_metrics(ytest_target[0], predictions.predictions)