Esempio n. 1
0
def test_read_train_dev_test():
    """Print the bucket layout produced by the PTB data readers.

    Reads the ``train``/``valid``/``test`` files under ``<root_dir>/data/ptb``
    through ``data_util.read_train_dev`` and ``data_util.read_test`` and prints
    the returned bucket definitions together with the length of every element
    of each returned bucket collection. Nothing is asserted; the output is
    meant for manual inspection.
    """
    data_dir = os.path.join(root_dir, "data/ptb")
    train_path = os.path.join(data_dir, "train")
    dev_path = os.path.join(data_dir, "valid")
    test_path = os.path.join(data_dir, "test")
    cache_dir = os.path.join(root_dir, "data/ptb/cache")
    vocab_size = 20000

    # makedirs (rather than mkdir) also creates missing parent directories,
    # so a fresh checkout without data/ptb does not fail right here.
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # NOTE(review): the trailing 100 and 10 are presumably the maximum
    # sentence length and the number of buckets -- confirm against data_util.
    train_data_bucket, dev_data_bucket, _buckets, vocab_path = data_util.read_train_dev(
        cache_dir, train_path, dev_path, vocab_size, 100, 10)
    test_data_bucket, _buckets_test = data_util.read_test(
        cache_dir, test_path, vocab_path, vocab_size, 100, 10)

    def print_bucket_data(data):
        """Print the length of each element of ``data`` as a list."""
        sizes = [len(x) for x in data]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(sizes)

    print("_buckets: {}\n".format(_buckets))
    print_bucket_data(train_data_bucket)
    print_bucket_data(dev_data_bucket)
    print("_buckets_test: {}\n".format(_buckets_test))
    print_bucket_data(test_data_bucket)
Esempio n. 2
0
def dump_lstm():
    """Run the model over the test set and dump its states to a file.

    Loads the test data with ``read_test``, builds the model inside a fresh
    TensorFlow session, and for every item yielded by
    ``DataIterator.next_original`` runs a forward-only ``model.step`` with
    ``dump_lstm=True``. The returned states are wrapped in a ``StateWrapper``
    together with the inputs, outputs and weights, and appended to
    ``FLAGS.dump_file`` (opened in binary mode).

    Side effects: sets ``FLAGS._buckets`` and ``FLAGS.real_vocab_size``,
    writes log lines through ``mylog``/``mylog_section``, and overwrites
    ``FLAGS.dump_file``.
    """
    mylog_section("READ DATA")
    test_data_bucket, _buckets, test_data_order = read_test(
        FLAGS.data_cache_dir, FLAGS.test_path,
        get_vocab_path(FLAGS.data_cache_dir), FLAGS.L, FLAGS.n_bucket)
    vocab_path = get_vocab_path(FLAGS.data_cache_dir)
    real_vocab_size = get_real_vocab_size(vocab_path)

    # Publish the data-dependent settings on FLAGS; presumably create_model
    # reads them from there -- TODO confirm.
    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size

    test_bucket_sizes = [
        len(test_data_bucket[b]) for b in range(len(_buckets))
    ]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog_section("REPORT")

    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets:{}".format(FLAGS._buckets))
    mylog("DUMP_LSTM:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth
    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL")

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)

        mylog("Init tensors to dump")
        model.init_dump_states()

        mylog_section("All Variables")
        show_all_variables()

        # NOTE(review): 1.0 is presumably a keep-probability, i.e. dropout is
        # switched off for inference -- confirm against the model definition.
        sess.run(model.dropoutRate.assign(1.0))

        batch_size = FLAGS.batch_size

        mylog_section("Data Iterators")

        dite = DataIterator(model,
                            test_data_bucket,
                            len(_buckets),
                            batch_size,
                            None,
                            data_order=test_data_order)
        ite = dite.next_original()

        # The context manager closes the dump file even when a step or a
        # save raises part-way through the test set.
        with open(FLAGS.dump_file, 'wb') as fdump:

            mylog_section("DUMP_LSTM")

            for i_sent, (inputs, outputs, weights, bucket_id) in enumerate(ite):
                # inputs: [[_GO],[1],[2],[3],[_EOS],[pad_id],[pad_id]]
                # positions: [4]

                mylog("--- decoding {}/{} sent ---".format(i_sent,
                                                           test_total_size))

                # With dump_lstm=True the step returns the loss and the
                # states to be dumped.
                L, states = model.step(sess,
                                       inputs,
                                       outputs,
                                       weights,
                                       bucket_id,
                                       forward_only=True,
                                       dump_lstm=True)

                mylog("LOSS: {}".format(L))

                sw = StateWrapper()
                sw.create(inputs, outputs, weights, states)
                sw.save_to_stream(fdump)
Esempio n. 3
0
def force_decode():
    """Score the test set with the model and write one value per step.

    Loads the test data with ``read_test``, builds the model inside a fresh
    TensorFlow session, and for every item yielded by
    ``DataIterator.next_original`` runs a forward-only ``model.step`` with
    ``dump_lstm=False``. The value returned by each step is logged and
    written on its own line to ``FLAGS.score_file``.

    Side effects: sets ``FLAGS._buckets`` and ``FLAGS.real_vocab_size``,
    writes log lines through ``mylog``/``mylog_section``, and overwrites
    ``FLAGS.score_file``.
    """
    mylog_section("READ DATA")
    # Read the test data. The test set does not need a new vocabulary; the
    # one that was already built is reused.
    test_data_bucket, _buckets, test_data_order = read_test(
        FLAGS.data_cache_dir, FLAGS.test_path,
        get_vocab_path(FLAGS.data_cache_dir), FLAGS.L, FLAGS.n_bucket)
    vocab_path = get_vocab_path(FLAGS.data_cache_dir)
    real_vocab_size = get_real_vocab_size(vocab_path)

    # Publish the data-dependent settings on FLAGS; presumably create_model
    # reads them from there -- TODO confirm.
    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size

    test_bucket_sizes = [
        len(test_data_bucket[b]) for b in range(len(_buckets))
    ]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog_section("REPORT")
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets:{}".format(FLAGS._buckets))
    mylog("FORCE_DECODE:")
    mylog("total: {}".format(test_total_size))
    mylog("bucket_sizes: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    mylog_section("IN TENSORFLOW")
    with tf.Session(config=config) as sess:
        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)

        mylog_section("All Variables")
        show_all_variables()

        # NOTE(review): 1.0 is presumably a keep-probability, i.e. dropout is
        # switched off for inference -- confirm against the model definition.
        sess.run(model.dropoutRate.assign(1.0))
        batch_size = FLAGS.batch_size
        mylog_section("Data Iterators")
        dite = DataIterator(model,
                            test_data_bucket,
                            len(_buckets),
                            batch_size,
                            None,
                            data_order=test_data_order)
        ite = dite.next_original()

        # The context manager closes (and flushes) the score file even when
        # a step raises part-way through the test set.
        with open(FLAGS.score_file, 'w') as fdump:
            mylog_section("FORCE_DECODING")
            for i_sent, (inputs, outputs, weights, bucket_id) in enumerate(ite):
                # inputs: [[_GO],[1],[2],[3],[_EOS],[pad_id],[pad_id]]
                # positions: [4]
                mylog("--- decoding {}/{} sent ---".format(i_sent,
                                                           test_total_size))
                L = model.step(sess,
                               inputs,
                               outputs,
                               weights,
                               bucket_id,
                               forward_only=True,
                               dump_lstm=False)
                mylog("LOSS: {}".format(L))
                fdump.write("{}\n".format(L))
Esempio n. 4
0
def beam_decode():
    """Beam-search decode every item of the test set.

    Loads the test data with ``read_test``, builds the model, initialises its
    beam decoder with ``FLAGS.beam_size`` beams and ``FLAGS.beam_step`` steps,
    and then, for every item yielded by ``DataIterator.next_sequence``,
    expands the beams step by step: each step's candidates are scored by
    adding the log of the model's top values to the running beam score, and
    the ``FLAGS.beam_size`` best candidates are kept. When
    ``FLAGS.no_repeat`` is set, candidates whose word already occurs in their
    parent beam's sentence are dropped. The resulting sentences are only
    printed (when ``FLAGS.print_beam`` is set); nothing is returned or saved.

    Original author's notes: not yet tested; known issue -- should use
    ``next_original`` instead of ``next_sequence``.

    Side effects: sets ``FLAGS._buckets`` and ``FLAGS.real_vocab_size`` and
    writes to the log and to stdout.
    """
    mylog("Reading Data...")
    test_data_bucket, _buckets, test_data_order = read_test(
        FLAGS.data_cache_dir, FLAGS.test_path,
        get_vocab_path(FLAGS.data_cache_dir), FLAGS.L, FLAGS.n_bucket)
    vocab_path = get_vocab_path(FLAGS.data_cache_dir)
    real_vocab_size = get_real_vocab_size(vocab_path)

    # Publish the data-dependent settings on FLAGS; presumably create_model
    # reads them from there -- TODO confirm.
    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size

    test_bucket_sizes = [
        len(test_data_bucket[b]) for b in range(len(_buckets))
    ]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets:{}".format(FLAGS._buckets))
    mylog("BEAM_DECODE:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)
        mylog("before init_beam_decoder()")
        show_all_variables()
        model.init_beam_decoder(beam_size=FLAGS.beam_size,
                                max_steps=FLAGS.beam_step)
        model.init_beam_variables(sess)
        mylog("after init_beam_decoder()")
        show_all_variables()

        # NOTE(review): 1.0 is presumably a keep-probability, i.e. dropout is
        # switched off for inference -- confirm against the model definition.
        sess.run(model.dropoutRate.assign(1.0))

        batch_size = FLAGS.batch_size

        dite = DataIterator(model, test_data_bucket, len(_buckets), batch_size,
                            None)
        ite = dite.next_sequence(stop=True, test=True)

        i_sent = 0
        for inputs, positions, valids, bucket_id in ite:
            # user : [0]
            # inputs: [[_GO],[1],[2],[3],[_EOS],[pad_id],[pad_id]]
            # positions: [4]

            # The original referenced an undefined name (n_total_user) here,
            # which raised NameError on the first iteration; the total that
            # is actually computed above is test_total_size.
            print("--- decoding {}/{} sent ---".format(i_sent,
                                                       test_total_size))
            i_sent += 1

            # do the following convert:
            # inputs: [[pad_id],[1],[2],[pad_id],[pad_id],[pad_id]]
            # positions:[2]
            PAD_ID = 0
            last_history = inputs[positions[0]]
            inputs_beam = [last_history * FLAGS.beam_size]
            inputs[positions[0]] = list([PAD_ID] * FLAGS.beam_size)
            inputs[positions[0] - 1] = list([PAD_ID] * FLAGS.beam_size)
            positions[0] = positions[0] - 2 if positions[0] >= 2 else 0
            scores = [0.0] * FLAGS.beam_size
            sentences = [[] for x in range(FLAGS.beam_size)]
            # A concrete list on both Python 2 and 3, matching the list that
            # replaces it after the first step.
            beam_parent = list(range(FLAGS.beam_size))

            for i in range(FLAGS.beam_step):
                if i == 0:
                    # First step: feed the history and the initial beam input.
                    top_value, top_index = model.beam_step(
                        sess,
                        index=i,
                        word_inputs_history=inputs,
                        sequence_length=positions,
                        word_inputs_beam=inputs_beam)
                else:
                    # Later steps: feed the chosen words and their parents.
                    top_value, top_index = model.beam_step(
                        sess,
                        index=i,
                        word_inputs_beam=inputs_beam,
                        beam_parent=beam_parent)

                # expand
                global_queue = []

                # Only the first row is expanded on the first step.
                if i == 0:
                    nrow = 1
                else:
                    nrow = top_index[0].shape[0]

                for row in range(nrow):
                    for col in range(top_index[0].shape[1]):
                        score = scores[row] + np.log(top_value[0][row, col])
                        word_index = top_index[0][row, col]
                        beam_index = row

                        # With no_repeat, skip words already in this beam.
                        if FLAGS.no_repeat and word_index in sentences[beam_index]:
                            continue
                        global_queue.append((score, beam_index, word_index))

                # Highest score first.
                global_queue = sorted(global_queue, key=lambda x: -x[0])

                inputs_beam = []
                beam_parent = []
                scores = []
                temp_sentences = []

                if FLAGS.print_beam:
                    print("--------- Step {} --------".format(i))

                # Keep the beam_size best candidates as the next beams.
                for j, (score, beam_index, word_index) in enumerate(
                        global_queue[:FLAGS.beam_size]):
                    if FLAGS.print_beam:
                        print("Beam:{} Father:{} word:{} score:{}".format(
                            j, beam_index, word_index, score))
                    beam_parent.append(beam_index)
                    inputs_beam.append(word_index)
                    scores.append(score)
                    temp_sentences.append(sentences[beam_index] + [word_index])

                inputs_beam = [inputs_beam]
                sentences = temp_sentences

            if FLAGS.print_beam:
                print(sentences)