Example #1
def force_decode():
    # Force decode: generate a file that contains every sentence's score and the final score.
    mylog_section("READ DATA")
    # Read the test data; no new vocabulary is built for the test set, the existing one is reused directly.
    test_data_bucket, _buckets, test_data_order = read_test(
        FLAGS.data_cache_dir, FLAGS.test_path,
        get_vocab_path(FLAGS.data_cache_dir), FLAGS.L, FLAGS.n_bucket)
    vocab_path = get_vocab_path(FLAGS.data_cache_dir)
    real_vocab_size = get_real_vocab_size(vocab_path)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size

    test_bucket_sizes = [
        len(test_data_bucket[b]) for b in range(len(_buckets))
    ]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog_section("REPORT")
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets:{}".format(FLAGS._buckets))
    mylog("FORCE_DECODE:")
    mylog("total: {}".format(test_total_size))
    mylog("bucket_sizes: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    mylog_section("IN TENSORFLOW")
    with tf.Session(config=config) as sess:
        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)

        mylog_section("All Variables")
        show_all_variables()

        sess.run(model.dropoutRate.assign(1.0))
        batch_size = FLAGS.batch_size
        mylog_section("Data Iterators")
        dite = DataIterator(model,
                            test_data_bucket,
                            len(_buckets),
                            batch_size,
                            None,
                            data_order=test_data_order)
        ite = dite.next_original()

        fdump = open(FLAGS.score_file, 'w')
        i_sent = 0

        mylog_section("FORCE_DECODING")
        for inputs, outputs, weights, bucket_id in ite:
            # inputs: [[_GO],[1],[2],[3],[_EOS],[pad_id],[pad_id]]
            # positions: [4]
            mylog("--- decoding {}/{} sent ---".format(i_sent,
                                                       test_total_size))
            i_sent += 1
            L = model.step(sess,
                           inputs,
                           outputs,
                           weights,
                           bucket_id,
                           forward_only=True,
                           dump_lstm=False)
            mylog("LOSS: {}".format(L))
            fdump.write("{}\n".format(L))
        fdump.close()
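
Force decoding here simply runs the model forward over each gold (input, output) pair and records the loss it assigns; nothing is searched. A minimal sketch of the underlying scoring arithmetic, using hypothetical per-token probabilities and plain numpy instead of the model.step call above:

import numpy as np

def sentence_score(gold_token_probs):
    # Sum of the log-probabilities the model assigned to each gold token;
    # less negative means the model finds the sentence more likely.
    return float(np.sum(np.log(gold_token_probs)))

# Hypothetical probabilities for the four tokens of one test sentence.
probs = np.array([0.42, 0.10, 0.73, 0.05])
print("score: {}".format(sentence_score(probs)))
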
Example #2
def dump_lstm():
    # dump the hidden states somewhere
    mylog_section("READ DATA")
    test_data_bucket, _buckets, test_data_order = read_test(
        FLAGS.data_cache_dir, FLAGS.test_path,
        get_vocab_path(FLAGS.data_cache_dir), FLAGS.L, FLAGS.n_bucket)
    vocab_path = get_vocab_path(FLAGS.data_cache_dir)
    real_vocab_size = get_real_vocab_size(vocab_path)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size

    test_bucket_sizes = [
        len(test_data_bucket[b]) for b in range(len(_buckets))
    ]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog_section("REPORT")

    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets:{}".format(FLAGS._buckets))
    mylog("DUMP_LSTM:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth
    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL")

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)

        mylog("Init tensors to dump")
        model.init_dump_states()

        # dump_graph('graph.txt')
        mylog_section("All Variables")
        show_all_variables()

        sess.run(model.dropoutRate.assign(1.0))

        start_id = 0
        n_steps = 0
        batch_size = FLAGS.batch_size

        mylog_section("Data Iterators")

        dite = DataIterator(model,
                            test_data_bucket,
                            len(_buckets),
                            batch_size,
                            None,
                            data_order=test_data_order)
        ite = dite.next_original()

        fdump = open(FLAGS.dump_file, 'wb')

        mylog_section("DUMP_LSTM")

        i_sent = 0
        for inputs, outputs, weights, bucket_id in ite:
            # inputs: [[_GO],[1],[2],[3],[_EOS],[pad_id],[pad_id]]
            # positions: [4]

            mylog("--- decoding {}/{} sent ---".format(i_sent,
                                                       test_total_size))
            i_sent += 1
            # print(inputs)
            # print(outputs)
            # print(weights)
            # print(bucket_id)

            L, states = model.step(sess,
                                   inputs,
                                   outputs,
                                   weights,
                                   bucket_id,
                                   forward_only=True,
                                   dump_lstm=True)

            mylog("LOSS: {}".format(L))

            sw = StateWrapper()
            sw.create(inputs, outputs, weights, states)
            sw.save_to_stream(fdump)

            # do the following conversion:
            # inputs: [[pad_id],[1],[2],[pad_id],[pad_id],[pad_id]]
            # positions:[2]

        fdump.close()
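
StateWrapper.save_to_stream is project-specific, but the loop above follows a simple pattern: serialize one record per sentence to an already-open binary stream. A sketch of that pattern with pickle, using hypothetical arrays in place of the real LSTM states:

import pickle
import numpy as np

def dump_record(stream, inputs, outputs, weights, states):
    # Append one self-contained record per sentence to the open binary stream.
    pickle.dump({"inputs": inputs, "outputs": outputs,
                 "weights": weights, "states": states}, stream)

def load_records(path):
    # Read records back until the stream is exhausted.
    with open(path, "rb") as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return

with open("states.dump", "wb") as fdump:
    dump_record(fdump, [1, 2, 3], [2, 3, 4], [1.0, 1.0, 1.0],
                np.zeros((3, 8)))  # hypothetical hidden states
for record in load_records("states.dump"):
    print(record["states"].shape)
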
Example #3
def beam_decode():

    mylog("Reading Data...")

    from_test = None

    from_vocab_path, to_vocab_path, real_vocab_size_from, real_vocab_size_to = data_utils.get_vocab_info(
        FLAGS.data_cache_dir)

    FLAGS._buckets = _buckets
    FLAGS._beam_buckets = _beam_buckets
    FLAGS.real_vocab_size_from = real_vocab_size_from
    FLAGS.real_vocab_size_to = real_vocab_size_to

    from_test = data_utils.prepare_test_data(FLAGS.data_cache_dir,
                                             FLAGS.test_path_from,
                                             from_vocab_path)

    test_data_bucket, test_data_order = read_data_test(from_test)

    test_bucket_sizes = [
        len(test_data_bucket[b]) for b in xrange(len(_beam_buckets))
    ]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog("from_vocab_size: {}".format(FLAGS.from_vocab_size))
    mylog("to_vocab_size: {}".format(FLAGS.to_vocab_size))
    mylog("_beam_buckets: {}".format(FLAGS._beam_buckets))
    mylog("BEAM_DECODE:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)
        show_all_variables()

        sess.run(model.dropoutRate.assign(1.0))

        start_id = 0
        n_steps = 0
        batch_size = FLAGS.batch_size

        dite = DataIterator(model,
                            test_data_bucket,
                            len(_beam_buckets),
                            batch_size,
                            None,
                            data_order=test_data_order)
        ite = dite.next_original()

        i_sent = 0

        targets = []

        for source_inputs, bucket_id, length in ite:

            print("--- decoding {}/{} sent ---".format(i_sent,
                                                       test_total_size))
            i_sent += 1

            results = []  # (sentence,score)
            scores = [0.0] * FLAGS.beam_size
            sentences = [[] for x in xrange(FLAGS.beam_size)]
            beam_parent = range(FLAGS.beam_size)

            target_inputs = [data_utils.GO_ID] * FLAGS.beam_size
            min_target_length = int(length * FLAGS.min_ratio) + 1
            max_target_length = int(
                length * FLAGS.max_ratio) + 1  # include EOS
            for i in xrange(max_target_length):
                if i == 0:
                    top_value, top_index, eos_value = model.beam_step(
                        sess,
                        bucket_id,
                        index=i,
                        sources=source_inputs,
                        target_inputs=target_inputs)
                else:

                    top_value, top_index, eos_value = model.beam_step(
                        sess,
                        bucket_id,
                        index=i,
                        target_inputs=target_inputs,
                        beam_parent=beam_parent)

                # top_value = [array[batch_size, batch_size]]
                # top_index = [array[batch_size, batch_size]]
                # eos_value = [array[batch_size, 1] ]

                # expand
                global_queue = []

                if i == 0:
                    nrow = 1
                else:
                    nrow = FLAGS.beam_size

                if i == max_target_length - 1:  # last_step
                    for row in xrange(nrow):

                        score = scores[row] + np.log(eos_value[0][row, 0])
                        word_index = data_utils.EOS_ID
                        beam_index = row
                        global_queue.append((score, beam_index, word_index))

                else:
                    for row in xrange(nrow):
                        for col in xrange(top_index[0].shape[1]):
                            score = scores[row] + np.log(top_value[0][row,
                                                                      col])
                            word_index = top_index[0][row, col]
                            beam_index = row

                            global_queue.append(
                                (score, beam_index, word_index))

                global_queue = sorted(global_queue, key=lambda x: -x[0])

                if FLAGS.print_beam:
                    print("--------- Step {} --------".format(i))

                target_inputs = []
                beam_parent = []
                scores = []
                temp_sentences = []

                for j, (score, beam_index,
                        word_index) in enumerate(global_queue):
                    if word_index == data_utils.EOS_ID:
                        if len(sentences[beam_index]) + 1 < min_target_length:
                            continue

                        results.append(
                            (sentences[beam_index] + [word_index], score))
                        if FLAGS.print_beam:
                            print("*Beam:{} Father:{} word:{} score:{}".format(
                                j, beam_index, word_index, score))
                        continue

                    if FLAGS.print_beam:
                        print("Beam:{} Father:{} word:{} score:{}".format(
                            j, beam_index, word_index, score))
                    beam_parent.append(beam_index)

                    target_inputs.append(word_index)
                    scores.append(score)
                    temp_sentences.append(sentences[beam_index] + [word_index])

                    if len(scores) >= FLAGS.beam_size:
                        break

                # can not fill beam_size, just repeat the last one
                while len(scores) < FLAGS.beam_size and i < max_target_length - 1:
                    beam_parent.append(beam_parent[-1])
                    target_inputs.append(target_inputs[-1])
                    scores.append(scores[-1])
                    temp_sentences.append(temp_sentences[-1])

                sentences = temp_sentences

            # print the 1 best
            results = sorted(results, key=lambda x: -x[1])

            targets.append(results[0][0])

        data_utils.ids_to_tokens(targets, to_vocab_path, FLAGS.decode_output)
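
The heart of beam_decode is the expansion block: every live hypothesis (row) is extended by each of its top candidate words (col), the candidate's score is the parent score plus the log-probability of the new word, and only the best beam_size candidates survive, each remembering its parent via beam_parent. A stand-alone sketch of one expansion step with hypothetical numbers:

import numpy as np

beam_size = 2
scores = [-0.1, -0.3]                      # running log-probability of each hypothesis
top_value = np.array([[0.6, 0.3],          # P(word | hypothesis 0), best two words
                      [0.5, 0.2]])         # P(word | hypothesis 1)
top_index = np.array([[7, 12],             # the corresponding word ids
                      [7, 30]])

candidates = []
for row in range(beam_size):               # each parent hypothesis
    for col in range(top_value.shape[1]):  # each candidate continuation
        candidates.append((scores[row] + np.log(top_value[row, col]),
                           row,             # beam_parent: which hypothesis it extends
                           top_index[row, col]))

candidates.sort(key=lambda x: -x[0])       # best score first
survivors = candidates[:beam_size]
print(survivors)  # [(score, parent, word_id), ...] feeds the next step
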
Example #4
def beam_decode():
    mylog("Reading Data...")
    from_test = None
    from_vocab_path, to_vocab_path, real_vocab_size_from, real_vocab_size_to = data_util.get_vocab_info(
        FLAGS.data_cache_dir)

    FLAGS._buckets = _buckets
    FLAGS._beam_buckets = _beam_buckets
    FLAGS.real_vocab_size_from = real_vocab_size_from
    FLAGS.real_vocab_size_to = real_vocab_size_to

    # Get the path of the test file after it has been converted to token ids.
    from_test = data_util.prepare_test_data(FLAGS.data_cache_dir,
                                            FLAGS.test_path_from,
                                            from_vocab_path)

    test_data_bucket, test_data_order = read_data_test(from_test)

    test_bucket_sizes = [
        len(test_data_bucket[b]) for b in xrange(len(_beam_buckets))
    ]
    test_total_size = int(sum(test_bucket_sizes))
    # reports
    mylog("from_vocab_size: {}".format(FLAGS.from_vocab_size))
    mylog("to_vocab_size: {}".format(FLAGS.to_vocab_size))
    mylog("_beam_buckets: {}".format(FLAGS._beam_buckets))
    mylog("BEAM_DECODE:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth
    with tf.Session(config=config) as sess:
        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)
        show_all_variables()

        sess.run(model.dropoutRate.assign(1.0))
        batch_size = FLAGS.batch_size

        dite = DataIterator(model,
                            test_data_bucket,
                            len(_beam_buckets),
                            batch_size,
                            None,
                            data_order=test_data_order)
        ite = dite.next_original()
        i_sent = 0
        targets = []

        for source_inputs, bucket_id, length in ite:
            print("--- decoding {}/{} sent ---".format(i_sent,
                                                       test_total_size))
            i_sent += 1

            results = []  # (sentence,score)
            scores = [0.0] * FLAGS.beam_size
            sentences = [[] for x in xrange(FLAGS.beam_size)]
            beam_parent = range(FLAGS.beam_size)

            target_inputs = [data_util.GO_ID] * FLAGS.beam_size
            min_target_length = int(length * FLAGS.min_ratio) + 1
            max_target_length = int(
                length * FLAGS.max_ratio) + 1  # include EOS
            for i in xrange(max_target_length):
                if i == 0:
                    top_value, top_index, eos_value = model.beam_step(
                        sess,
                        bucket_id,
                        index=i,
                        sources=source_inputs,
                        target_inputs=target_inputs)
                else:
                    top_value, top_index, eos_value = model.beam_step(
                        sess,
                        bucket_id,
                        index=i,
                        target_inputs=target_inputs,
                        beam_parent=beam_parent)
                # expand
                # Re-created before each word is predicted: holds (score, parent, word)
                # for every candidate extension, so the best sentences can be picked
                # by ranking the scores.
                global_queue = []
                # On the first decoder step only the first row is a real hypothesis;
                # its predictions become the inputs of the next step.
                if i == 0:
                    nrow = 1
                else:
                    nrow = FLAGS.beam_size

                if i == max_target_length - 1:  # last_step
                    for row in xrange(nrow):
                        score = scores[row] + np.log(eos_value[0][row, 0])
                        word_index = data_util.EOS_ID
                        beam_index = row
                        global_queue.append((score, beam_index, word_index))
                else:
                    # Loop over every parent hypothesis (row); each column of
                    # top_index is one candidate word predicted for that parent.
                    for row in xrange(nrow):
                        for col in xrange(top_index[0].shape[1]):
                            # New score = parent sentence's score plus the log
                            # probability of the generated word (i.e. multiplying
                            # the probabilities in log space).
                            score = scores[row] + np.log(top_value[0][row, col])
                            word_index = top_index[0][row, col]
                            beam_index = row  # index of the parent beam

                            global_queue.append(
                                (score, beam_index, word_index))
                global_queue = sorted(global_queue, key=lambda x: -x[0])
                if FLAGS.print_beam:
                    print("--------- Step {} --------".format(i))
                target_inputs = []
                beam_parent = []
                scores = []
                temp_sentences = []
                # Keep the top beam_size entries of the sorted global_queue in
                # target_inputs, beam_parent, scores and temp_sentences for the next step.
                for j, (score, beam_index,
                        word_index) in enumerate(global_queue):
                    if word_index == data_util.EOS_ID:
                        if len(sentences[beam_index]) + 1 < min_target_length:
                            continue
                        # Every finished sentence is appended to results.
                        results.append((sentences[beam_index] + [word_index],
                                        score))
                        if FLAGS.print_beam:
                            print("*Beam:{} Father:{} word:{} score:{}".format(
                                j, beam_index, word_index, score))
                        continue
                    if FLAGS.print_beam:
                        print("Beam:{} Father:{} word:{} score:{}".format(
                            j, beam_index, word_index, score))
                    beam_parent.append(beam_index)
                    target_inputs.append(word_index)
                    scores.append(score)
                    temp_sentences.append(sentences[beam_index] + [word_index])
                    # Keep only the top beam_size candidates for the next step.
                    if len(scores) >= FLAGS.beam_size:
                        break
                # Cannot fill beam_size slots: pad by repeating the last candidate.
                while len(scores) < FLAGS.beam_size and i < max_target_length - 1:
                    beam_parent.append(beam_parent[-1])
                    target_inputs.append(target_inputs[-1])
                    scores.append(scores[-1])
                    temp_sentences.append(temp_sentences[-1])
                sentences = temp_sentences
            # print the 1 best
            # Sort all predicted sentences for this source sentence.
            results = sorted(results, key=lambda x: -x[1])
            # Append the 1-best result to targets.
            targets.append(results[0][0])
        # Convert all predicted id sequences back to words and write them to the output file.
        data_util.ids_to_tokens(targets, to_vocab_path, FLAGS.decode_output)
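
The final line only maps the 1-best id sequences back to words. data_util.ids_to_tokens is project-specific; a minimal sketch of the same conversion, assuming the vocabulary is an id-ordered token list (rev_vocab, EOS_ID and the file name here are hypothetical):

# Hypothetical id -> token table; the real code loads it from to_vocab_path.
rev_vocab = ["_PAD", "_GO", "_EOS", "_UNK", "the", "cat", "sat"]
EOS_ID = 2

def ids_to_tokens(targets, rev_vocab, path):
    # Write one whitespace-joined sentence per line, dropping the EOS marker.
    with open(path, "w") as out:
        for ids in targets:
            out.write(" ".join(rev_vocab[i] for i in ids if i != EOS_ID) + "\n")

ids_to_tokens([[4, 5, 6, 2]], rev_vocab, "decode_output.txt")  # writes "the cat sat"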