def create_tf_record(source_files, vocab_files, out_dir, mode, total_shards):
    input_encoder = tokenizer.Subtokenizer(vocab_files[0])
    target_encoder = tokenizer.Subtokenizer(vocab_files[1])
    shard_files = [
        shard_filename(out_dir, mode, n + 1, total_shards)
        for n in range(total_shards)
    ]
    writers = [tf.python_io.TFRecordWriter(fname) for fname in shard_files]

    input_file = source_files[0]
    target_file = source_files[1]
    counter = 0
    shard = 0
    for input_line, target_line in zip(iterator_file(input_file),
                                       iterator_file(target_file)):
        counter += 1

        if counter > 0 and counter % 100000 == 0:
            tf.logging.info("\tSaving case %d." % counter)

        example_dict = {
            'inputs': input_encoder.encode(input_line, True),
            'targets': target_encoder.encode(target_line, True)
        }

        example = dict_to_example(example_dict)
        writers[shard].write(example.SerializeToString())
        shard = (shard + 1) % total_shards

    # Close every shard writer once all examples have been written.
    for writer in writers:
        writer.close()
    tf.logging.info("Saved %d examples across %d shard files." % (counter, total_shards))
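
`shard_filename`, `iterator_file`, and `dict_to_example` are not part of this excerpt. A minimal sketch of helpers with those shapes, assuming line-oriented text files and integer subtoken IDs (the names exist in the snippet, but the shard-naming scheme and feature encoding below are assumptions):

# Hypothetical helpers; chosen to match how they are used above, not taken
# from the project's actual code.
import os

import tensorflow as tf


def shard_filename(path, mode, shard_num, total_shards):
    # e.g. <path>/translate-train-00001-of-00010
    return os.path.join(
        path, "translate-%s-%.5d-of-%.5d" % (mode, shard_num, total_shards))


def iterator_file(filename):
    # Yield one stripped line at a time so large corpora never sit in memory.
    with tf.gfile.Open(filename) as f:
        for line in f:
            yield line.strip()


def dict_to_example(dictionary):
    # Wrap each list of subtoken IDs in an int64 feature of a tf.train.Example.
    features = {}
    for name, ids in dictionary.items():
        features[name] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=ids))
    return tf.train.Example(features=tf.train.Features(feature=features))
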
def _init_subtokenizer(self, vocab_list):
    """Writes vocab_list to a temporary vocab file and builds a Subtokenizer."""
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with tf.gfile.Open(temp_file.name, 'w') as w:
        for subtoken in vocab_list:
            w.write("'%s'" % subtoken)
            w.write("\n")
    return tokenizer.Subtokenizer(temp_file.name, reserved_tokens=[])
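
`_init_subtokenizer` looks like a test fixture. A hedged example of how a test case might use it; the class name, vocab list, and round-trip string are illustrative only:

# Hypothetical test usage; vocab and input string are illustrative.
class SubtokenizerTest(tf.test.TestCase):

    def _init_subtokenizer(self, vocab_list):
        ...  # as defined above

    def test_encode_decode_round_trip(self):
        subtokenizer = self._init_subtokenizer(["123_", "test", "ing_"])
        ids = subtokenizer.encode("testing 123")
        self.assertEqual("testing 123", subtokenizer.decode(ids))
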
Example #3
def evaluate_and_log_bleu(estimator, bleu_source, bleu_ref, vocab_file):
    """Calculate and record the BLEU score."""
    subtokenizer = tokenizer.Subtokenizer(vocab_file)

    uncased_score, cased_score = translate_and_compute_bleu(
        estimator, subtokenizer, bleu_source, bleu_ref)

    tf.logging.info("Bleu score (uncased):", uncased_score)
    tf.logging.info("Bleu score (cased):", cased_score)
    return uncased_score, cased_score
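
`translate_and_compute_bleu` is not shown in this excerpt. A rough sketch of what it might do, assuming a file-level translate helper like the `translate_file` used elsewhere in these snippets and an assumed `compute_bleu(ref_file, hyp_file, case_sensitive)` metric:

# Hypothetical sketch; translate_file and compute_bleu are assumed helpers
# defined elsewhere in the project.
import os
import tempfile


def translate_and_compute_bleu(estimator, subtokenizer, bleu_source, bleu_ref):
    # Translate the source file into a temporary hypothesis file. The same
    # subtokenizer is used for both sides since only one vocab is available.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()
    translate_file(estimator, subtokenizer, subtokenizer, bleu_source, tmp.name)

    # Score the hypotheses against the references, with and without casing.
    uncased_score = compute_bleu(bleu_ref, tmp.name, case_sensitive=False)
    cased_score = compute_bleu(bleu_ref, tmp.name, case_sensitive=True)
    os.remove(tmp.name)
    return uncased_score, cased_score
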
Example #4
def main(unused_argv):
    from transformer import transformer_main

    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.text is None and FLAGS.file is None:
        tf.logging.warn(
            "Nothing to translate. Make sure to call this script using "
            "flags --text or --file.")
        return

    input_encoder = tokenizer.Subtokenizer(FLAGS.input_vocab_file)
    target_encoder = tokenizer.Subtokenizer(FLAGS.target_vocab_file)

    # Set up estimator and params
    params = transformer_main.PARAMS_MAP[FLAGS.param_set]
    params["beam_size"] = _BEAM_SIZE
    params["alpha"] = _ALPHA
    params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
    params["batch_size"] = _DECODE_BATCH_SIZE
    estimator = tf.estimator.Estimator(model_fn=transformer_main.model_fn,
                                       model_dir=FLAGS.model_dir,
                                       params=params)

    if FLAGS.text is not None:
        tf.logging.info("Translating text: %s" % FLAGS.text)
        translate_text(estimator, input_encoder, target_encoder, FLAGS.text)

    if FLAGS.file is not None:
        input_file = os.path.abspath(FLAGS.file)
        tf.logging.info("Translating file: %s" % input_file)
        if not tf.gfile.Exists(FLAGS.file):
            raise ValueError("File does not exist: %s" % input_file)

        output_file = None
        if FLAGS.file_out is not None:
            output_file = os.path.abspath(FLAGS.file_out)
            tf.logging.info("File output specified: %s" % output_file)

        translate_file(estimator, input_encoder, target_encoder, input_file,
                       output_file)
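
The `FLAGS` and decode constants referenced in `main` are defined elsewhere in the script. A hedged sketch of what those definitions could look like with absl-style flags; only the flag names come from the snippet, while defaults, help strings, and constant values are illustrative:

# Hypothetical flag and constant definitions; values are placeholders.
from absl import app, flags

FLAGS = flags.FLAGS

flags.DEFINE_string("text", None, "Text to translate.")
flags.DEFINE_string("file", None, "File to translate, one sentence per line.")
flags.DEFINE_string("file_out", None, "Where to write the translated file.")
flags.DEFINE_string("input_vocab_file", None, "Subtoken vocab for the source language.")
flags.DEFINE_string("target_vocab_file", None, "Subtoken vocab for the target language.")
flags.DEFINE_string("model_dir", "/tmp/transformer_model", "Trained model directory.")
flags.DEFINE_string("param_set", "base", "Transformer parameter set, e.g. 'base' or 'big'.")

_BEAM_SIZE = 4
_ALPHA = 0.6
_EXTRA_DECODE_LENGTH = 100
_DECODE_BATCH_SIZE = 32

if __name__ == "__main__":
    app.run(main)
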
    totalLineNum = int(os.popen('wc -l ' + zh_source_file).read().split()[0])
    global start_time
    start_time = time.time()
    for i, line in enumerate(iterator_file(zh_source_file)):
        line = line.replace("\r", "").replace("\n", "")
        percent = 100.0 * i / totalLineNum
        duration = time.time() - start_time
        sys.stdout.write("\r%.2f%% cutting line %d of %d, %d sec passed. " %
                         (percent, i, totalLineNum, duration))
        sys.stdout.flush()
        zh_subtoken_list.extend(jieba.lcut(line))
    print("cut list done.")

    print("tokenizing zh vocab..")
    targets_tokenizer = tokenizer.Subtokenizer(zh_vocab, zh_subtoken_list)
    print("tokenizing zh vocab done.")

    # data_dir = './train_data'
    data_dir = '/tmp/t2t_datagen/'

    print("create train_tfrecord")

    create_tf_record([en_source_file, zh_source_file], [en_vocab, zh_vocab],
                     data_dir, 'train', 10)

    create_tf_record([
        os.path.join(source_dir, _DEV_DATA['inputs']),
        os.path.join(source_dir, _DEV_DATA['targets'])
    ], [en_vocab, zh_vocab], data_dir, 'dev', 1)
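
For completeness, a hedged sketch of how the generated shards could be read back with tf.data. The feature names match `example_dict` in `create_tf_record`; the glob pattern, function name, and everything else below are assumptions:

# Hypothetical reader; assumes the shards live under data_dir and that the
# glob pattern matches whatever names shard_filename produces.
import tensorflow as tf


def parse_example(serialized):
    # Parse the variable-length int64 features written by dict_to_example.
    parsed = tf.parse_single_example(serialized, {
        "inputs": tf.VarLenFeature(tf.int64),
        "targets": tf.VarLenFeature(tf.int64),
    })
    inputs = tf.sparse_tensor_to_dense(parsed["inputs"])
    targets = tf.sparse_tensor_to_dense(parsed["targets"])
    return inputs, targets


train_files = tf.gfile.Glob('/tmp/t2t_datagen/*train*')
dataset = tf.data.TFRecordDataset(train_files).map(parse_example)
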