def main(unused_argv):
  """Translate --text and/or --file using a trained RL transformer model.

  Builds source/target subtokenizers from the BPE vocab files in
  FLAGS.data_dir, derives the vocab sizes from those files, constructs an
  Estimator with the GAN model_fn, and runs translation.

  Raises:
    ValueError: If --file is given but does not exist.
  """
  import transformer_rl_main

  tf.logging.set_verbosity(tf.logging.INFO)

  # Nothing requested: bail out early.
  if FLAGS.text is None and FLAGS.file is None:
    tf.logging.warn(
        "Nothing to translate. Make sure to call this script using "
        "flags --text or --file.")
    return

  # Vocab files are named vocab.bpe.<search>.<lang>. Build each path once
  # (the original rebuilt the same paths for the vocab-size line counts).
  source_vocab_path = os.path.join(
      FLAGS.data_dir, 'vocab' + '.bpe.' + str(FLAGS.search) + '.' + FLAGS.fro)
  target_vocab_path = os.path.join(
      FLAGS.data_dir, 'vocab' + '.bpe.' + str(FLAGS.search) + '.' + FLAGS.to)
  subtokenizer_source = tokenizer.Subtokenizer(source_vocab_path)
  subtokenizer_target = tokenizer.Subtokenizer(target_vocab_path)

  # Set up estimator and params.
  params = transformer_rl_main.PARAMS_MAP[FLAGS.param_set]
  params.beam_size = _BEAM_SIZE
  params.alpha = _ALPHA
  params.extra_decode_length = _EXTRA_DECODE_LENGTH
  params.batch_size = _DECODE_BATCH_SIZE

  # Vocab sizes are the line counts of the vocab files. BUG FIX: use
  # context managers -- the original opened both files and never closed
  # either handle (the first `fp` was simply overwritten).
  with open(source_vocab_path, 'r') as fp:
    params.source_vocab_size = len(fp.readlines())
  with open(target_vocab_path, 'r') as fp:
    params.target_vocab_size = len(fp.readlines())

  estimator = tf.estimator.Estimator(
      model_fn=transformer_rl_main.gan_model_fn, model_dir=FLAGS.model_dir,
      params=params)

  if FLAGS.text is not None:
    tf.logging.info("Translating text: %s" % FLAGS.text)
    translate_text(estimator, subtokenizer_source, subtokenizer_target,
                   FLAGS.text)

  if FLAGS.file is not None:
    input_file = os.path.abspath(FLAGS.file)
    tf.logging.info("Translating file: %s" % input_file)
    if not tf.gfile.Exists(FLAGS.file):
      raise ValueError("File does not exist: %s" % input_file)

    output_file = None
    if FLAGS.file_out is not None:
      output_file = os.path.abspath(FLAGS.file_out)
      tf.logging.info("File output specified: %s" % output_file)

    translate_file(estimator, subtokenizer_source, subtokenizer_target,
                   input_file, output_file)
def evaluate_and_log_bleu(estimator, bleu_source, bleu_ref, vocab_file,
                          vocab_file_target):
  """Calculate and record the BLEU score.

  Args:
    estimator: Estimator used to generate the translations.
    bleu_source: File containing source sentences for translation.
    bleu_ref: File containing the reference translations.
    vocab_file: Source-language vocabulary file.
    vocab_file_target: Target-language vocabulary file.

  Returns:
    uncased_score: Case-insensitive BLEU score (float).
    cased_score: Case-sensitive BLEU score (float).
  """
  subtokenizer = tokenizer.Subtokenizer(vocab_file)
  subtokenizer_target = tokenizer.Subtokenizer(vocab_file_target)

  uncased_score, cased_score = translate_and_compute_bleu(
      estimator, subtokenizer, subtokenizer_target, bleu_source, bleu_ref)

  # BUG FIX: BLEU scores are floats; the original "%d" silently truncated
  # them to integers in the logs. Sibling implementations use "%s".
  tf.logging.info("Bleu score (uncased): %s", uncased_score)
  tf.logging.info("Bleu score (cased): %s", cased_score)
  return uncased_score, cased_score
def evaluate_and_log_bleu(model,
                          params,
                          bleu_source,
                          bleu_ref,
                          vocab_file,
                          distribution_strategy=None):
  """Compute BLEU for the model's translations and log both variants.

  Args:
    model: A Keras model, used to generate the translations.
    params: A dictionary, containing the translation related parameters.
    bleu_source: A file containing source sentences for translation.
    bleu_ref: A file containing the reference for the translated sentences.
    vocab_file: A file containing the vocabulary for translation.
    distribution_strategy: A platform distribution strategy, used for TPU
      based translation.

  Returns:
    uncased_score: A float, the case insensitive BLEU score.
    cased_score: A float, the case sensitive BLEU score.
  """
  sub = tokenizer.Subtokenizer(vocab_file)

  scores = translate_and_compute_bleu(
      model, params, sub, bleu_source, bleu_ref, distribution_strategy)
  uncased_score, cased_score = scores

  logging.info("Bleu score (uncased): %s", uncased_score)
  logging.info("Bleu score (cased): %s", cased_score)
  return uncased_score, cased_score
def _init_subtokenizer(self, vocab_list):
    """Write vocab_list to a temp vocab file and build a Subtokenizer on it."""
    # Vocab file format: one token per line, each wrapped in single quotes.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    with tf.gfile.Open(tmp.name, 'w') as vocab_writer:
      vocab_writer.write("".join("'%s'\n" % token for token in vocab_list))
    # No reserved tokens: the test vocab is used exactly as written.
    return tokenizer.Subtokenizer(tmp.name, reserved_tokens=[])
def evaluate_and_log_bleu(model, bleu_source, bleu_ref, vocab_file):
  """Translate bleu_source with the model, score against bleu_ref, log both."""
  # Subtokenizer used to encode sources and decode model outputs.
  bleu_subtokenizer = tokenizer.Subtokenizer(vocab_file)

  scores = translate_and_compute_bleu(
      model, bleu_subtokenizer, bleu_source, bleu_ref)
  uncased_score, cased_score = scores

  tf.compat.v1.logging.info("Bleu score (uncased): %s", uncased_score)
  tf.compat.v1.logging.info("Bleu score (cased): %s", cased_score)
  return uncased_score, cased_score
def main(unused_argv):
  """Translate --text and/or --file with a trained transformer checkpoint."""
  import transformer_main

  tf.logging.set_verbosity(tf.logging.INFO)

  # Nothing requested: warn and exit early.
  if FLAGS.text is None and FLAGS.file is None:
    tf.logging.warn(
        "Nothing to translate. Make sure to call this script using "
        "flags --text or --file.")
    return

  subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file)
  subtokenizer_target = tokenizer.Subtokenizer(FLAGS.vocab_file_en)

  # Set up estimator and params: decode-time overrides on the chosen set.
  params = transformer_main.PARAMS_MAP[FLAGS.param_set]
  decode_overrides = {
      "beam_size": _BEAM_SIZE,
      "alpha": _ALPHA,
      "extra_decode_length": _EXTRA_DECODE_LENGTH,
      "batch_size": _DECODE_BATCH_SIZE,
  }
  for key, value in decode_overrides.items():
    params[key] = value

  estimator = tf.estimator.Estimator(
      model_fn=transformer_main.model_fn,
      model_dir=FLAGS.model_dir,
      params=params)

  if FLAGS.text is not None:
    tf.logging.info("Translating text: %s" % FLAGS.text)
    translate_text(estimator, subtokenizer, subtokenizer_target, FLAGS.text)

  if FLAGS.file is not None:
    input_file = os.path.abspath(FLAGS.file)
    tf.logging.info("Translating file: %s" % input_file)
    if not tf.gfile.Exists(FLAGS.file):
      raise ValueError("File does not exist: %s" % input_file)

    output_file = None
    if FLAGS.file_out is not None:
      output_file = os.path.abspath(FLAGS.file_out)
      tf.logging.info("File output specified: %s" % output_file)

    # NOTE(review): translate_text receives subtokenizer_target but
    # translate_file does not -- confirm translate_file's signature; this
    # may be a dropped argument.
    translate_file(estimator, subtokenizer, input_file, output_file)
def main(unused_argv):
  """Translate --text and/or --file, optionally through a frozen graph."""
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

  # Bail out early when there is nothing to do.
  if FLAGS.text is None and FLAGS.file is None:
    tf.compat.v1.logging.warn("Nothing to translate. Make sure to call this script using "
                              "flags --text or --file.")
    return

  subtokenizer = tokenizer.Subtokenizer(
      os.path.join(FLAGS.data_dir, FLAGS.vocab_file))

  # Only the "base" and "big" hyperparameter sets are recognized.
  known_param_sets = {
      "base": model_params.TransformerBaseParams,
      "big": model_params.TransformerBigParams,
  }
  try:
    params = known_param_sets[FLAGS.params]
  except KeyError:
    raise ValueError("Invalid parameter set defined: %s."
                     "Expected 'base' or 'big.'" % FLAGS.params)

  # Decode-time settings layered onto the selected parameter object.
  params.beam_size = _BEAM_SIZE
  params.alpha = _ALPHA
  params.extra_decode_length = _EXTRA_DECODE_LENGTH
  params.batch_size = FLAGS.batch_size
  params.frozen_graph = FLAGS.input_graph

  # Add inter_op and intra_op parallelism thread settings to the session.
  sess_config = tf.compat.v1.ConfigProto(
      inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
      intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads,
      allow_soft_placement=True)
  run_config = tf.estimator.RunConfig(session_config=sess_config)

  estimator = tf.estimator.Estimator(
      model_fn=transformer_main.model_fn,
      model_dir=FLAGS.model_dir,
      params=params,
      config=run_config)

  if FLAGS.text is not None:
    tf.compat.v1.logging.info("Translating text: %s" % FLAGS.text)
    translate_text(estimator, subtokenizer, FLAGS.text)

  if FLAGS.file is not None:
    input_file = os.path.abspath(FLAGS.file)
    tf.compat.v1.logging.info("Translating file: %s" % input_file)
    if not tf.io.gfile.exists(FLAGS.file):
      raise ValueError("File does not exist: %s" % input_file)

    output_file = None
    if FLAGS.file_out is not None:
      output_file = os.path.abspath(FLAGS.file_out)
      tf.compat.v1.logging.info("File output specified: %s" % output_file)

    translate_file(estimator, subtokenizer, input_file, output_file,
                   FLAGS.batch_size, False)
def main(unused_argv):
  """Entry point: translate a text snippet and/or a file with the model."""
  tf.logging.set_verbosity(tf.logging.INFO)

  if FLAGS.text is None and FLAGS.file is None:
    tf.logging.warn(
        "Nothing to translate. Make sure to call this script using "
        "flags --text or --file.")
    return

  vocab_path = os.path.join(FLAGS.data_dir, FLAGS.vocab_file)
  subtokenizer = tokenizer.Subtokenizer(vocab_path)

  # Select the hyperparameter set; only "base" and "big" are valid.
  if FLAGS.params == "base":
    params = model_params.TransformerBaseParams
  elif FLAGS.params == "big":
    params = model_params.TransformerBigParams
  else:
    raise ValueError("Invalid parameter set defined: %s."
                     "Expected 'base' or 'big.'" % FLAGS.params)

  # Decode-time settings layered onto the selected parameter object.
  params.beam_size = _BEAM_SIZE
  params.alpha = _ALPHA
  params.extra_decode_length = _EXTRA_DECODE_LENGTH
  params.batch_size = _DECODE_BATCH_SIZE

  estimator = tf.estimator.Estimator(
      model_fn=transformer_main.model_fn,
      model_dir=FLAGS.model_dir,
      params=params)

  if FLAGS.text is not None:
    tf.logging.info("Translating text: %s" % FLAGS.text)
    translate_text(estimator, subtokenizer, FLAGS.text)

  if FLAGS.file is not None:
    input_file = os.path.abspath(FLAGS.file)
    tf.logging.info("Translating file: %s" % input_file)
    if not tf.gfile.Exists(FLAGS.file):
      raise ValueError("File does not exist: %s" % input_file)

    output_file = None
    if FLAGS.file_out is not None:
      output_file = os.path.abspath(FLAGS.file_out)
      tf.logging.info("File output specified: %s" % output_file)

    translate_file(estimator, subtokenizer, input_file, output_file)
def predict(self):
  """Restore the latest checkpoint and print translations for a few samples."""
  flags_obj = self.flags_obj
  params = self.params

  # Build an inference-mode model (is_train=False) and load weights from
  # the newest checkpoint in the model directory, if one exists.
  with tf.name_scope("model"):
    model = transformer.create_model(params, False)
    latest_ckpt = tf.train.latest_checkpoint(flags_obj.model_dir)
    self._load_weights_if_possible(model, latest_ckpt)
    model.summary()

  subtokenizer = tokenizer.Subtokenizer(flags_obj.vocab_file)

  # Keep only the model inputs and restrict to a single sample's worth.
  ds = data_pipeline.eval_input_fn(params)
  ds = ds.map(lambda features, _: features).take(_SINGLE_SAMPLE)

  val_outputs, _ = model.predict(ds)
  for output_ids in val_outputs:
    translate.translate_from_input(output_ids, subtokenizer)
def evaluate_and_log_bleu(estimator, bleu_writer, bleu_source, bleu_ref):
  """Compute BLEU for the estimator's translations and record to TensorBoard."""
  vocab_path = os.path.join(FLAGS.data_dir, FLAGS.vocab_file)
  subtokenizer = tokenizer.Subtokenizer(vocab_path)

  uncased_score, cased_score = translate_and_compute_bleu(
      estimator, subtokenizer, bleu_source, bleu_ref)

  print("Bleu score (uncased):", uncased_score)
  print("Bleu score (cased):", cased_score)

  # Emit both scores as summaries tagged at the current global step.
  summary_values = [
      tf.Summary.Value(tag="bleu/uncased", simple_value=uncased_score),
      tf.Summary.Value(tag="bleu/cased", simple_value=cased_score),
  ]
  bleu_writer.add_summary(
      tf.Summary(value=summary_values), get_global_step(estimator))
  bleu_writer.flush()
  return uncased_score, cased_score
def train_schedule(train_eval_iterations,
                   single_iteration_train_steps,
                   params,
                   bleu_source=None,
                   bleu_ref=None,
                   bleu_threshold=None):
    """Set up the MXNet/Gluon transformer, data and optimizer for training.

    Args:
        train_eval_iterations: Number of times to repeat the train-eval
            iteration.
        single_iteration_train_steps: Number of steps to train in one
            iteration.
        params: Hyperparameter object for the transformer model.
        bleu_source: File containing text to be translated for BLEU
            calculation.
        bleu_ref: File containing reference translations for BLEU
            calculation.
        bleu_threshold: Minimum BLEU score before training is stopped.
            (Currently unused; the repeat-until-threshold logic is disabled.)
    """
    print('Training schedule:')
    print('\t1.Train for %d iterations' % train_eval_iterations)
    print('\t2.Each iteration for %d steps.' % single_iteration_train_steps)
    print('\t3.Compute BLEU score.')

    # Loop training/evaluation/bleu cycles
    subtokenizer = tokenizer.Subtokenizer(vocab_file='vocab.ende.32768')
    dataset_train = dataset.TranslationDataset(dir_lang1='wmt32k-train.lang1',
                                               dir_lang2='wmt32k-train.lang2',
                                               subtokenizer=subtokenizer)
    global_step = 0
    best_bleu_score = 0

    # Build the network in training mode and initialize on the global ctx.
    net = transformer.Transformer(params=params, train=1)
    net.initialize(init=init.Xavier(), ctx=ctx, force_reinit=True)

    learning_rate = get_learning_rate(params.learning_rate,
                                      params.hidden_size,
                                      params.learning_rate_warmup_steps,
                                      global_step)
    optimizer = mx.optimizer.Adam(learning_rate=learning_rate,
                                  beta1=params.optimizer_adam_beta1,
                                  beta2=params.optimizer_adam_beta2,
                                  epsilon=params.optimizer_adam_epsilon)
    trainer = gluon.Trainer(net.collect_params(), optimizer=optimizer)

    # BUG FIX: the mode was the bare name `w+` (a NameError at runtime);
    # it must be the string 'w+'. Filename spelling kept byte-identical.
    bleu_score_file = open('blue_score_file', 'w+')
def train_model(input_params):
    """Train the TensorLayer Transformer, tracing loss and periodic BLEU.

    Every 100 batches the running loss is printed, every 2000 batches the
    weights are checkpointed, and at batch 5000 of each epoch the dev set
    is translated and scored with BLEU (failures are non-fatal).

    Args:
        input_params: Parameters forwarded to train_input_fn to build the
            training dataset.
    """
    params = model_params.EXAMPLE_PARAMS_v4
    dataset = train_input_fn(input_params)
    subtokenizer = tokenizer.Subtokenizer("data/data/" + VOCAB_FILE)
    input_file = "data/raw/dev/newstest2013.en"
    output_file = "./output/dev.de"
    ref_filename = "data/raw/dev/newstest2013.de"
    trace_path = "checkpoints_v4/logging/"
    num_epochs = 10

    def train_step(inputs, targets):
        # One optimization step; returns the scalar loss tensor.
        model.train()
        with tf.GradientTape() as tape:
            logits = model(inputs=inputs, targets=targets)
            logits = metrics.MetricLayer(params.vocab_size)([logits, targets])
            logits, loss = metrics.LossLayer(params.vocab_size,
                                             0.1)([logits, targets])
        gradients = tape.gradient(loss, model.all_weights)
        optimizer_.apply_gradients(zip(gradients, model.all_weights))
        return loss

    model = Transformer(params)
    learning_rate = CustomSchedule(
        params.hidden_size, warmup_steps=params.learning_rate_warmup_steps)
    optimizer_ = optimizer.LazyAdam(learning_rate, beta_1=0.9, beta_2=0.98,
                                    epsilon=1e-9)

    time_ = time.time()
    for epoch in range(num_epochs):
        total_loss, n_iter = 0, 0
        for i, [inputs, targets] in enumerate(dataset):
            loss = train_step(inputs, targets)
            with tf.io.gfile.GFile(trace_path + "loss", "ab+") as trace_file:
                trace_file.write(str(loss.numpy()) + '\n')
            if (i % 100 == 0):
                print('Batch ID {} at Epoch [{}/{}]: loss {:.4f} using {:.4f}'.
                      format(i, epoch + 1, num_epochs, loss,
                             (time.time() - time_) / 100))
                time_ = time.time()
            if (i % 2000 == 0):
                tl.files.save_npz(model.all_weights,
                                  name='./checkpoints_v4/model.npz')
            if (i == 5000):
                translate_file(model, subtokenizer, input_file=input_file,
                               output_file=output_file)
                try:
                    insensitive_score = bleu_wrapper(ref_filename,
                                                     output_file, False)
                    sensitive_score = bleu_wrapper(ref_filename,
                                                   output_file, True)
                    with tf.io.gfile.GFile(trace_path + "bleu_insensitive",
                                           "ab+") as trace_file:
                        trace_file.write(str(insensitive_score) + '\n')
                    with tf.io.gfile.GFile(trace_path + "bleu_sensitive",
                                           "ab+") as trace_file:
                        trace_file.write(str(sensitive_score) + '\n')
                # BUG FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit. BLEU failures stay non-fatal.
                except Exception:
                    print("An exception occurred")
            total_loss += loss
            n_iter += 1

        # printing average loss after every epoch
        print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs,
                                                  total_loss / n_iter))
        # save model weights after every epoch
        tl.files.save_npz(model.all_weights, name='./checkpoints_v4/model.npz')
# NOTE(review): this chunk begins mid-function -- `model`, `input_fn`,
# `translations`, `sorted_keys`, `subtokenizer` and `output_file` are all
# defined earlier in the enclosing (not visible) translate-file routine.
# Run the model in eval mode over each batch from input_fn and collect
# the decoded translations.
model.eval()
for i, text in enumerate(input_fn()):
    prediction = model(inputs=text)
    for i, single in enumerate(prediction["outputs"]):
        # Decode one output sequence of subtoken ids back into text.
        translation = _trim_and_decode(single, subtokenizer)
        translations.append(translation)

# Write translations in the order they appeared in the original file.
if output_file is not None:
    if tf.io.gfile.isdir(output_file):
        raise ValueError(
            "File output is a directory, will not save outputs to "
            "file.")
    # tf.logging.info("Writing to file %s" % output_file)
    with tf.io.gfile.GFile(output_file, "w") as f:
        for i in sorted_keys:
            f.write("%s\n" % translations[i])


if __name__ == "__main__":
    # Script entry: restore saved weights and translate the newstest2013
    # dev set to ./output/out.de.
    subtokenizer = tokenizer.Subtokenizer("data/data/" + VOCAB_FILE)
    params = model_params.EXAMPLE_PARAMS
    model = Transformer(params)
    load_weights = tl.files.load_npz(name='./checkpoints_light/model.npz')
    tl.files.assign_weights(load_weights, model)
    input_file = "./data/raw/dev/newstest2013.en"
    translate_file(model, subtokenizer, input_file,
                   output_file="./output/out.de")
# NOTE(review): this chunk begins mid-function -- `txt` and `translation`
# belong to the enclosing (not visible) translate-text helper, which logs
# and returns the decoded translation.
tf.logging.info("Translation of \"%s\": \"%s\"" % (txt, translation))
return translation


# Script body: build a "tiny" estimator, wrap it for fast repeated
# prediction, then time one translation of an Arabic input string.
tf.logging.set_verbosity(tf.logging.INFO)
params = transformer_main.PARAMS_MAP["tiny"]
params["beam_size"] = _BEAM_SIZE
params["alpha"] = _ALPHA
params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
params["batch_size"] = _DECODE_BATCH_SIZE
estimator = tf.estimator.Estimator(
    model_fn=transformer_main.model_fn, model_dir="./tiny-model/",
    params=params)
subtokenizer = tokenizer.Subtokenizer("./tiny-model/vocab.ende.32768")
# Prime the FastPredict wrapper with a sample input function.
estimator = FastPredict(estimator, get_input_fn("بس", subtokenizer))
input_data = "حبيبي يا عاشق"
tf.logging.info("Translating text: %s" % input_data)
start = time.time()
print("started timing")
output_data = translate_text(estimator, subtokenizer, input_data)
end = time.time()
print("translate took %f seconds" % (end - start))