def main(unused_argv):
  """Translate --text and/or --file using a trained RL transformer model.

  Builds source/target subtokenizers from the BPE vocab files in
  FLAGS.data_dir, derives the vocab sizes from those files, constructs an
  Estimator with the GAN model_fn, and runs translation.

  Raises:
    ValueError: If --file is given but does not exist.
  """
  import transformer_rl_main

  tf.logging.set_verbosity(tf.logging.INFO)

  # Nothing requested: bail out early.
  if FLAGS.text is None and FLAGS.file is None:
    tf.logging.warn(
        "Nothing to translate. Make sure to call this script using "
        "flags --text or --file.")
    return

  # Vocab files are named vocab.bpe.<search>.<lang>. Build each path once
  # (the original rebuilt the same paths for the vocab-size line counts).
  source_vocab_path = os.path.join(
      FLAGS.data_dir, 'vocab' + '.bpe.' + str(FLAGS.search) + '.' + FLAGS.fro)
  target_vocab_path = os.path.join(
      FLAGS.data_dir, 'vocab' + '.bpe.' + str(FLAGS.search) + '.' + FLAGS.to)
  subtokenizer_source = tokenizer.Subtokenizer(source_vocab_path)
  subtokenizer_target = tokenizer.Subtokenizer(target_vocab_path)

  # Set up estimator and params.
  params = transformer_rl_main.PARAMS_MAP[FLAGS.param_set]
  params.beam_size = _BEAM_SIZE
  params.alpha = _ALPHA
  params.extra_decode_length = _EXTRA_DECODE_LENGTH
  params.batch_size = _DECODE_BATCH_SIZE

  # Vocab sizes are the line counts of the vocab files. BUG FIX: use
  # context managers -- the original opened both files and never closed
  # either handle (the first `fp` was simply overwritten).
  with open(source_vocab_path, 'r') as fp:
    params.source_vocab_size = len(fp.readlines())
  with open(target_vocab_path, 'r') as fp:
    params.target_vocab_size = len(fp.readlines())

  estimator = tf.estimator.Estimator(
      model_fn=transformer_rl_main.gan_model_fn, model_dir=FLAGS.model_dir,
      params=params)

  if FLAGS.text is not None:
    tf.logging.info("Translating text: %s" % FLAGS.text)
    translate_text(estimator, subtokenizer_source, subtokenizer_target,
                   FLAGS.text)

  if FLAGS.file is not None:
    input_file = os.path.abspath(FLAGS.file)
    tf.logging.info("Translating file: %s" % input_file)
    if not tf.gfile.Exists(FLAGS.file):
      raise ValueError("File does not exist: %s" % input_file)

    output_file = None
    if FLAGS.file_out is not None:
      output_file = os.path.abspath(FLAGS.file_out)
      tf.logging.info("File output specified: %s" % output_file)

    translate_file(estimator, subtokenizer_source, subtokenizer_target,
                   input_file, output_file)
def evaluate_and_log_bleu(estimator, bleu_source, bleu_ref, vocab_file,
                          vocab_file_target):
  """Calculate and record the BLEU score.

  Args:
    estimator: Estimator used to generate the translations.
    bleu_source: File containing source sentences for translation.
    bleu_ref: File containing the reference translations.
    vocab_file: Source-language vocabulary file.
    vocab_file_target: Target-language vocabulary file.

  Returns:
    uncased_score: Case-insensitive BLEU score (float).
    cased_score: Case-sensitive BLEU score (float).
  """
  subtokenizer = tokenizer.Subtokenizer(vocab_file)
  subtokenizer_target = tokenizer.Subtokenizer(vocab_file_target)

  uncased_score, cased_score = translate_and_compute_bleu(
      estimator, subtokenizer, subtokenizer_target, bleu_source, bleu_ref)

  # BUG FIX: BLEU scores are floats; the original "%d" silently truncated
  # them to integers in the logs. Sibling implementations use "%s".
  tf.logging.info("Bleu score (uncased): %s", uncased_score)
  tf.logging.info("Bleu score (cased): %s", cased_score)
  return uncased_score, cased_score
def evaluate_and_log_bleu(model,
                          params,
                          bleu_source,
                          bleu_ref,
                          vocab_file,
                          distribution_strategy=None):
  """Compute BLEU for the model's translations and log both variants.

  Args:
    model: A Keras model, used to generate the translations.
    params: A dictionary, containing the translation related parameters.
    bleu_source: A file containing source sentences for translation.
    bleu_ref: A file containing the reference for the translated sentences.
    vocab_file: A file containing the vocabulary for translation.
    distribution_strategy: A platform distribution strategy, used for TPU
      based translation.

  Returns:
    uncased_score: A float, the case insensitive BLEU score.
    cased_score: A float, the case sensitive BLEU score.
  """
  sub = tokenizer.Subtokenizer(vocab_file)

  scores = translate_and_compute_bleu(
      model, params, sub, bleu_source, bleu_ref, distribution_strategy)
  uncased_score, cased_score = scores

  logging.info("Bleu score (uncased): %s", uncased_score)
  logging.info("Bleu score (cased): %s", cased_score)
  return uncased_score, cased_score
def _init_subtokenizer(self, vocab_list):
    """Write vocab_list to a temp vocab file and build a Subtokenizer on it."""
    # Vocab file format: one token per line, each wrapped in single quotes.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    with tf.gfile.Open(tmp.name, 'w') as vocab_writer:
      vocab_writer.write("".join("'%s'\n" % token for token in vocab_list))
    # No reserved tokens: the test vocab is used exactly as written.
    return tokenizer.Subtokenizer(tmp.name, reserved_tokens=[])
def evaluate_and_log_bleu(model, bleu_source, bleu_ref, vocab_file):
  """Translate bleu_source with the model, score against bleu_ref, log both."""
  # Subtokenizer used to encode sources and decode model outputs.
  bleu_subtokenizer = tokenizer.Subtokenizer(vocab_file)

  scores = translate_and_compute_bleu(
      model, bleu_subtokenizer, bleu_source, bleu_ref)
  uncased_score, cased_score = scores

  tf.compat.v1.logging.info("Bleu score (uncased): %s", uncased_score)
  tf.compat.v1.logging.info("Bleu score (cased): %s", cased_score)
  return uncased_score, cased_score
def main(unused_argv):
  """Translate --text and/or --file with a trained transformer checkpoint."""
  import transformer_main

  tf.logging.set_verbosity(tf.logging.INFO)

  # Nothing requested: warn and exit early.
  if FLAGS.text is None and FLAGS.file is None:
    tf.logging.warn(
        "Nothing to translate. Make sure to call this script using "
        "flags --text or --file.")
    return

  subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file)
  subtokenizer_target = tokenizer.Subtokenizer(FLAGS.vocab_file_en)

  # Set up estimator and params: decode-time overrides on the chosen set.
  params = transformer_main.PARAMS_MAP[FLAGS.param_set]
  decode_overrides = {
      "beam_size": _BEAM_SIZE,
      "alpha": _ALPHA,
      "extra_decode_length": _EXTRA_DECODE_LENGTH,
      "batch_size": _DECODE_BATCH_SIZE,
  }
  for key, value in decode_overrides.items():
    params[key] = value

  estimator = tf.estimator.Estimator(
      model_fn=transformer_main.model_fn,
      model_dir=FLAGS.model_dir,
      params=params)

  if FLAGS.text is not None:
    tf.logging.info("Translating text: %s" % FLAGS.text)
    translate_text(estimator, subtokenizer, subtokenizer_target, FLAGS.text)

  if FLAGS.file is not None:
    input_file = os.path.abspath(FLAGS.file)
    tf.logging.info("Translating file: %s" % input_file)
    if not tf.gfile.Exists(FLAGS.file):
      raise ValueError("File does not exist: %s" % input_file)

    output_file = None
    if FLAGS.file_out is not None:
      output_file = os.path.abspath(FLAGS.file_out)
      tf.logging.info("File output specified: %s" % output_file)

    # NOTE(review): translate_text receives subtokenizer_target but
    # translate_file does not -- confirm translate_file's signature; this
    # may be a dropped argument.
    translate_file(estimator, subtokenizer, input_file, output_file)
def main(unused_argv):
  """Translate --text and/or --file, optionally through a frozen graph."""
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

  # Bail out early when there is nothing to do.
  if FLAGS.text is None and FLAGS.file is None:
    tf.compat.v1.logging.warn("Nothing to translate. Make sure to call this script using "
                              "flags --text or --file.")
    return

  subtokenizer = tokenizer.Subtokenizer(
      os.path.join(FLAGS.data_dir, FLAGS.vocab_file))

  # Only the "base" and "big" hyperparameter sets are recognized.
  known_param_sets = {
      "base": model_params.TransformerBaseParams,
      "big": model_params.TransformerBigParams,
  }
  try:
    params = known_param_sets[FLAGS.params]
  except KeyError:
    raise ValueError("Invalid parameter set defined: %s."
                     "Expected 'base' or 'big.'" % FLAGS.params)

  # Decode-time settings layered onto the selected parameter object.
  params.beam_size = _BEAM_SIZE
  params.alpha = _ALPHA
  params.extra_decode_length = _EXTRA_DECODE_LENGTH
  params.batch_size = FLAGS.batch_size
  params.frozen_graph = FLAGS.input_graph

  # Add inter_op and intra_op parallelism thread settings to the session.
  sess_config = tf.compat.v1.ConfigProto(
      inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
      intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads,
      allow_soft_placement=True)
  run_config = tf.estimator.RunConfig(session_config=sess_config)

  estimator = tf.estimator.Estimator(
      model_fn=transformer_main.model_fn,
      model_dir=FLAGS.model_dir,
      params=params,
      config=run_config)

  if FLAGS.text is not None:
    tf.compat.v1.logging.info("Translating text: %s" % FLAGS.text)
    translate_text(estimator, subtokenizer, FLAGS.text)

  if FLAGS.file is not None:
    input_file = os.path.abspath(FLAGS.file)
    tf.compat.v1.logging.info("Translating file: %s" % input_file)
    if not tf.io.gfile.exists(FLAGS.file):
      raise ValueError("File does not exist: %s" % input_file)

    output_file = None
    if FLAGS.file_out is not None:
      output_file = os.path.abspath(FLAGS.file_out)
      tf.compat.v1.logging.info("File output specified: %s" % output_file)

    translate_file(estimator, subtokenizer, input_file, output_file,
                   FLAGS.batch_size, False)
def main(unused_argv):
  """Entry point: translate a text snippet and/or a file with the model."""
  tf.logging.set_verbosity(tf.logging.INFO)

  if FLAGS.text is None and FLAGS.file is None:
    tf.logging.warn(
        "Nothing to translate. Make sure to call this script using "
        "flags --text or --file.")
    return

  vocab_path = os.path.join(FLAGS.data_dir, FLAGS.vocab_file)
  subtokenizer = tokenizer.Subtokenizer(vocab_path)

  # Select the hyperparameter set; only "base" and "big" are valid.
  if FLAGS.params == "base":
    params = model_params.TransformerBaseParams
  elif FLAGS.params == "big":
    params = model_params.TransformerBigParams
  else:
    raise ValueError("Invalid parameter set defined: %s."
                     "Expected 'base' or 'big.'" % FLAGS.params)

  # Decode-time settings layered onto the selected parameter object.
  params.beam_size = _BEAM_SIZE
  params.alpha = _ALPHA
  params.extra_decode_length = _EXTRA_DECODE_LENGTH
  params.batch_size = _DECODE_BATCH_SIZE

  estimator = tf.estimator.Estimator(
      model_fn=transformer_main.model_fn,
      model_dir=FLAGS.model_dir,
      params=params)

  if FLAGS.text is not None:
    tf.logging.info("Translating text: %s" % FLAGS.text)
    translate_text(estimator, subtokenizer, FLAGS.text)

  if FLAGS.file is not None:
    input_file = os.path.abspath(FLAGS.file)
    tf.logging.info("Translating file: %s" % input_file)
    if not tf.gfile.Exists(FLAGS.file):
      raise ValueError("File does not exist: %s" % input_file)

    output_file = None
    if FLAGS.file_out is not None:
      output_file = os.path.abspath(FLAGS.file_out)
      tf.logging.info("File output specified: %s" % output_file)

    translate_file(estimator, subtokenizer, input_file, output_file)
def predict(self):
  """Restore the latest checkpoint and print translations for a few samples."""
  flags_obj = self.flags_obj
  params = self.params

  # Build an inference-mode model (is_train=False) and load weights from
  # the newest checkpoint in the model directory, if one exists.
  with tf.name_scope("model"):
    model = transformer.create_model(params, False)
    latest_ckpt = tf.train.latest_checkpoint(flags_obj.model_dir)
    self._load_weights_if_possible(model, latest_ckpt)
    model.summary()

  subtokenizer = tokenizer.Subtokenizer(flags_obj.vocab_file)

  # Keep only the model inputs and restrict to a single sample's worth.
  ds = data_pipeline.eval_input_fn(params)
  ds = ds.map(lambda features, _: features).take(_SINGLE_SAMPLE)

  val_outputs, _ = model.predict(ds)
  for output_ids in val_outputs:
    translate.translate_from_input(output_ids, subtokenizer)
def evaluate_and_log_bleu(estimator, bleu_writer, bleu_source, bleu_ref):
  """Compute BLEU for the estimator's translations and record to TensorBoard."""
  vocab_path = os.path.join(FLAGS.data_dir, FLAGS.vocab_file)
  subtokenizer = tokenizer.Subtokenizer(vocab_path)

  uncased_score, cased_score = translate_and_compute_bleu(
      estimator, subtokenizer, bleu_source, bleu_ref)

  print("Bleu score (uncased):", uncased_score)
  print("Bleu score (cased):", cased_score)

  # Emit both scores as summaries tagged at the current global step.
  summary_values = [
      tf.Summary.Value(tag="bleu/uncased", simple_value=uncased_score),
      tf.Summary.Value(tag="bleu/cased", simple_value=cased_score),
  ]
  bleu_writer.add_summary(
      tf.Summary(value=summary_values), get_global_step(estimator))
  bleu_writer.flush()
  return uncased_score, cased_score
def train_schedule(train_eval_iterations,
                   single_iteration_train_steps,
                   params,
                   bleu_source=None,
                   bleu_ref=None,
                   bleu_threshold=None):
    """Set up the MXNet/Gluon transformer, data and optimizer for training.

    Args:
        train_eval_iterations: Number of times to repeat the train-eval
            iteration.
        single_iteration_train_steps: Number of steps to train in one
            iteration.
        params: Hyperparameter object for the transformer model.
        bleu_source: File containing text to be translated for BLEU
            calculation.
        bleu_ref: File containing reference translations for BLEU
            calculation.
        bleu_threshold: Minimum BLEU score before training is stopped.
            (Currently unused; the repeat-until-threshold logic is disabled.)
    """
    print('Training schedule:')
    print('\t1.Train for %d iterations' % train_eval_iterations)
    print('\t2.Each iteration for %d steps.' % single_iteration_train_steps)
    print('\t3.Compute BLEU score.')

    # Loop training/evaluation/bleu cycles
    subtokenizer = tokenizer.Subtokenizer(vocab_file='vocab.ende.32768')
    dataset_train = dataset.TranslationDataset(dir_lang1='wmt32k-train.lang1',
                                               dir_lang2='wmt32k-train.lang2',
                                               subtokenizer=subtokenizer)
    global_step = 0
    best_bleu_score = 0

    # Build the network in training mode and initialize on the global ctx.
    net = transformer.Transformer(params=params, train=1)
    net.initialize(init=init.Xavier(), ctx=ctx, force_reinit=True)

    learning_rate = get_learning_rate(params.learning_rate,
                                      params.hidden_size,
                                      params.learning_rate_warmup_steps,
                                      global_step)
    optimizer = mx.optimizer.Adam(learning_rate=learning_rate,
                                  beta1=params.optimizer_adam_beta1,
                                  beta2=params.optimizer_adam_beta2,
                                  epsilon=params.optimizer_adam_epsilon)
    trainer = gluon.Trainer(net.collect_params(), optimizer=optimizer)

    # BUG FIX: the mode was the bare name `w+` (a NameError at runtime);
    # it must be the string 'w+'. Filename spelling kept byte-identical.
    bleu_score_file = open('blue_score_file', 'w+')
def train_model(input_params):
    """Train the TensorLayer Transformer, tracing loss and periodic BLEU.

    Every 100 batches the running loss is printed, every 2000 batches the
    weights are checkpointed, and at batch 5000 of each epoch the dev set
    is translated and scored with BLEU (failures are non-fatal).

    Args:
        input_params: Parameters forwarded to train_input_fn to build the
            training dataset.
    """
    params = model_params.EXAMPLE_PARAMS_v4
    dataset = train_input_fn(input_params)
    subtokenizer = tokenizer.Subtokenizer("data/data/" + VOCAB_FILE)
    input_file = "data/raw/dev/newstest2013.en"
    output_file = "./output/dev.de"
    ref_filename = "data/raw/dev/newstest2013.de"
    trace_path = "checkpoints_v4/logging/"
    num_epochs = 10

    def train_step(inputs, targets):
        # One optimization step; returns the scalar loss tensor.
        model.train()
        with tf.GradientTape() as tape:
            logits = model(inputs=inputs, targets=targets)
            logits = metrics.MetricLayer(params.vocab_size)([logits, targets])
            logits, loss = metrics.LossLayer(params.vocab_size,
                                             0.1)([logits, targets])
        gradients = tape.gradient(loss, model.all_weights)
        optimizer_.apply_gradients(zip(gradients, model.all_weights))
        return loss

    model = Transformer(params)
    learning_rate = CustomSchedule(
        params.hidden_size, warmup_steps=params.learning_rate_warmup_steps)
    optimizer_ = optimizer.LazyAdam(learning_rate, beta_1=0.9, beta_2=0.98,
                                    epsilon=1e-9)

    time_ = time.time()
    for epoch in range(num_epochs):
        total_loss, n_iter = 0, 0
        for i, [inputs, targets] in enumerate(dataset):
            loss = train_step(inputs, targets)
            with tf.io.gfile.GFile(trace_path + "loss", "ab+") as trace_file:
                trace_file.write(str(loss.numpy()) + '\n')
            if (i % 100 == 0):
                print('Batch ID {} at Epoch [{}/{}]: loss {:.4f} using {:.4f}'.
                      format(i, epoch + 1, num_epochs, loss,
                             (time.time() - time_) / 100))
                time_ = time.time()
            if (i % 2000 == 0):
                tl.files.save_npz(model.all_weights,
                                  name='./checkpoints_v4/model.npz')
            if (i == 5000):
                translate_file(model, subtokenizer, input_file=input_file,
                               output_file=output_file)
                try:
                    insensitive_score = bleu_wrapper(ref_filename,
                                                     output_file, False)
                    sensitive_score = bleu_wrapper(ref_filename,
                                                   output_file, True)
                    with tf.io.gfile.GFile(trace_path + "bleu_insensitive",
                                           "ab+") as trace_file:
                        trace_file.write(str(insensitive_score) + '\n')
                    with tf.io.gfile.GFile(trace_path + "bleu_sensitive",
                                           "ab+") as trace_file:
                        trace_file.write(str(sensitive_score) + '\n')
                # BUG FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit. BLEU failures stay non-fatal.
                except Exception:
                    print("An exception occurred")
            total_loss += loss
            n_iter += 1

        # printing average loss after every epoch
        print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs,
                                                  total_loss / n_iter))
        # save model weights after every epoch
        tl.files.save_npz(model.all_weights, name='./checkpoints_v4/model.npz')
# NOTE(review): this chunk begins mid-function -- `model`, `input_fn`,
# `translations`, `sorted_keys`, `subtokenizer` and `output_file` are all
# defined earlier in the enclosing (not visible) translate-file routine.
# Run the model in eval mode over each batch from input_fn and collect
# the decoded translations.
model.eval()
for i, text in enumerate(input_fn()):
    prediction = model(inputs=text)
    for i, single in enumerate(prediction["outputs"]):
        # Decode one output sequence of subtoken ids back into text.
        translation = _trim_and_decode(single, subtokenizer)
        translations.append(translation)

# Write translations in the order they appeared in the original file.
if output_file is not None:
    if tf.io.gfile.isdir(output_file):
        raise ValueError(
            "File output is a directory, will not save outputs to "
            "file.")
    # tf.logging.info("Writing to file %s" % output_file)
    with tf.io.gfile.GFile(output_file, "w") as f:
        for i in sorted_keys:
            f.write("%s\n" % translations[i])


if __name__ == "__main__":
    # Script entry: restore saved weights and translate the newstest2013
    # dev set to ./output/out.de.
    subtokenizer = tokenizer.Subtokenizer("data/data/" + VOCAB_FILE)
    params = model_params.EXAMPLE_PARAMS
    model = Transformer(params)
    load_weights = tl.files.load_npz(name='./checkpoints_light/model.npz')
    tl.files.assign_weights(load_weights, model)
    input_file = "./data/raw/dev/newstest2013.en"
    translate_file(model, subtokenizer, input_file,
                   output_file="./output/out.de")
# NOTE(review): this chunk begins mid-function -- `txt` and `translation`
# belong to the enclosing (not visible) translate-text helper, which logs
# and returns the decoded translation.
tf.logging.info("Translation of \"%s\": \"%s\"" % (txt, translation))
return translation


# Script body: build a "tiny" estimator, wrap it for fast repeated
# prediction, then time one translation of an Arabic input string.
tf.logging.set_verbosity(tf.logging.INFO)
params = transformer_main.PARAMS_MAP["tiny"]
params["beam_size"] = _BEAM_SIZE
params["alpha"] = _ALPHA
params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
params["batch_size"] = _DECODE_BATCH_SIZE
estimator = tf.estimator.Estimator(
    model_fn=transformer_main.model_fn, model_dir="./tiny-model/",
    params=params)
subtokenizer = tokenizer.Subtokenizer("./tiny-model/vocab.ende.32768")
# Prime the FastPredict wrapper with a sample input function.
estimator = FastPredict(estimator, get_input_fn("بس", subtokenizer))
input_data = "حبيبي يا عاشق"
tf.logging.info("Translating text: %s" % input_data)
start = time.time()
print("started timing")
output_data = translate_text(estimator, subtokenizer, input_data)
end = time.time()
print("translate took %f seconds" % (end - start))