def testTextInputterTest(self):
    vocab_src = Vocab(vocab_src_file)
    vocab_trg = Vocab(vocab_trg_file)
    dataset = Dataset(vocab_src, vocab_trg,
                      train_src_file, train_trg_file,
                      [eval_src_file], eval_trg_file)
    test_iter = TestTextIterator(train_src_file, vocab_src, batch_size=13)
    inputter = TextLineInputter(dataset, "eval_features_file", batch_size=13)
    input_fields = dataset.input_fields
    test_data = inputter.make_feeding_data()
    for a, b in zip(test_iter, test_data[0]):
        x_str = a[0]
        x = a[1][0]
        x_len = a[1][1]
        x_str_new = b[0]
        x_new = b[2][input_fields[Constants.FEATURE_IDS_NAME]]
        x_len_new = b[2][input_fields[Constants.FEATURE_LENGTH_NAME]]
        # compare element-wise: `x.all() == x_new.all()` would only compare
        # two scalar booleans and can pass for unequal arrays
        assert numpy.array_equal(x, x_new)
        assert numpy.array_equal(x_len, x_len_new)
        assert numpy.all([str1 == str2
                          for str1, str2 in zip(x_str, x_str_new)])
    print("Test Passed...")
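# Why the element-wise comparison above matters: numpy's `.all()` reduces an
# array to a single boolean ("are all entries truthy?"), so comparing two
# `.all()` results says nothing about the arrays being equal. A short
# demonstration:
import numpy

a = numpy.array([1, 2])
b = numpy.array([3, 4])
assert a.all() == b.all()           # passes even though a != b
assert not numpy.array_equal(a, b)  # the element-wise check catches it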
def run(self):
    """ Runs ensemble model. """
    self._vocab_source = Vocab(
        filename=self._model_configs["infer"]["source_words_vocabulary"],
        bpe_codes_file=self._model_configs["infer"]["source_bpecodes"])
    self._vocab_target = Vocab(
        filename=self._model_configs["infer"]["target_words_vocabulary"],
        bpe_codes_file=self._model_configs["infer"]["target_bpecodes"])
    # build dataset
    dataset = Dataset(
        self._vocab_source,
        self._vocab_target,
        eval_features_file=[p["features_file"]
                            for p in self._model_configs["infer_data"]])
    estimator_spec = model_fn_ensemble(
        self._model_dirs,
        dataset,
        weight_scheme=self._weight_scheme,
        inference_options=self._model_configs["infer"])
    predict_op = estimator_spec.predictions
    sess = self._build_default_session()
    text_inputter = TextLineInputter(
        dataset=dataset,
        data_field_name="eval_features_file",
        batch_size=self._model_configs["infer"]["batch_size"],
        maximum_line_length=None)
    sess.run(tf.global_variables_initializer())
    tf.logging.info("Start inference.")
    overall_start_time = time.time()
    for feeding_data, param in zip(text_inputter.make_feeding_data(),
                                   self._model_configs["infer_data"]):
        tf.logging.info("Infer Source Features File: {}.".format(
            param["features_file"]))
        start_time = time.time()
        infer(sess=sess,
              prediction_op=predict_op,
              feeding_data=feeding_data,
              output=param["output_file"],
              vocab_target=self._vocab_target,
              alpha=self._model_configs["infer"]["length_penalty"],
              delimiter=self._model_configs["infer"]["delimiter"],
              output_attention=False,
              tokenize_output=self._model_configs["infer"]["char_level"],
              tokenize_script=self._model_configs["infer"]["tokenize_script"],
              verbose=True)
        tf.logging.info("FINISHED {}. Elapsed Time: {}.".format(
            param["features_file"], str(time.time() - start_time)))
        if param["labels_file"] is not None:
            bleu_score = multi_bleu_score(
                self._model_configs["infer"]["multibleu_script"],
                param["labels_file"], param["output_file"])
            tf.logging.info("BLEU score ({}): {}".format(
                param["features_file"], bleu_score))
    tf.logging.info("Total Elapsed Time: %s"
                    % str(time.time() - overall_start_time))
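# For reference, a minimal sketch of what a `multi_bleu_score` helper that
# shells out to Moses' multi-bleu.perl could look like. The helper in this
# repo may differ; the parsing of the script's "BLEU = ..." output line is
# an assumption based on the script's standard output format.
import re
import subprocess

def multi_bleu_score_sketch(multibleu_script, labels_file, output_file):
    # multi-bleu.perl takes the reference file(s) as arguments and reads
    # the hypotheses from stdin
    with open(output_file, "rb") as hyp:
        out = subprocess.check_output(
            ["perl", multibleu_script, labels_file], stdin=hyp)
    match = re.search(r"BLEU = ([\d.]+)", out.decode("utf-8"))
    return float(match.group(1)) if match else 0.0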
def run(self):
    """ Trains the model. """
    # vocabulary
    self._vocab_source = Vocab(
        filename=self._model_configs["data"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["data"]["source_bpecodes"],
        reverse_seq=False)
    self._vocab_target = Vocab(
        filename=self._model_configs["data"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["data"]["target_bpecodes"],
        reverse_seq=self._model_configs["train"]["reverse_target"])
    # build dataset
    dataset = Dataset(
        self._vocab_source,
        self._vocab_target,
        train_features_file=self._model_configs["data"]["train_features_file"],
        train_labels_file=self._model_configs["data"]["train_labels_file"],
        eval_features_file=self._model_configs["data"]["eval_features_file"],
        eval_labels_file=self._model_configs["data"]["eval_labels_file"])
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.TRAIN,
                              dataset=dataset,
                              name=self._model_configs["problem_name"])
    train_op = estimator_spec.train_op
    hooks = estimator_spec.training_hooks
    # build training session; pass `config` via the session creator so the
    # GPU options above actually take effect
    sess = tf.train.MonitoredSession(
        session_creator=tf.train.ChiefSessionCreator(config=config),
        hooks=hooks)
    train_text_inputter = ParallelTextInputter(
        dataset,
        "train_features_file",
        "train_labels_file",
        self._model_configs["train"]["batch_size"],
        self._model_configs["train"]["batch_tokens_size"],
        self._model_configs["train"]["shuffle_every_epoch"])
    train_data = train_text_inputter.make_feeding_data(
        input_fields=estimator_spec.input_fields,
        maximum_features_length=self._model_configs["train"]["maximum_features_length"],
        maximum_labels_length=self._model_configs["train"]["maximum_labels_length"])
    eidx = 0
    while True:
        if sess.should_stop():
            break
        tf.logging.info("STARTUP Epoch {}".format(eidx))
        for data in train_data:
            if sess.should_stop():
                break
            sess.run(train_op, feed_dict=data["feed_dict"])
        eidx += 1
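# The `while True` loop above only terminates when a hook asks the session
# to stop. A minimal sketch, assuming a fixed step budget enforced with the
# stock tf.train.StopAtStepHook (the hook list and step count here are
# illustrative, not this repo's configuration):
import tensorflow as tf

def build_training_session(training_hooks, config, last_step=100000):
    # StopAtStepHook flips sess.should_stop() once the global step (which
    # must exist in the graph) reaches `last_step`
    hooks = list(training_hooks) + [tf.train.StopAtStepHook(last_step=last_step)]
    return tf.train.MonitoredSession(
        session_creator=tf.train.ChiefSessionCreator(config=config),
        hooks=hooks)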
def testTestDataLoader(self):
    # NOTE: vocab_src is built here but not exercised by this test
    vocab_src = Vocab(vocab_src_file)
    vocab_srcbpe = Vocab(vocab_srcbpe_file, bpe_codes_file=codes_src_file)
    data = TestTextIterator(train_src_file, vocab_srcbpe, batch_size=1)
    for x_str, (x, len_x) in data:
        print(x_str[0])
        print(x[0])
        print(' '.join(vocab_srcbpe.convert_to_wordlist(x[0])))
        break
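# TestTextIterator yields (raw strings, (padded id matrix, lengths)). A
# minimal sketch of the padding step behind such a batch, assuming a pad id
# of 0 (the function name and pad id are illustrative, not the repo's):
import numpy

def pad_batch(id_lists, pad_id=0):
    lengths = numpy.array([len(ids) for ids in id_lists])
    # allocate a [batch, max_len] matrix filled with the pad id
    batch = numpy.full((len(id_lists), lengths.max()), pad_id,
                       dtype=numpy.int64)
    for row, ids in zip(batch, id_lists):
        row[:len(ids)] = ids
    return batch, lengths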
def run(self):
    """ Runs ensemble model. """
    vocab_source = Vocab(
        filename=self._model_configs["infer"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["infer"]["source_bpecodes"])
    vocab_target = Vocab(
        filename=self._model_configs["infer"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["infer"]["target_bpecodes"])
    estimator_spec = model_fn_ensemble(
        self._model_dirs,
        vocab_source,
        vocab_target,
        weight_scheme=self._weight_scheme,
        inference_options=self._model_configs["infer"])
    predict_op = estimator_spec.predictions
    sess = self._build_default_session()
    text_inputter = TextLineInputter(
        line_readers=[
            LineReader(
                data=p["features_file"],
                preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x))
            for p in self._model_configs["infer_data"]],
        padding_id=vocab_source.pad_id,
        batch_size=self._model_configs["infer"]["batch_size"])
    sess.run(tf.global_variables_initializer())
    tf.logging.info("Start inference.")
    overall_start_time = time.time()
    for feeding_data, param in zip(
            text_inputter.make_feeding_data(estimator_spec.input_fields),
            self._model_configs["infer_data"]):
        tf.logging.info("Infer Source Features File: {}.".format(
            param["features_file"]))
        start_time = time.time()
        infer(sess=sess,
              prediction_op=predict_op,
              infer_data=feeding_data,
              output=param["output_file"],
              vocab_source=vocab_source,
              vocab_target=vocab_target,
              delimiter=self._model_configs["infer"]["delimiter"],
              output_attention=False,
              to_char_level=self._model_configs["infer"]["char_level"],
              verbose=True)
        tf.logging.info("FINISHED {}. Elapsed Time: {}.".format(
            param["features_file"], str(time.time() - start_time)))
        if param["labels_file"] is not None:
            bleu_score = multi_bleu_score_from_file(
                hypothesis_file=param["output_file"],
                references_files=param["labels_file"],
                char_level=self._model_configs["infer"]["char_level"])
            tf.logging.info("BLEU score (%s): %.2f"
                            % (param["features_file"], bleu_score))
    tf.logging.info("Total Elapsed Time: %s"
                    % str(time.time() - overall_start_time))
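# A sketch of what an "average" `weight_scheme` plausibly means inside
# model_fn_ensemble: at each decoding step the per-model next-token
# distributions are combined before beam pruning. This illustrates the
# technique only; it is not the repo's implementation.
import numpy as np

def ensemble_next_token_scores(probs_per_model):
    """probs_per_model: list of [batch, vocab] softmax outputs, one per model."""
    weight = 1.0 / len(probs_per_model)  # uniform weighting assumed
    avg = sum(weight * p for p in probs_per_model)
    return np.log(avg + 1e-12)  # beam search ranks hypotheses by log-prob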
def testParallelInputterEval(self):
    vocab_src = Vocab(vocab_src_file)
    vocab_trg = Vocab(vocab_trg_file)
    dataset = Dataset(vocab_src, vocab_trg,
                      train_src_file, train_trg_file,
                      eval_src_file, eval_trg_file)
    inputter = ParallelTextInputter(dataset,
                                    "eval_features_file",
                                    "eval_labels_file",
                                    batch_size=13,
                                    maximum_features_length=None,
                                    maximum_labels_length=None)
    eval_iter1 = EvalTextIterator(eval_src_file, eval_trg_file,
                                  vocab_src, vocab_trg,
                                  batch_size=13)
    eval_iter2 = TrainTextIterator(eval_src_file, eval_trg_file + "0",
                                   vocab_src, vocab_trg,
                                   batch_size=13,
                                   maxlen_src=1000, maxlen_trg=1000)
    input_fields = dataset.input_fields
    eval_data = inputter.make_feeding_data()
    for a, b, c in zip(eval_iter1, eval_iter2, eval_data):
        x1 = a[0][0]
        x_len1 = a[0][1]
        y1 = a[1][0]
        y_len1 = a[1][1]
        x2 = b[0][0]
        x_len2 = b[0][1]
        y2 = b[1][0]
        y_len2 = b[1][1]
        x_new = c[1][input_fields[Constants.FEATURE_IDS_NAME]]
        x_len_new = c[1][input_fields[Constants.FEATURE_LENGTH_NAME]]
        y_new = c[1][input_fields[Constants.LABEL_IDS_NAME]]
        y_len_new = c[1][input_fields[Constants.LABEL_LENGTH_NAME]]
        # element-wise equality; the chained `.all()` comparisons used before
        # reduced each array to one boolean and could miss real mismatches
        assert numpy.array_equal(x1, x_new) and numpy.array_equal(x2, x_new)
        assert numpy.array_equal(x_len1, x_len_new) and numpy.array_equal(x_len2, x_len_new)
        assert numpy.array_equal(y1, y_new) and numpy.array_equal(y2, y_new)
        assert numpy.array_equal(y_len1, y_len_new) and numpy.array_equal(y_len2, y_len_new)
    print("Test Passed...")
def testEvalDataLoader(self):
    vocab_src = Vocab(vocab_src_file)
    vocab_trg = Vocab(vocab_trg_file)
    # NOTE: the BPE vocabularies are built here but not exercised below
    vocab_srcbpe = Vocab(vocab_srcbpe_file, bpe_codes=codes_src_file)
    vocab_trgbpe = Vocab(vocab_trgbpe_file, bpe_codes=codes_trg_file)
    data = EvalTextIterator(train_src_file, train_trg_file,
                            vocab_src, vocab_trg,
                            batch_size=1)
    for (x, len_x), (y, len_y) in data:
        print(x[0])
        print(' '.join(vocab_src.convert_to_wordlist(x[0])))
        print(y[0])
        print(' '.join(vocab_trg.convert_to_wordlist(y[0])))
        break
def testParallelInputterTrain(self):
    vocab_src = Vocab(vocab_src_file)
    vocab_trg = Vocab(vocab_trg_file)
    dataset = Dataset(vocab_src, vocab_trg,
                      train_src_file, train_trg_file,
                      eval_src_file, eval_trg_file)
    inputter = ParallelTextInputter(dataset,
                                    "train_features_file",
                                    "train_labels_file",
                                    batch_size=13,
                                    maximum_features_length=20,
                                    maximum_labels_length=20)
    inputter._cache_size = 10
    train_iter = TrainTextIterator(train_src_file, train_trg_file,
                                   vocab_src, vocab_trg,
                                   batch_size=13,
                                   maxlen_src=20, maxlen_trg=20)
    train_iter.k = 10
    input_fields = dataset.input_fields
    train_data = inputter.make_feeding_data()
    for a, b in zip(train_iter, train_data):
        x = a[0][0]
        x_len = a[0][1]
        y = a[1][0]
        y_len = a[1][1]
        x_new = b[1][input_fields[Constants.FEATURE_IDS_NAME]]
        x_len_new = b[1][input_fields[Constants.FEATURE_LENGTH_NAME]]
        y_new = b[1][input_fields[Constants.LABEL_IDS_NAME]]
        y_len_new = b[1][input_fields[Constants.LABEL_LENGTH_NAME]]
        # compare element-wise rather than reducing each array to a boolean
        assert numpy.array_equal(x, x_new)
        assert numpy.array_equal(x_len, x_len_new)
        assert numpy.array_equal(y, y_new)
        assert numpy.array_equal(y_len, y_len_new)
    print("Test Passed...")
def read_plot_alignment_matrices(f, target_file=None, vocab_file=None, start=0):
    vocab = None
    if vocab_file:
        vocab = Vocab(filename=vocab_file)
    targets = None
    # if target_file:
    #     targets = []
    #     if os.path.exists(target_file):
    #         for line in open(target_file, "r"):
    #             targets.append(["ref: " + line.strip()])
    #     else:
    #         targets = []
    #         idx = 0
    #         while True:
    #             if not os.path.exists(target_file + str(idx)):
    #                 break
    #             targets.append(open(target_file + str(idx), 'r'))
    #             idx += 1
    attentions = json.load(f)
    for idx, att in attentions.items():
        # JSON object keys are strings, so cast before comparing with `start`
        if int(idx) < start:
            continue
        source_labels = att["source"].split() + ["SEQUENCE_END"]
        target_labels = att["translation"].split()
        att_list = att["attentions"]
        assert att_list[0]["type"] == "simple", \
            "Do not use this tool for multi-head attention."
        mma = numpy.array(att_list[0]["value"])
        if mma.shape[0] == len(target_labels) + 1:
            target_labels += ["SEQUENCE_END"]
        if vocab:
            source_labels = [
                e if vocab[e] != vocab.unk_id else e + "(UNK)"
                for e in source_labels]
        plot_head_map(mma, target_labels, source_labels)
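# `plot_head_map` is defined elsewhere in the repo. A minimal matplotlib
# sketch of such a heat-map plotter, assuming one attention row per target
# token and one column per source token (illustrative, not the repo's code):
import matplotlib.pyplot as plt
import numpy

def plot_head_map_sketch(mma, target_labels, source_labels):
    fig, ax = plt.subplots()
    ax.pcolor(mma, cmap=plt.cm.Blues)
    # one tick per token, centered on each cell
    ax.set_xticks(numpy.arange(mma.shape[1]) + 0.5, minor=False)
    ax.set_yticks(numpy.arange(mma.shape[0]) + 0.5, minor=False)
    ax.set_xticklabels(source_labels, rotation=45, ha="right")
    ax.set_yticklabels(target_labels)
    ax.invert_yaxis()  # first target token at the top
    plt.show()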
def run(self):
    """Evaluates data files. """
    # build vocabularies
    self._vocab_source = Vocab(
        filename=self._model_configs["eval"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["eval"]["source_bpecodes"],
        reverse_seq=False)
    self._vocab_target = Vocab(
        filename=self._model_configs["eval"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["eval"]["target_bpecodes"],
        reverse_seq=self._model_configs["train"]["reverse_target"])
    # build dataset
    dataset = Dataset(
        self._vocab_source,
        self._vocab_target,
        eval_features_file=[p["features_file"]
                            for p in self._model_configs["eval_data"]],
        eval_labels_file=[p["labels_file"]
                          for p in self._model_configs["eval_data"]])
    # update evaluation metric
    self._model_configs, metric_str = update_eval_metric(
        self._model_configs, self._model_configs["eval"]["metric"])
    tf.logging.info("Evaluating using {}".format(metric_str))
    # build model
    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.EVAL,
                              dataset=dataset,
                              name=self._model_configs["problem_name"])
    sess = self._build_default_session()
    # bucketing reorders the evaluation data, so only enable it when no
    # attention matrices are to be dumped
    do_bucketing = (sum([p["output_attention"]
                         for p in self._model_configs["eval_data"]]) == 0)
    text_inputter = ParallelTextInputter(
        dataset=dataset,
        features_field_name="eval_features_file",
        labels_field_name="eval_labels_file",
        batch_size=self._model_configs["eval"]["batch_size"],
        bucketing=do_bucketing)
    # reload checkpoint
    checkpoint_path = tf.train.latest_checkpoint(
        self._model_configs["model_dir"])
    if checkpoint_path:
        tf.logging.info("reloading models...")
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
    else:
        raise OSError("File NOT Found. Failed to load checkpoint file from: {}"
                      .format(self._model_configs["model_dir"]))
    tf.logging.info("Start evaluation.")
    overall_start_time = time.time()
    for eval_data, param in zip(
            text_inputter.make_feeding_data(
                input_fields=estimator_spec.input_fields, in_memory=True),
            self._model_configs["eval_data"]):
        tf.logging.info("Evaluation Source File: {}.".format(param["features_file"]))
        tf.logging.info("Evaluation Target File: {}.".format(param["labels_file"]))
        start_time = time.time()
        result = evaluate_with_attention(
            sess=sess,
            eval_op=estimator_spec.loss,
            eval_data=eval_data,
            vocab_source=self._vocab_source,
            vocab_target=self._vocab_target,
            attention_op=(estimator_spec.predictions
                          if param["output_attention"] else None),
            output_filename_prefix=param["labels_file"].strip().split("/")[-1])
        tf.logging.info("FINISHED {}. Elapsed Time: {}."
                        .format(param["features_file"],
                                str(time.time() - start_time)))
        tf.logging.info("Evaluation Score ({} on {}): {}"
                        .format(metric_str, param["features_file"], result))
    tf.logging.info("Total Elapsed Time: %s"
                    % str(time.time() - overall_start_time))
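# `estimator_spec.loss` in EVAL mode is a cross-entropy, and a common way to
# report such a score (assuming that is the kind of metric
# `update_eval_metric` selects) is corpus-level perplexity aggregated over
# all evaluation batches:
import math

def corpus_perplexity(batch_loss_sums, batch_token_counts):
    """batch_loss_sums: summed token cross-entropies (nats), one per batch."""
    return math.exp(sum(batch_loss_sums) / float(sum(batch_token_counts)))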
def run(self):
    """Infers data files. """
    # build vocabularies
    self._vocab_source = Vocab(
        filename=self._model_configs["infer"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["infer"]["source_bpecodes"],
        reverse_seq=False)
    self._vocab_target = Vocab(
        filename=self._model_configs["infer"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["infer"]["target_bpecodes"],
        reverse_seq=self._model_configs["train"]["reverse_target"])
    # build dataset
    dataset = Dataset(
        self._vocab_source,
        self._vocab_target,
        eval_features_file=[p["features_file"]
                            for p in self._model_configs["infer_data"]])
    self._model_configs = update_infer_params(
        self._model_configs,
        beam_size=self._model_configs["infer"]["beam_size"],
        maximum_labels_length=self._model_configs["infer"]["maximum_labels_length"],
        length_penalty=self._model_configs["infer"]["length_penalty"])
    # build model
    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.INFER,
                              dataset=dataset,
                              name=self._model_configs["problem_name"])
    predict_op = estimator_spec.predictions
    sess = self._build_default_session()
    text_inputter = TextLineInputter(
        dataset=dataset,
        data_field_name="eval_features_file",
        batch_size=self._model_configs["infer"]["batch_size"])
    # reload checkpoint
    checkpoint_path = tf.train.latest_checkpoint(
        self._model_configs["model_dir"])
    if checkpoint_path:
        tf.logging.info("reloading models...")
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
    else:
        raise OSError("File NOT Found. Failed to find checkpoint file in: {}"
                      .format(self._model_configs["model_dir"]))
    tf.logging.info("Start inference.")
    overall_start_time = time.time()
    for infer_data, param in zip(
            text_inputter.make_feeding_data(
                input_fields=estimator_spec.input_fields),
            self._model_configs["infer_data"]):
        tf.logging.info("Infer Source File: {}.".format(param["features_file"]))
        start_time = time.time()
        infer(sess=sess,
              prediction_op=predict_op,
              infer_data=infer_data,
              output=param["output_file"],
              vocab_source=self._vocab_source,
              vocab_target=self._vocab_target,
              delimiter=self._model_configs["infer"]["delimiter"],
              output_attention=param["output_attention"],
              tokenize_output=self._model_configs["infer"]["char_level"],
              verbose=True)
        tf.logging.info("FINISHED {}. Elapsed Time: {}."
                        .format(param["features_file"],
                                str(time.time() - start_time)))
        if param["labels_file"] is not None:
            bleu_score = multi_bleu_score_from_file(
                hypothesis_file=param["output_file"],
                references_files=param["labels_file"],
                char_level=self._model_configs["infer"]["char_level"])
            tf.logging.info("BLEU score (%s): %.2f"
                            % (param["features_file"], bleu_score))
    tf.logging.info("Total Elapsed Time: %s"
                    % str(time.time() - overall_start_time))
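# The `length_penalty` passed to update_infer_params above is, by common
# convention, the alpha of the GNMT-style length penalty (Wu et al., 2016)
# used to normalize beam scores:
#     lp(Y) = ((5 + |Y|) / 6) ** alpha,   score(Y) = log P(Y|X) / lp(Y)
# A sketch under that assumption (not necessarily this repo's exact formula):
def length_penalty(length, alpha):
    return ((5.0 + length) / 6.0) ** alpha

def normalized_score(log_prob, length, alpha):
    # longer hypotheses are penalized less as alpha -> 0
    return log_prob / length_penalty(length, alpha)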
def run(self):
    """ Trains the model. """
    # vocabulary
    vocab_source = Vocab(
        filename=self._model_configs["data"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["data"]["source_bpecodes"],
        reverse_seq=self._model_configs["train"]["features_r2l"])
    vocab_target = Vocab(
        filename=self._model_configs["data"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["data"]["target_bpecodes"],
        reverse_seq=self._model_configs["train"]["labels_r2l"])
    eval_dataset = {
        "vocab_source": vocab_source,
        "vocab_target": vocab_target,
        "features_file": self._model_configs["data"]["eval_features_file"],
        "labels_file": self._model_configs["data"]["eval_labels_file"]}
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.TRAIN,
                              vocab_source=vocab_source,
                              vocab_target=vocab_target,
                              name=self._model_configs["problem_name"])
    train_ops = estimator_spec.train_ops
    hooks = estimator_spec.training_hooks
    # build training session
    sess = tf.train.MonitoredSession(
        session_creator=tf.train.ChiefSessionCreator(
            scaffold=tf.train.Scaffold(),
            checkpoint_dir=None,
            master="",
            config=config),
        hooks=tuple(hooks) + tuple(
            build_eval_metrics(self._model_configs, eval_dataset,
                               model_name=estimator_spec.name)))
    train_text_inputter = ParallelTextInputter(
        LineReader(
            data=self._model_configs["data"]["train_features_file"],
            maximum_length=self._model_configs["train"]["maximum_features_length"],
            preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x)),
        LineReader(
            data=self._model_configs["data"]["train_labels_file"],
            maximum_length=self._model_configs["train"]["maximum_labels_length"],
            preprocessing_fn=lambda x: vocab_target.convert_to_idlist(x)),
        vocab_source.pad_id,
        vocab_target.pad_id,
        batch_size=self._model_configs["train"]["batch_size"],
        batch_tokens_size=self._model_configs["train"]["batch_tokens_size"],
        shuffle_every_epoch=self._model_configs["train"]["shuffle_every_epoch"],
        fill_full_batch=True,
        bucketing=True)
    train_data = train_text_inputter.make_feeding_data(
        input_fields=estimator_spec.input_fields)
    eidx = [0, 0]
    update_cycle = [self._model_configs["train"]["update_cycle"], 1]

    def step_fn(step_context):
        # accumulate gradients over `update_cycle` mini-batches, then apply
        # a single parameter update with the final batch
        step_context.session.run(train_ops["zeros_op"])
        try:
            while update_cycle[0] != update_cycle[1]:
                data = next(train_data)
                step_context.session.run(train_ops["collect_op"],
                                         feed_dict=data["feed_dict"])
                update_cycle[1] += 1
            data = next(train_data)
            update_cycle[1] = 1
            return step_context.run_with_hooks(train_ops["train_op"],
                                               feed_dict=data["feed_dict"])
        except StopIteration:
            # the epoch is exhausted; count it and let the outer loop restart
            eidx[1] += 1

    while not sess.should_stop():
        if eidx[0] != eidx[1]:
            tf.logging.info("STARTUP Epoch {}".format(eidx[1]))
            eidx[0] = eidx[1]
        sess.run_step_fn(step_fn)
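# A minimal sketch of how the zeros_op / collect_op / train_op trio consumed
# by step_fn can be built in TF1. The builder below is illustrative, not the
# repo's code, and assumes every trainable variable receives a gradient;
# any 1/update_cycle scaling can be folded into the loss.
import tensorflow as tf

def build_accumulation_ops(loss, optimizer):
    params = tf.trainable_variables()
    grads = tf.gradients(loss, params)
    # one non-trainable accumulator per parameter
    accum = [tf.Variable(tf.zeros_like(p), trainable=False) for p in params]
    zeros_op = tf.group(*[a.assign(tf.zeros_like(a)) for a in accum])
    collect_op = tf.group(*[a.assign_add(g) for a, g in zip(accum, grads)])
    # apply the accumulated gradients once per cycle
    train_op = optimizer.apply_gradients(
        zip(accum, params), global_step=tf.train.get_global_step())
    return {"zeros_op": zeros_op, "collect_op": collect_op,
            "train_op": train_op}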