def __init__(self,
             source,
             target,
             vocab_source,
             vocab_target,
             batch_size=80,
             maxlen_src=50,
             maxlen_trg=100,
             n_words_src=-1,
             n_words_trg=-1,
             shuffle_every_epoch=None,
             shuffle_before_train=None):
    """
    :param source: `str`
    :param target: `str`
    :param vocab_source: `Vocab`
    :param vocab_target: `Vocab`
    :param batch_size: `int`
    :param maxlen_src: `int`
    :param maxlen_trg: `int`
    :param n_words_src: `int`
    :param n_words_trg: `int`
    :param shuffle_every_epoch: if not None, use it as the postfix of the shuffled data files
    :param shuffle_before_train: if not None, use it as the postfix of the shuffled data files
    :return:
    """
    if shuffle_before_train:
        tf.logging.info("shuffling data before training\n"
                        "\t%s ==> %s\n\t%s ==> %s"
                        % (source, "./source.shuf." + shuffle_before_train,
                           target, "./target.shuf." + shuffle_before_train))
        shuffle_data([source, target],
                     ["./source.shuf." + shuffle_before_train,
                      "./target.shuf." + shuffle_before_train])
        source = "./source.shuf." + shuffle_before_train
        target = "./target.shuf." + shuffle_before_train
    self.source_file = source
    self.target_file = target
    self.source = open_file(source, encoding='utf-8')
    self.target = open_file(target, encoding='utf-8')
    self.vocab_source = vocab_source
    self.vocab_target = vocab_target
    self.batch_size = batch_size
    self.maxlen_src = maxlen_src
    self.maxlen_trg = maxlen_trg
    self.n_words_src = n_words_src
    self.n_words_trg = n_words_trg
    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * 128
    self.end_of_data = False
    self.shuffle_every_epoch = shuffle_every_epoch

def _shuffle_and_reopen(self):
    """ shuffle features & labels file. """
    if self._parent._shuffle_every_epoch:
        if not hasattr(self, "_shuffled_features_file"):
            self._shuffled_features_file = self._features_file.strip().split("/")[-1] \
                                           + "." + self._parent._shuffle_every_epoch
            self._shuffled_labels_file = self._labels_file.strip().split("/")[-1] \
                                         + "." + self._parent._shuffle_every_epoch
        tf.logging.info(
            "shuffling data\n\t{} ==> {}\n\t{} ==> {}".format(
                self._features_file, self._shuffled_features_file,
                self._labels_file, self._shuffled_labels_file))
        shuffle_data(
            [self._features_file, self._labels_file],
            [self._shuffled_features_file, self._shuffled_labels_file])
        self._features_file = self._shuffled_features_file
        self._labels_file = self._shuffled_labels_file
        if hasattr(self, "_features"):
            close_file(self._features)
            close_file(self._labels)
    elif hasattr(self, "_features"):
        self._features.seek(0)
        self._labels.seek(0)
        return self._features, self._labels
    return open_file(self._features_file), open_file(self._labels_file)

def _SmallParallelData(self,
                       features_file,
                       labels_file,
                       maximum_features_length=None,
                       maximum_labels_length=None):
    """ Function for reading small-scale parallel data for evaluation.

    Args:
        features_file: The path of the features file.
        labels_file: The path of the labels file.
        maximum_features_length: The maximum length of feature symbols
          (especially after BPE is applied). If provided, sentences whose
          number of symbols exceeds this value will be ignored.
        maximum_labels_length: The maximum length of label symbols
          (especially after BPE is applied). If provided, sentences whose
          number of symbols exceeds this value will be ignored.

    Returns: A list of feeding data.
    """
    features = open_file(features_file, encoding="utf-8")
    labels = open_file(labels_file[0], encoding="utf-8")
    ss_buf = []
    tt_buf = []
    while True:
        ss = read_line_with_filter(features, maximum_features_length,
                                   self._features_preprocessing_fn)
        tt = read_line_with_filter(labels, maximum_labels_length,
                                   self._labels_preprocessing_fn)
        if ss == "" or tt == "":
            break
        ss_buf.append(ss)
        tt_buf.append(tt)
    close_file(features)
    close_file(labels)
    if self._bucketing:
        tt_buf, ss_buf = do_bucketing(tt_buf, [ss_buf])
        ss_buf = ss_buf[0]
    data = []
    batch_data_idx = 0
    while batch_data_idx < len(ss_buf):
        x, len_x = padding_batch_data(
            ss_buf[batch_data_idx:batch_data_idx + self._batch_size],
            self._features_padding)
        y, len_y = padding_batch_data(
            tt_buf[batch_data_idx:batch_data_idx + self._batch_size],
            self._labels_padding)
        data.append({
            "feature_ids": x,
            "label_ids": y,
            "feed_dict": {
                self.input_fields[Constants.FEATURE_IDS_NAME]: x,
                self.input_fields[Constants.FEATURE_LENGTH_NAME]: len_x,
                self.input_fields[Constants.LABEL_IDS_NAME]: y,
                self.input_fields[Constants.LABEL_LENGTH_NAME]: len_y
            }
        })
        batch_data_idx += self._batch_size
    return data

def _SmallParallelData(self,
                       features_file,
                       labels_file,
                       input_fields,
                       maximum_features_length=None,
                       maximum_labels_length=None):
    """ Function for reading small-scale parallel data for evaluation.

    Args:
        features_file: The path of the features file.
        labels_file: The path of the labels file.
        input_fields: A dict of placeholders.
        maximum_features_length: The maximum length of feature symbols
          (especially after BPE is applied). If provided, sentences whose
          number of symbols exceeds this value will be ignored.
        maximum_labels_length: The maximum length of label symbols
          (especially after BPE is applied). If provided, sentences whose
          number of symbols exceeds this value will be ignored.

    Returns: A list of feeding data.
    """
    features = open_file(features_file, encoding="utf-8")
    labels = open_file(labels_file[0], encoding="utf-8")
    ss_buf = []
    tt_buf = []
    while True:
        ss = read_line_with_filter(features, maximum_features_length,
                                   self._features_preprocessing_fn)
        tt = read_line_with_filter(labels, maximum_labels_length,
                                   self._labels_preprocessing_fn)
        if ss == "" or tt == "":
            break
        ss_buf.append(ss)
        tt_buf.append(tt)
    close_file(features)
    close_file(labels)
    if self._bucketing:
        tt_buf, ss_buf = do_bucketing(tt_buf, [ss_buf])
        ss_buf = ss_buf[0]
    data = []
    batch_data_idx = 0
    while batch_data_idx < len(ss_buf):
        data.append(
            pack_feed_dict(
                name_prefixs=[Constants.FEATURE_NAME_PREFIX,
                              Constants.LABEL_NAME_PREFIX],
                origin_datas=[
                    ss_buf[batch_data_idx:batch_data_idx + self._batch_size],
                    tt_buf[batch_data_idx:batch_data_idx + self._batch_size]],
                paddings=[self._features_padding, self._labels_padding],
                input_fields=input_fields))
        batch_data_idx += self._batch_size
    return data

def _EvalParallelData(self,
                      features_file,
                      labels_file):
    """ Function for reading small scale parallel data for evaluation.

    Args:
        features_file: The path of features file.
        labels_file: The path of labels file.

    Returns: A list of feeding data.
    """
    eval_features = open_file(features_file, encoding="utf-8")
    if gfile.Exists(labels_file):
        eval_labels = open_file(labels_file, encoding="utf-8")
    else:
        eval_labels = open_file(labels_file + "0", encoding="utf-8")
    ss_buf = []
    tt_buf = []
    ss_str_buf = []
    tt_str_buf = []
    for ss, tt in zip(eval_features, eval_labels):
        ss_str = self._vocab_source.bpe_encode(ss.strip()).split()
        tt_str = self._vocab_target.bpe_encode(tt.strip()).split()
        ss_str_buf.append(ss_str)
        tt_str_buf.append(tt_str)
        ss_buf.append(self._vocab_source.convert_to_idlist(ss.strip()))
        tt_buf.append(self._vocab_target.convert_to_idlist(tt.strip()))
    close_file(eval_features)
    close_file(eval_labels)
    if self._bucketing:
        tlen = numpy.array([len(t) for t in tt_buf])
        tidx = tlen.argsort()
        _ss_buf = [ss_buf[i] for i in tidx]
        _tt_buf = [tt_buf[i] for i in tidx]
        _ss_str_buf = [ss_str_buf[i] for i in tidx]
        _tt_str_buf = [tt_str_buf[i] for i in tidx]
        ss_buf = _ss_buf
        tt_buf = _tt_buf
        ss_str_buf = _ss_str_buf
        tt_str_buf = _tt_str_buf
    data = []
    batch_data_idx = 0
    while batch_data_idx < len(ss_buf):
        x, len_x = padding_batch_data(
            ss_buf[batch_data_idx:batch_data_idx + self._batch_size],
            self._vocab_source.eos_id)
        y, len_y = padding_batch_data(
            tt_buf[batch_data_idx:batch_data_idx + self._batch_size],
            self._vocab_target.eos_id)
        data.append(
            (ss_str_buf[batch_data_idx:batch_data_idx + self._batch_size],
             tt_str_buf[batch_data_idx:batch_data_idx + self._batch_size],
             {self.input_fields[GlobalNames.PH_FEATURE_IDS_NAME]: x,
              self.input_fields[GlobalNames.PH_FEATURE_LENGTH_NAME]: len_x,
              self.input_fields[GlobalNames.PH_LABEL_IDS_NAME]: y,
              self.input_fields[GlobalNames.PH_LABEL_LENGTH_NAME]: len_y}))
        batch_data_idx += self._batch_size
    return data

def _reset(self):
    if self._parent._shuffle_every_epoch:
        close_file(self._features)
        close_file(self._labels)
        self._shuffle()
        self._features = open_file(self._features_file, encoding="utf-8")
        self._labels = open_file(self._labels_file, encoding="utf-8")
    self._features.seek(0)
    self._labels.seek(0)

def reset(self):
    if self.shuffle_every_epoch:
        close_file(self.source)
        close_file(self.target)
        tf.logging.info("shuffling data among epochs")
        shuffle_data([self.source_file, self.target_file],
                     ["./source.shuf." + self.shuffle_every_epoch,
                      "./target.shuf." + self.shuffle_every_epoch])
        self.source = open_file("./source.shuf." + self.shuffle_every_epoch)
        self.target = open_file("./target.shuf." + self.shuffle_every_epoch)
    else:
        self.source.seek(0)
        self.target.seek(0)

def _prepare(self):
    """ Prepares for evaluation.

    Builds the model with reuse=True, mode=EVAL
    and preprocesses data file(s).
    """
    features_file = self._dataset["features_file"]
    labels_file = self._dataset["labels_file"]
    vocab_source = self._dataset["vocab_source"]
    vocab_target = self._dataset["vocab_target"]
    self._model_configs = update_infer_params(  # update inference parameters
        self._model_configs,
        beam_size=self._beam_size,
        maximum_labels_length=self._maximum_labels_length,
        length_penalty=self._length_penalty)
    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.INFER,
                              vocab_source=vocab_source,
                              vocab_target=vocab_target,
                              name=self._model_name,
                              reuse=True,
                              verbose=False)
    self._predict_ops = estimator_spec.predictions
    text_inputter = TextLineInputter(
        line_readers=LineReader(
            data=features_file,
            preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x)),
        padding_id=vocab_source.pad_id,
        batch_size=self._batch_size)
    self._infer_data = text_inputter.make_feeding_data(
        input_fields=estimator_spec.input_fields)
    tmp_trans_dir = os.path.join(self._model_configs["model_dir"],
                                 Constants.TMP_TRANS_DIRNAME)
    if not gfile.Exists(tmp_trans_dir):
        gfile.MakeDirs(tmp_trans_dir)
    self._tmp_trans_file_prefix = os.path.join(tmp_trans_dir,
                                               Constants.TMP_TRANS_FILENAME_PREFIX)
    self._read_ckpt_bleulog()
    # load references
    self._references = []
    for rfile in access_multiple_files(labels_file):
        with open_file(rfile) as fp:
            if self._char_level:
                self._references.append(to_chinese_char(fp.readlines()))
            else:
                self._references.append(fp.readlines())
    self._references = list(map(list, zip(*self._references)))
    with open_file(features_file) as fp:
        self._sources = fp.readlines()
    self._bad_count = 0
    self._best_bleu_score = 0.

def _read_ckpt_bleulog(self):
    """ Reads the best BLEU scores and the names of the corresponding
    checkpoint archives from the log file. """
    if gfile.Exists(self._top_bleu_ckpt_log_filename):
        with open_file(self._top_bleu_ckpt_log_filename, mode="r") as fp:
            self._best_checkpoint_bleus = [float(x) for x in fp.readline().strip().split(",")]
            self._best_checkpoint_names = [x for x in fp.readline().strip().split(",")]

def __init__(self, source, vocab_source, batch_size=1, n_words_src=-1):
    # read in batch data
    f_source = open_file(source)
    ss_buf = []
    ss_str_buf = []
    for ss in f_source:
        # ss_str_buf.append(ss.strip())
        ss_str_buf.append(vocab_source.bpe_encode(ss.strip()))
        ss = vocab_source.convert_to_idlist(ss.strip().split(), n_words_src)
        ss_buf.append(ss)
    f_source.close()
    self.batch_source_buffer = []
    self.batch_source_str_buffer = []
    self.batch_data_idx = 0
    self.batch_size = batch_size
    while self.batch_data_idx < len(ss_buf):
        self.batch_source_buffer.append(
            padding_batch_data(
                ss_buf[self.batch_data_idx:self.batch_data_idx + batch_size],
                vocab_source.eos_id))
        self.batch_source_str_buffer.append(
            ss_str_buf[self.batch_data_idx:self.batch_data_idx + batch_size])
        self.batch_data_idx += batch_size
    self.reset()

def _make_feeding_data_from(self, filename):
    """ Processes the data file and returns an iterable instance for loop.

    Args:
        filename: A specific data file.

    Returns: An iterable instance that packs feeding dictionary
      for `tf.Session().run` according to the `filename`.
    """
    features = open_file(filename, encoding="utf-8")
    str_buf = []
    ss_buf = []
    for ss in features:
        str_buf.append(self._vocab.bpe_encode(ss.strip()))
        ss_buf.append(self._vocab.convert_to_idlist(ss.strip().split(" ")))
    close_file(features)
    data = []
    batch_data_idx = 0
    while batch_data_idx < len(ss_buf):
        x, len_x = padding_batch_data(
            ss_buf[batch_data_idx:batch_data_idx + self._batch_size],
            self._vocab.eos_id)
        str_x = str_buf[batch_data_idx:batch_data_idx + self._batch_size]
        batch_data_idx += self._batch_size
        data.append((
            str_x, len_x,
            {self.input_fields[GlobalNames.PH_FEATURE_IDS_NAME]: x,
             self.input_fields[GlobalNames.PH_FEATURE_LENGTH_NAME]: len_x}))
    return data

def create_vocabulary_lookup_table_numpy(filename):
    """ Creates a lookup table from a vocabulary file.

    Args:
        filename: Path to a vocabulary file containing one word per line.
          Each word is mapped to its line number (starting from 0).

    Returns: A tuple `(word_to_id_mapping, id_to_word_mapping, special_fields)`.
    """
    if not gfile.Exists(filename):
        raise ValueError("File does not exist: {}".format(filename))
    # Load vocabulary into memory
    with open_file(filename, encoding="utf-8") as file:
        vocab = list(line.strip("\n") for line in file)
    vocab_size = len(vocab)
    has_counts = len(vocab[0].split("\t")) == 2
    if has_counts:
        vocab, counts = zip(*[_.split("\t") for _ in vocab])
        counts = [float(_) for _ in counts]
        vocab = list(vocab)
    else:
        counts = [-1. for _ in vocab]
    # Add special vocabulary items
    special_vocab = get_special_vocab(vocab_size)
    vocab += list(special_vocab._fields)
    vocab_size += len(special_vocab)
    counts += [-1. for _ in list(special_vocab._fields)]
    return {v: k for k, v in enumerate(vocab)}, \
           {k: v for k, v in enumerate(vocab)}, \
           special_vocab._fields

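# A minimal usage sketch for the lookup-table helper above; the file name
# "vocab.en" is hypothetical. Each line of the vocabulary file holds one word,
# optionally followed by a tab-separated count, as described in the function.
word_to_id, id_to_word, special_fields = create_vocabulary_lookup_table_numpy("vocab.en")
print(word_to_id.get("the"))   # id of a known word (None if absent)
print(id_to_word[0])           # first word in the file
print(special_fields)          # names of the appended special symbols
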
def load_from_config_path(config_paths):
    """ Loads configurations from files in yaml format.

    Args:
        config_paths: A string (file names separated by ",") or
          a list of strings (file names).

    Returns: A dictionary of model configurations, parsed from config files.
    """
    if isinstance(config_paths, six.string_types):
        config_paths = config_paths.strip().split(",")
    assert isinstance(config_paths, list) or isinstance(config_paths, tuple)
    model_configs = dict()
    for config_path in config_paths:
        config_path = config_path.strip()
        if not config_path:
            continue
        if not gfile.Exists(config_path):
            raise OSError("config file does not exist: {}".format(config_path))
        config_path = os.path.abspath(config_path)
        tf.logging.info("loading configurations from {}".format(config_path))
        with open_file(config_path, mode="r") as config_file:
            # note: newer PyYAML versions may expect an explicit Loader argument here
            config_flags = yaml.load(config_file)
            model_configs = deep_merge_dict(model_configs, config_flags)
    return model_configs

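# A minimal usage sketch with hypothetical yaml file names; a comma-separated
# string is equivalent to passing a list of paths, and later files are merged
# over earlier ones via deep_merge_dict.
model_configs = load_from_config_path("default_configs.yml,user_configs.yml")
# model_configs = load_from_config_path(["default_configs.yml", "user_configs.yml"])
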
def __init__(self,
             source,
             target,
             vocab_source,
             vocab_target,
             batch_size=128,
             n_words_src=-1,
             n_words_trg=-1):
    # read in batch data
    f_source = open_file(source)
    if gfile.Exists(target):
        f_target = open_file(target)
    else:
        f_target = open_file(target + "0")
    ss_buf = []
    tt_buf = []
    for ss, tt in zip(f_source, f_target):
        ss = vocab_source.convert_to_idlist(ss.strip().split(), n_words_src)
        tt = vocab_target.convert_to_idlist(tt.strip().split(), n_words_trg)
        ss_buf.append(ss)
        tt_buf.append(tt)
    f_source.close()
    f_target.close()
    # sort by target length
    tlen = numpy.array([len(t) for t in tt_buf])
    tidx = tlen.argsort()
    _ss_buf = [ss_buf[i] for i in tidx]
    _tt_buf = [tt_buf[i] for i in tidx]
    ss_buf = _ss_buf
    tt_buf = _tt_buf
    self.batch_source_buffer = []
    self.batch_target_buffer = []
    self.batch_data_idx = 0
    self.batch_size = batch_size
    while self.batch_data_idx < len(ss_buf):
        self.batch_source_buffer.append(
            padding_batch_data(
                ss_buf[self.batch_data_idx:self.batch_data_idx + batch_size],
                vocab_source.eos_id))
        self.batch_target_buffer.append(
            padding_batch_data(
                tt_buf[self.batch_data_idx:self.batch_data_idx + batch_size],
                vocab_target.eos_id))
        self.batch_data_idx += batch_size
    self.reset()

def __init__(self,
             parent,
             features_file,
             labels_file,
             maximum_features_length=None,
             maximum_labels_length=None,
             maximum_encoded_features_length=None,
             maximum_encoded_labels_length=None):
    """ Initializes.

    Args:
        parent: A `ParallelTextInputter` object.
        features_file: The path of the features file.
        labels_file: The path of the labels file.
        maximum_features_length: The maximum sequence length of the "features" field.
          If provided, sentences exceeding this value will be ignored.
        maximum_labels_length: The maximum sequence length of the "labels" field.
          If provided, sentences exceeding this value will be ignored.
        maximum_encoded_features_length: The maximum length of feature symbols
          (especially after BPE is applied). If provided, sentences whose number
          of symbols exceeds this value will be ignored.
        maximum_encoded_labels_length: The maximum length of label symbols
          (especially after BPE is applied). If provided, sentences whose number
          of symbols exceeds this value will be ignored.
    """
    self._parent = parent
    self._features_file = features_file
    self._labels_file = labels_file
    if not gfile.Exists(self._labels_file):
        self._labels_file = self._labels_file + "0"
    self._maximum_features_length = maximum_features_length
    self._maximum_labels_length = maximum_labels_length
    self._maximum_encoded_features_length = maximum_encoded_features_length
    self._maximum_encoded_labels_length = maximum_encoded_labels_length
    if self._parent._shuffle_every_epoch:
        self._shuffle_features_file = self._features_file.strip().split("/")[-1] \
                                      + "." + self._parent._shuffle_every_epoch
        self._shuffle_labels_file = self._labels_file.strip().split("/")[-1] \
                                    + "." + self._parent._shuffle_every_epoch
        self._shuffle()
    self._features = open_file(self._features_file, encoding="utf-8")
    self._labels = open_file(self._labels_file, encoding="utf-8")
    self._features_buffer = []
    self._labels_buffer = []
    self._features_len_buffer = []
    self._labels_len_buffer = []
    self._end_of_data = False

def _update_bleu_ckpt(self, run_context, bleu, hypothesis, global_step):
    """ Updates the best checkpoints according to the BLEU score and removes
    the worst model if the number of checkpoint archives exceeds
    maximum_keep_models. If the model does not improve the BLEU score
    anymore (hits the maximum patience), requests a session stop.

    Args:
        run_context: A `SessionRunContext` object.
        bleu: A python float, the BLEU score derived by the model at this step.
        hypothesis: A list of hypotheses for the validation set.
        global_step: A python integer, the current training step.
    """
    if bleu >= self._best_bleu_score:
        self._best_bleu_score = bleu
        self._bad_count = 0
    else:
        self._bad_count += 1
    if self._bad_count >= self._estop_patience_max and self._early_stop:
        tf.logging.info("early stop.")
        run_context.request_stop()
    # saving checkpoints if eval_steps and save_checkpoint_steps mismatch
    if len(self._best_checkpoint_names) == 0 or bleu > self._best_checkpoint_bleus[0]:
        with open_file(self._tmp_trans_file_prefix + str(global_step), mode="w") as fw:
            fw.write('\n'.join(hypothesis) + "\n")
        if not gfile.Exists("{}-{}.meta".format(
                os.path.join(self._checkpoint_dir, Constants.MODEL_CKPT_FILENAME),
                global_step)):
            saver = saver_lib._get_saver_or_default()
            saver.save(run_context.session,
                       os.path.join(self._checkpoint_dir, Constants.MODEL_CKPT_FILENAME),
                       global_step=global_step)
        backup_dirname = os.path.join(self._model_configs["model_dir"], "../") \
                         + "{dirname_prefix}_iter{global_step}_bleu{bleu}".format(
                             dirname_prefix=Constants.BACKUP_MODEL_DIRNAME_PREFIX,
                             global_step=global_step,
                             bleu=("%.1f" % bleu))
        tf.logging.info("Saving to directory: {}/".format(backup_dirname))
        os.system("mkdir {backup_dirname};"
                  "cp {ckpt_dirname}/checkpoint {backup_dirname}/;"
                  "cp {ckpt_dirname}/{model_config} {backup_dirname}/;"
                  "cp {ckpt_dirname}/{model_analysis} {backup_dirname}/;"
                  "cp {ckpt_dirname}/*{global_step}* {backup_dirname}/".format(
                      backup_dirname=backup_dirname,
                      ckpt_dirname=self._checkpoint_dir,
                      model_config=Constants.MODEL_CONFIG_YAML_FILENAME,
                      model_analysis=Constants.MODEL_ANALYSIS_FILENAME,
                      global_step=global_step))
        self._best_checkpoint_bleus.append(bleu)
        self._best_checkpoint_names.append(backup_dirname)
        if len(self._best_checkpoint_bleus) > self._maximum_keep_models:
            tidx = numpy.argsort(self._best_checkpoint_bleus)
            _bleus = [self._best_checkpoint_bleus[i] for i in tidx]
            _names = [self._best_checkpoint_names[i] for i in tidx]
            self._best_checkpoint_bleus = _bleus[1:]
            self._best_checkpoint_names = _names[1:]
            os.system("rm -rf {}".format(_names[0]))
        self._write_ckpt_bleulog()

def multi_bleu_score_from_file(hypothesis_file, references_files):
    """ Computes corpus-level BLEU from hypothesis file and reference file(s).

    Args:
        hypothesis_file: A string.
        references_files: A string. The name of reference file or the prefix.

    Returns: A float.
    """
    with open_file(hypothesis_file) as fp:
        hypothesis = fp.readlines()
    references = []
    for ref_file in get_labels_files(references_files):
        with open_file(ref_file) as fp:
            references.append(fp.readlines())
    references = list(map(list, zip(*references)))
    return multi_bleu_score(hypothesis, references)

def dump_attentions(output_filename_prefix, attentions):
    """ Dumps attention information in JSON format.

    Args:
        output_filename_prefix: A string.
        attentions: A dict of attention arrays.
    """
    tf.logging.info("Saving attention information into {}.attention.".format(
        output_filename_prefix))
    with open_file(output_filename_prefix + ".attention", mode="wb") as f:
        f.write(json.dumps(attentions).encode("utf-8"))

def dump(model_config, output_dir):
    """ Dumps model configurations.

    Args:
        model_config: A dict.
        output_dir: A string, the output directory.
    """
    model_config_filename = os.path.join(output_dir, Constants.MODEL_CONFIG_YAML_FILENAME)
    if not gfile.Exists(output_dir):
        gfile.MakeDirs(output_dir)
    with open_file(model_config_filename, mode="w") as file:
        yaml.dump(model_config, file)

def _prepare(self):
    """ Prepares for evaluation.

    Builds the model with reuse=True, mode=EVAL
    and preprocesses data file(s).
    """
    text_inputter = TextLineInputter(
        dataset=self._dataset,
        data_field_name="eval_features_file",
        batch_size=self._batch_size)
    self._infer_data = text_inputter.make_feeding_data()
    self._model_configs = update_infer_params(  # update inference parameters
        self._model_configs,
        beam_size=self._beam_size,
        maximum_labels_length=self._maximum_labels_length,
        length_penalty=self._length_penalty)
    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.INFER,
                              dataset=self._dataset,
                              name=self._model_name,
                              reuse=True,
                              verbose=False)
    self._predict_ops = estimator_spec.predictions
    tmp_trans_dir = os.path.join(self._model_configs["model_dir"],
                                 Constants.TMP_TRANS_DIRNAME)
    if not gfile.Exists(tmp_trans_dir):
        gfile.MakeDirs(tmp_trans_dir)
    self._tmp_trans_file_prefix = os.path.join(
        tmp_trans_dir, Constants.TMP_TRANS_FILENAME_PREFIX)
    self._read_ckpt_bleulog()
    # load references
    self._references = []
    for rfile in self._dataset.eval_labels_file:
        with open_file(rfile) as fp:
            self._references.append(fp.readlines())
    self._references = list(map(list, zip(*self._references)))
    with open_file(self._dataset.eval_features_file) as fp:
        self._sources = fp.readlines()
    self._bad_count = 0
    self._best_bleu_score = 0.

def reset(self, do_shuffle=False, shuffle_to_file=None, argsort_index=None):
    """ Resets this reader and shuffles the data (if needed).

    Args:
        do_shuffle: Whether to shuffle data.
        shuffle_to_file: A string.
        argsort_index: A list of integers.

    Returns: The `argsort_index` if shuffling is performed.
    """
    # TODO
    self._data_index = 0
    if self._filename is not None:
        self._data.seek(0)
    if do_shuffle:
        if self._filename is None:  # list of data
            _ = shuffle_to_file
            if not argsort_index:
                argsort_index = numpy.arange(len(self._data))
                numpy.random.shuffle(argsort_index)
            self._data = self._data[argsort_index]  # do shuffle
        else:  # from file
            assert shuffle_to_file, ("`shuffle_to_file` must be provided.")
            tf.logging.info("shuffling data:\t{} ==> {}".format(
                self._filename, shuffle_to_file))
            data_list = self._data.readlines()
            close_file(self._data)
            if argsort_index is None:
                argsort_index = numpy.arange(len(data_list))
                numpy.random.shuffle(argsort_index)
            with open_file(shuffle_to_file, "utf-8", "w") as fw:
                for idx in argsort_index:
                    fw.write(data_list[idx].strip() + "\n")
            del data_list[:]
            self._data = open_file(shuffle_to_file, "utf-8", "r")
        return argsort_index

def _prepare(self):
    """ Prepares for evaluation.

    Builds the model with reuse=True, mode=EVAL
    and preprocesses data file(s).
    """
    self._model_configs = update_infer_params(  # update inference parameters
        self._model_configs,
        beam_size=self._beam_size,
        maximum_labels_length=self._maximum_labels_length,
        length_penalty=self._length_penalty)
    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.INFER,
                              dataset=self._dataset,
                              name=self._model_name,
                              reuse=True,
                              verbose=False)
    self._predict_ops = estimator_spec.predictions
    text_inputter = TextLineInputter(
        dataset=self._dataset,
        data_field_name="eval_features_file",
        batch_size=self._batch_size)
    self._infer_data = text_inputter.make_feeding_data(
        input_fields=estimator_spec.input_fields)
    tmp_trans_dir = os.path.join(self._model_configs["model_dir"],
                                 Constants.TMP_TRANS_DIRNAME)
    if not gfile.Exists(tmp_trans_dir):
        gfile.MakeDirs(tmp_trans_dir)
    self._tmp_trans_file_prefix = os.path.join(tmp_trans_dir,
                                               Constants.TMP_TRANS_FILENAME_PREFIX)
    self._read_ckpt_bleulog()
    # load references
    self._references = []
    for rfile in self._dataset.eval_labels_file:
        with open_file(rfile) as fp:
            if self._char_level:
                self._references.append(to_chinese_char(fp.readlines()))
            else:
                self._references.append(fp.readlines())
    self._references = list(map(list, zip(*self._references)))
    with open_file(self._dataset.eval_features_file) as fp:
        self._sources = fp.readlines()
    self._bad_count = 0
    self._best_bleu_score = 0.

def load(model_dir):
    """ Loads model configurations.

    Args:
        model_dir: A string, the directory.

    Returns: A dict.
    """
    model_config_filename = os.path.join(model_dir, Constants.MODEL_CONFIG_YAML_FILENAME)
    if not gfile.Exists(model_config_filename):
        raise OSError("Fail to find model config file: %s" % model_config_filename)
    with open_file(model_config_filename, mode="r") as file:
        # note: newer PyYAML versions may expect an explicit Loader argument here
        model_configs = yaml.load(file)
    return model_configs

def multi_bleu_score_from_file(hypothesis_file, references_files, char_level=False):
    """ Computes corpus-level BLEU from hypothesis file and reference file(s).

    Args:
        hypothesis_file: A string.
        references_files: A string. The name of reference file or the prefix.
        char_level: Whether to evaluate at char level (for Chinese only).

    Returns: A float.
    """
    with open_file(hypothesis_file) as fp:
        hypothesis = fp.readlines()
    references = []
    for ref_file in access_multiple_files(references_files):
        with open_file(ref_file) as fp:
            if char_level:
                references.append(to_chinese_char(fp.readlines()))
            else:
                references.append(fp.readlines())
    references = list(map(list, zip(*references)))
    return multi_bleu_score(hypothesis, references)

def __init__(self, parent):
    """ Initializes.

    Args:
        parent: A `ParallelTextInputter` object.
    """
    self._parent = parent
    self._features_file = self._parent._features_file
    self._labels_file = self._parent._labels_file
    if not gfile.Exists(self._labels_file):
        self._labels_file = self._labels_file + "0"
    if self._parent._shuffle_every_epoch:
        self._shuffle_features_file = self._features_file.strip().split("/")[-1] \
                                      + "." + self._parent._shuffle_every_epoch
        self._shuffle_labels_file = self._labels_file.strip().split("/")[-1] \
                                    + "." + self._parent._shuffle_every_epoch
        self._shuffle()
    self._features = open_file(self._features_file, encoding="utf-8")
    self._labels = open_file(self._labels_file, encoding="utf-8")
    self._features_buffer = []
    self._labels_buffer = []
    self._features_len_buffer = []
    self._labels_len_buffer = []
    self._end_of_data = False

def multi_bleu_score_from_file(hypothesis_file, references_files, char_level=False):
    """ Computes corpus-level BLEU from hypothesis file and reference file(s).

    Args:
        hypothesis_file: A string.
        references_files: A string. The name of reference file or the prefix.
        char_level: Whether to evaluate at char level (for Chinese only).

    Returns: A float.
    """
    with open_file(hypothesis_file) as fp:
        hypothesis = fp.readlines()
    references = []
    for ref_file in get_labels_files(references_files):
        with open_file(ref_file) as fp:
            if char_level:
                references.append(to_chinese_char(fp.readlines()))
            else:
                references.append(fp.readlines())
    references = list(map(list, zip(*references)))
    return multi_bleu_score(hypothesis, references)

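# A minimal usage sketch for the BLEU helper above (file names are hypothetical):
# "translation.txt" holds one hypothesis per line, and "reference" is either a
# single reference file or the prefix of several files such as "reference0",
# "reference1", ...
bleu = multi_bleu_score_from_file("translation.txt", "reference", char_level=False)
print("BLEU = %.2f" % bleu)
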
def _make_feeding_data_from(self,
                            filename,
                            maximum_line_length=None,
                            maximum_encoded_length=None):
    """ Processes the data file and returns an iterable instance for loop.

    Args:
        filename: A specific data file.
        maximum_line_length: The maximum sequence length. If provided,
          sentences exceeding this value will be ignored.
        maximum_encoded_length: The maximum length of symbols (especially
          after BPE is applied). If provided, sentences whose number of
          symbols exceeds this value will be ignored.

    Returns: An iterable instance that packs feeding dictionary
      for `tf.Session().run` according to the `filename`.
    """
    features = open_file(filename, encoding="utf-8")
    str_buf = []
    ss_buf = []
    for ss in features:
        if maximum_line_length and len(ss.strip().split()) > maximum_line_length:
            continue
        encoded_ss = self._vocab.convert_to_idlist(ss.strip().split())
        if maximum_encoded_length and len(encoded_ss) - 1 > maximum_encoded_length:
            continue
        bpe_ss = self._vocab.bpe_encode(ss.strip())
        str_buf.append(bpe_ss)
        ss_buf.append(encoded_ss)
    close_file(features)
    data = []
    batch_data_idx = 0
    while batch_data_idx < len(ss_buf):
        x, len_x = padding_batch_data(
            ss_buf[batch_data_idx:batch_data_idx + self._batch_size],
            self._vocab.eos_id)
        str_x = str_buf[batch_data_idx:batch_data_idx + self._batch_size]
        batch_data_idx += self._batch_size
        data.append((str_x, len_x, {
            self.input_fields[GlobalNames.PH_FEATURE_IDS_NAME]: x,
            self.input_fields[GlobalNames.PH_FEATURE_LENGTH_NAME]: len_x
        }))
    return data

def _make_feeding_data_from(self, filename, maximum_length=None):
    """ Processes the data file and returns an iterable instance for loop.

    Args:
        filename: A specific data file.
        maximum_length: The maximum length of symbols (especially after BPE
          is applied). If provided, sentences whose number of symbols exceeds
          this value will be ignored.

    Returns: An iterable instance that packs feeding dictionary
      for `tf.Session().run` according to the `filename`.
    """
    features = open_file(filename, encoding="utf-8")
    ss_buf = []
    encoded_ss = read_line_with_filter(features, maximum_length,
                                       self._preprocessing_fn)
    while encoded_ss != "":
        ss_buf.append(encoded_ss)
        encoded_ss = read_line_with_filter(features, maximum_length,
                                           self._preprocessing_fn)
    close_file(features)
    data = []
    batch_data_idx = 0
    while batch_data_idx < len(ss_buf):
        x, len_x = padding_batch_data(
            ss_buf[batch_data_idx:batch_data_idx + self._batch_size],
            self._padding)
        batch_data_idx += self._batch_size
        if "features" in self._data_field_name:
            data.append({
                "feature_ids": x,
                "feed_dict": {
                    self.input_fields[Constants.FEATURE_IDS_NAME]: x,
                    self.input_fields[Constants.FEATURE_LENGTH_NAME]: len_x
                }
            })
        else:
            data.append({
                "label_ids": x,
                "feed_dict": {
                    self.input_fields[Constants.LABEL_IDS_NAME]: x,
                    self.input_fields[Constants.LABEL_LENGTH_NAME]: len_x
                }
            })
    return data

def _make_feeding_data_from(self, filename, input_fields, maximum_length=None):
    """ Processes the data file and returns an iterable instance for loop.

    Args:
        filename: A specific data file.
        input_fields: A dict of placeholders.
        maximum_length: The maximum length of symbols (especially after BPE
          is applied). If provided, sentences whose number of symbols exceeds
          this value will be ignored.

    Returns: An iterable instance that packs feeding dictionary
      for `tf.Session().run` according to the `filename`.
    """
    features = open_file(filename, encoding="utf-8")
    ss_buf = []
    encoded_ss = read_line_with_filter(features, maximum_length,
                                       self._preprocessing_fn)
    while encoded_ss != "":
        ss_buf.append(encoded_ss)
        encoded_ss = read_line_with_filter(features, maximum_length,
                                           self._preprocessing_fn)
    close_file(features)
    data = []
    batch_data_idx = 0
    name_prefix = Constants.FEATURE_NAME_PREFIX \
        if "features" in self._data_field_name else Constants.LABEL_NAME_PREFIX
    while batch_data_idx < len(ss_buf):
        data.append(
            pack_feed_dict(
                name_prefixs=name_prefix,
                origin_datas=ss_buf[batch_data_idx:batch_data_idx + self._batch_size],
                paddings=self._padding,
                input_fields=input_fields))
        batch_data_idx += self._batch_size
    return data

def __init__(self, data, maximum_length=None, preprocessing_fn=None):
    """ Initializes the parameters for LineReader.

    Args:
        data: A string indicating the name of a data file, or a list of data entries.
        maximum_length: An integer, the maximum length of one line (after
          being preprocessed if `preprocessing_fn` is provided).
        preprocessing_fn: A callable function.
    """
    self._maximum_length = maximum_length
    self._preprocessing_fn = preprocessing_fn
    self._data_index = 0
    if isinstance(data, six.string_types):
        self._filename = access_multiple_files(data)[0]
        self._data = open_file(self._filename, encoding="utf-8", mode="r")
    elif isinstance(data, list):
        self._filename = None
        self._data = numpy.array(data)
    else:
        raise ValueError("Unrecognized type of `data`: {}, "
                         "which should be string or list".format(type(data)))

def _SmallParallelData(self,
                       features_file,
                       labels_file,
                       maximum_features_length=None,
                       maximum_labels_length=None,
                       maximum_encoded_features_length=None,
                       maximum_encoded_labels_length=None):
    """ Function for reading small-scale parallel data.

    Args:
        features_file: The path of the features file.
        labels_file: The path of the labels file.
        maximum_features_length: The maximum sequence length of the "features" field.
          If provided, sentences exceeding this value will be ignored.
        maximum_labels_length: The maximum sequence length of the "labels" field.
          If provided, sentences exceeding this value will be ignored.
        maximum_encoded_features_length: The maximum length of feature symbols
          (especially after BPE is applied). If provided, sentences whose number
          of symbols exceeds this value will be ignored.
        maximum_encoded_labels_length: The maximum length of label symbols
          (especially after BPE is applied). If provided, sentences whose number
          of symbols exceeds this value will be ignored.

    Returns: A list of feeding data.
    """
    eval_features = open_file(features_file, encoding="utf-8")
    if gfile.Exists(labels_file):
        eval_labels = open_file(labels_file, encoding="utf-8")
    else:
        eval_labels = open_file(labels_file + "0", encoding="utf-8")
    ss_buf = []
    tt_buf = []
    for ss, tt in zip(eval_features, eval_labels):
        if maximum_features_length and len(ss.strip().split()) > maximum_features_length:
            continue
        if maximum_labels_length and len(tt.strip().split()) > maximum_labels_length:
            continue
        encoded_ss = self._vocab_source.convert_to_idlist(ss.strip().split(" "))
        if maximum_encoded_features_length and len(encoded_ss) - 1 > maximum_encoded_features_length:
            continue
        encoded_tt = self._vocab_target.convert_to_idlist(tt.strip().split(" "))
        if maximum_encoded_labels_length and len(encoded_tt) - 1 > maximum_encoded_labels_length:
            continue
        ss_buf.append(encoded_ss)
        tt_buf.append(encoded_tt)
    close_file(eval_features)
    close_file(eval_labels)
    if self._bucketing:
        tlen = numpy.array([len(t) for t in tt_buf])
        tidx = tlen.argsort()
        _ss_buf = [ss_buf[i] for i in tidx]
        _tt_buf = [tt_buf[i] for i in tidx]
        ss_buf = _ss_buf
        tt_buf = _tt_buf
    data = []
    batch_data_idx = 0
    while batch_data_idx < len(ss_buf):
        x, len_x = padding_batch_data(
            ss_buf[batch_data_idx:batch_data_idx + self._batch_size],
            self._vocab_source.eos_id)
        y, len_y = padding_batch_data(
            tt_buf[batch_data_idx:batch_data_idx + self._batch_size],
            self._vocab_target.eos_id)
        batch_data_idx += self._batch_size
        data.append((len(len_x), {
            self.input_fields[GlobalNames.PH_FEATURE_IDS_NAME]: x,
            self.input_fields[GlobalNames.PH_FEATURE_LENGTH_NAME]: len_x,
            self.input_fields[GlobalNames.PH_LABEL_IDS_NAME]: y,
            self.input_fields[GlobalNames.PH_LABEL_LENGTH_NAME]: len_y
        }))
    return data

def infer(sess,
          prediction_op,
          infer_data,
          output,
          vocab_source,
          vocab_target,
          delimiter=" ",
          output_attention=False,
          to_char_level=False,
          verbose=True):
    """ Runs inference on the data and saves the prediction results.

    Args:
        sess: `tf.Session`.
        prediction_op: Tensorflow operation for inference.
        infer_data: An iterable instance; each element is a packed feeding
          dictionary for `sess`.
        output: Output file name, `str`.
        vocab_source: A `Vocab` instance for the source side feature map.
        vocab_target: A `Vocab` instance for the target side feature map.
        delimiter: The delimiter of the output token sequence.
        output_attention: Whether to output attention information.
        to_char_level: Whether to split words into characters (only for Chinese).
        verbose: Print inference information if set True.

    Returns: A tuple `(sources, hypothesis, scores)`: two lists of strings
      and a numpy array of scores.
    """
    attentions = dict()
    hypothesis = []
    scores = []
    sources = []
    cnt = 0
    for data in infer_data:
        source_tokens = [vocab_source.convert_to_wordlist(
            x, bpe_decoding=False, reverse_seq=False)
            for x in data["feature_ids"]]
        x_str = [delimiter.join(x) for x in source_tokens]
        prediction, score, att = _infer(
            sess=sess,
            feed_dict=data["feed_dict"],
            prediction_op=prediction_op,
            batch_size=len(x_str),
            top_k=1,
            output_attention=output_attention)
        sources.extend(x_str)
        scores.append(score)
        hypothesis.extend([
            delimiter.join(vocab_target.convert_to_wordlist(prediction[sample_idx]))
            for sample_idx in range(len(prediction))])
        if output_attention and att is not None:
            candidate_tokens = [vocab_target.convert_to_wordlist(
                prediction[idx], bpe_decoding=False, reverse_seq=False)
                for idx in range(len(x_str))]
            attentions.update(pack_batch_attention_dict(
                cnt, source_tokens, candidate_tokens, att))
        cnt += len(x_str)
        if verbose:
            tf.logging.info(cnt)
    if to_char_level:
        hypothesis = to_chinese_char(hypothesis)
    if output:
        with open_file(output, mode="w") as fw:
            fw.write("\n".join(hypothesis) + "\n")
    if output_attention:
        dump_attentions(output, attentions)
    return sources, hypothesis, numpy.concatenate(scores, axis=0)
