def on_end_inference(self):
    """Run compare-mt over the collected sentences and write its report.

    Does nothing when no hypotheses were collected. Otherwise the references
    and hypotheses are written (stripped, one per line) to temporary files
    under ``{report_path}/tmp``, ``compare_mt.main`` is invoked on them, the
    lines it returns are written to ``{report_path}/compare-mt.txt``, and the
    collected sentence buffers are reset to empty lists.
    """
    if not self.hyp_sents:
        return

    tmp_dir = f"{self.report_path}/tmp"
    ref_filename = f"{tmp_dir}/compare-mt.ref"
    out_filename = f"{tmp_dir}/compare-mt.out"
    # Both temporary files live in the same directory, so one call covers both.
    util.make_parent_dir(out_filename)
    for path, sentences in ((ref_filename, self.ref_sents),
                            (out_filename, self.hyp_sents)):
        with open(path, "w") as fout:
            fout.writelines(f"{sent.strip()}\n" for sent in sentences)

    # Imported lazily so the dependency is only loaded when a report is
    # actually produced.
    import xnmt.thirdparty.comparemt.compare_mt as compare_mt
    args = util.ArgClass(
        ref_file=ref_filename,
        out_file=out_filename,
        out2_file=self.out2_file,
        train_file=self.train_file,
        train_counts=self.train_counts,
        alpha=self.alpha,
        ngram=self.ngram,
        ngram_size=self.ngram_size,
        sent_size=self.sent_size,
    )
    out_lines = compare_mt.main(args)

    report_filename = f"{self.report_path}/compare-mt.txt"
    util.make_parent_dir(report_filename)
    with open(report_filename, "w") as fout:
        fout.writelines(f"{line}\n" for line in out_lines)

    self.hyp_sents, self.ref_sents, self.src_sents = [], [], []
def perform_inference(self, generator: 'model_base.GeneratorModel', src_file: str = None, trg_file: str = None, ref_file_to_report=None):
    """
    Perform inference.

    In ``'score'`` mode the reference losses are computed and written as
    rescored output, and no output is generated. In ``'forceddebug'`` mode the
    losses are computed and handed to the generation step as ``assert_scores``.
    In every other mode only the generation step runs.

    Args:
      generator: the model to be used
      src_file: path of input src file to be translated (falls back to ``self.src_file``)
      trg_file: path of file where trg translations will be written (falls back to ``self.trg_file``)
      ref_file_to_report: passed through unchanged to ``_generate_output``
    """
    src_file = src_file or self.src_file
    trg_file = trg_file or self.trg_file
    util.make_parent_dir(trg_file)

    logger.info(f'Performing inference on {src_file}')

    ref_corpus, src_corpus = self._read_corpus(generator, src_file, mode=self.mode, ref_file=self.ref_file)
    generator.set_train(False)

    scoring = self.mode == 'score'
    if scoring or self.mode == 'forceddebug':
        ref_scores = self._compute_losses(generator, ref_corpus, src_corpus, self.max_num_sents)
    else:
        ref_scores = None

    if scoring:
        self._write_rescored_output(ref_scores, self.ref_file, trg_file)
    else:
        self._generate_output(
            generator=generator,
            forced_ref_corpus=ref_corpus,
            assert_scores=ref_scores,
            src_corpus=src_corpus,
            trg_file=trg_file,
            batcher=self.batcher,
            max_src_len=self.max_src_len,
            ref_file_to_report=ref_file_to_report,
        )
    self.end_inference()
def write_html(self) -> None:
    """Prettify the accumulated HTML fragments and write them to disk.

    The entries of ``self.html_contents`` are joined with newlines, parsed
    with the ``lxml`` parser, prettified, and written as UTF-8 to
    ``{report_path}/{report_name}.html``.
    """
    pretty_html = bs("\n".join(self.html_contents), "lxml").prettify()
    html_file_name = f"{self.report_path}/{self.report_name}.html"
    util.make_parent_dir(html_file_name)
    with open(html_file_name, 'w', encoding='utf-8') as html_file:
        html_file.write(pretty_html)
def on_end_inference(self):
    """Produce the charcut HTML report for the collected sentences.

    Does nothing when no hypotheses were collected. Otherwise the collected
    hypothesis, reference and source sentences are loaded as aligned segments,
    ``charcut.run_on`` is invoked with ``{report_path}/charcut.html`` as its
    HTML output file, and the sentence buffers are reset to empty lists.
    """
    if not self.hyp_sents:
        return

    html_filename = f"{self.report_path}/charcut.html"
    util.make_parent_dir(html_filename)
    args = util.ArgClass(
        html_output_file=html_filename,
        match_size=self.match_size,
        alt_norm=self.alt_norm,
    )
    aligned_segs = charcut.load_input_segs(
        cand_segs=self.hyp_sents,
        ref_segs=self.ref_sents,
        src_segs=self.src_sents,
    )
    charcut.run_on(aligned_segs, args)

    self.hyp_sents, self.ref_sents, self.src_sents = [], [], []
def set_out_file(out_file):
    """
    Set the file to log to.

    Any previously configured output file is unset first. Two file handlers
    are then attached: one on ``logger`` writing to ``out_file`` and one on
    ``yaml_logger`` writing to ``{out_file}.yaml``. Both files are opened in
    ``'w'`` mode, i.e. existing content is truncated.

    Args:
      out_file: file name
    """
    unset_out_file()
    make_parent_dir(out_file)

    fh = logging.FileHandler(out_file, mode='w')
    fh.setLevel(settings.LOG_LEVEL_FILE)
    fh.setFormatter(MainFormatter())
    logger.addHandler(fh)

    yaml_fh = logging.FileHandler(f"{out_file}.yaml", mode='w')
    # The level only needs to be set once (it was previously set twice).
    yaml_fh.setLevel(logging.DEBUG)
    yaml_fh.setFormatter(YamlFormatter())
    yaml_logger.addHandler(yaml_fh)
def __init__(self, path, train_files, vocab_size, overwrite=False, model_prefix='sentpiece', output_format='piece', model_type='bpe', encode_extra_options=None, decode_extra_options=None):
    """
    Initialize the wrapper around the sentencepiece executables and train the tokenizer.

    Training (``spm_train`` located under ``path``) is run when either
    ``{model_prefix}.model`` or ``{model_prefix}.vocab`` is missing, or when
    ``overwrite`` is set. With ``overwrite=False`` an existing model is kept
    even if the parameters have changed. The ``spm_encode`` command line is
    stored in ``self.tokenizer_command``.

    Args:
      path: directory containing the ``spm_train`` / ``spm_encode`` executables
      train_files: files passed (comma-joined) as ``--input`` to ``spm_train``
      vocab_size: value for ``--vocab_size``
      overwrite: retrain even if the model files already exist
      model_prefix: value for ``--model_prefix``; model files are ``{model_prefix}.model`` / ``.vocab``
      output_format: value for ``--output_format`` of ``spm_encode``; also stored as ``input_format``
      model_type: value for ``--model_type``
      encode_extra_options: if given, passed as ``--extra_options=`` to the encode command
      decode_extra_options: if given, stored as ``--extra_options=`` in ``self.decode_extra_options``
    """
    self.sentpiece_path = path
    self.model_prefix = model_prefix
    self.output_format = output_format
    self.input_format = output_format

    if encode_extra_options:
        self.encode_extra_options = ['--extra_options=' + encode_extra_options]
    else:
        self.encode_extra_options = []
    if decode_extra_options:
        self.decode_extra_options = ['--extra_options=' + decode_extra_options]
    else:
        self.decode_extra_options = []

    make_parent_dir(model_prefix)

    model_files = (self.model_prefix + '.model', self.model_prefix + '.vocab')
    have_model = all(os.path.exists(model_file) for model_file in model_files)
    if overwrite or not have_model:
        train_command = [
            os.path.join(path, 'spm_train'),
            '--input=' + ','.join(train_files),
            '--model_prefix=' + str(model_prefix),
            '--vocab_size=' + str(vocab_size),
            '--model_type=' + str(model_type),
        ]
        # NOTE(review): the exit status of the training process is not checked.
        subprocess.call(train_command)

    encode_executable = os.path.join(self.sentpiece_path, 'spm_encode')
    self.tokenizer_command = [
        encode_executable,
        '--model=' + self.model_prefix + '.model',
        '--output_format=' + self.output_format,
    ] + self.encode_extra_options
def create_report(self, segment_actions, src_vocab, src, **kwargs):
    """Append the segmentation of one source sequence to the segment report.

    The report file ``{report_path}/segment.txt`` is opened on first use and
    kept open in ``self.report_fp`` for subsequent calls. Each call writes one
    line: the source tokens grouped into space-separated words, where every
    entry of ``segment_actions[0][0]`` is the (inclusive) index of the last
    token of a word. Tokens after the last such index are not written.

    Args:
      segment_actions: nested sequence; only ``segment_actions[0][0]`` is read
      src_vocab: lookup (indexed with the entries of ``src``) giving each token
      src: sequence of keys into ``src_vocab``
      **kwargs: accepted and ignored
    """
    if self.report_fp is None:
        report_path = self.report_path + "/segment.txt"
        util.make_parent_dir(report_path)
        # Tokens may be non-ASCII, so do not depend on the locale's encoding.
        self.report_fp = open(report_path, "w", encoding="utf-8")

    actions = segment_actions[0][0]
    tokens = [str(src_vocab[x]) for x in src]
    words = []
    start = 0
    for end in actions:
        # Join the individual tokens; stringifying the slice itself would
        # write the Python list repr (e.g. "['a', 'b']") instead of "ab".
        words.append("".join(tokens[start:end + 1]))
        start = end + 1
    print(" ".join(words), file=self.report_fp)
def plot_attention(src_words, trg_words, attention_matrix, file_name, size_x = 8.0, size_y = 8.0):
    """Render an attention matrix as a heatmap and save it to a file.

    When ``src_words`` is a numpy array it is treated as speech features and
    drawn in an extra panel next to the heatmap (via ``plot_speech_features``)
    instead of being used as y-axis labels.

    Args:
      src_words: a list of words in the source, or a numpy array of speech features
      trg_words: a list of target words
      attention_matrix: a two-dimensional numpy array where rows correspond to
                        source words and columns correspond to target words;
                        the color scale covers values between zero and one
      file_name: the name of the file to which we write the attention
      size_x: width of the main plot
      size_y: height of the plot
    """
    trg_words = [unidecode(word) for word in trg_words]
    src_is_speech = isinstance(src_words, np.ndarray)

    # The total label length decides font size and resolution.
    max_len = len(''.join(trg_words))
    if not src_is_speech:
        max_len = max(max_len, len(''.join(src_words)))
        src_words = [unidecode(word) for word in src_words]
    if max_len > 150:
        matplotlib.rc('font', size=5)
    elif max_len > 50:
        matplotlib.rc('font', size=7)
    dpi = 150 if max_len > 150 else 100

    if src_is_speech:
        num_cols = 2
        fig_width = size_x + 1.0
        gridspec = {'width_ratios': [1, size_x]}
    else:
        num_cols = 1
        fig_width = size_x + 0.0
        gridspec = None
    fig, axs = plt.subplots(nrows=1, ncols=num_cols,
                            figsize=(fig_width, size_y),
                            gridspec_kw=gridspec)
    ax = axs[1] if src_is_speech else axs

    # put the major ticks at the middle of each cell
    num_rows, num_columns = attention_matrix.shape[0], attention_matrix.shape[1]
    ax.set_xticks(np.arange(num_columns) + 0.5, minor=False)
    ax.set_yticks(np.arange(num_rows) + 0.5, minor=False)
    ax.invert_yaxis()
    if src_is_speech:
        plt.yticks([], [])

    # label axes by words
    ax.set_xticklabels(trg_words, minor=False)
    if not src_is_speech:
        ax.set_yticklabels(src_words, minor=False)
    ax.xaxis.tick_top()

    # draw the heatmap
    plt.pcolor(attention_matrix, cmap=plt.cm.Blues, vmin=0, vmax=1)
    plt.colorbar()

    if src_is_speech:
        ax = axs[0]
        plot_speech_features(feature_matrix=src_words, ax=ax, dpi=dpi)

    fig.tight_layout()
    util.make_parent_dir(file_name)
    plt.savefig(file_name, dpi=dpi)
    plt.close()
def run_preproc_task(self, overwrite=False):
    """Tokenize every input file into its corresponding output file.

    Each entry of ``self.specs`` maps a ``"filenum"`` (a file index, or
    ``"all"``) to a list of tokenizers. A file uses the tokenizers registered
    for its own index and falls back to the ``"all"`` entry otherwise. The
    tokenizers are chained via ``tokenize_stream`` and every resulting line is
    written followed by a newline.

    Args:
      overwrite: if False, files whose output already exists are skipped

    Raises:
      KeyError: if a file has neither its own spec nor an ``"all"`` spec
    """
    tokenizers = {
        my_opts["filenum"]: list(my_opts["tokenizers"])
        for my_opts in self.specs
    }
    for file_num, (in_file, out_file) in enumerate(zip(self.in_files, self.out_files)):
        if not overwrite and os.path.isfile(out_file):
            continue
        make_parent_dir(out_file)
        # Look up "all" only when needed: dict.get(key, d["all"]) would raise
        # KeyError for a missing "all" entry even if the file has its own spec.
        if file_num in tokenizers:
            my_tokenizers = tokenizers[file_num]
        else:
            my_tokenizers = tokenizers["all"]
        with open(out_file, "w", encoding='utf-8') as out_stream, \
             open(in_file, "r", encoding='utf-8') as in_stream:
            lines = in_stream
            for tokenizer in my_tokenizers:
                lines = tokenizer.tokenize_stream(lines)
            for line in lines:
                out_stream.write(f"{line}\n")
def run_preproc_task(self, overwrite=False):
    """Normalize every input file into its corresponding output file.

    Each entry of ``self.specs`` maps a ``"filenum"`` (a file index, or
    ``"all"``) to the normalizers built by ``Normalizer.from_spec``. A file
    uses the normalizers registered for its own index and falls back to the
    ``"all"`` entry otherwise. Every input line is stripped, passed through
    the normalizers in order, and written followed by a newline.

    Args:
      overwrite: if False, files whose output already exists are skipped

    Raises:
      KeyError: if a file has neither its own spec nor an ``"all"`` spec
    """
    normalizers = {
        my_opts["filenum"]: Normalizer.from_spec(my_opts["spec"])
        for my_opts in self.specs
    }
    for i, (in_file, out_file) in enumerate(zip(self.in_files, self.out_files)):
        if not overwrite and os.path.isfile(out_file):
            continue
        make_parent_dir(out_file)
        # Look up "all" only when needed: dict.get(key, d["all"]) would raise
        # KeyError for a missing "all" entry even if the file has its own spec.
        my_normalizers = normalizers[i] if i in normalizers else normalizers["all"]
        with open(out_file, "w", encoding='utf-8') as out_stream, \
             open(in_file, "r", encoding='utf-8') as in_stream:
            for line in in_stream:
                line = line.strip()
                for normalizer in my_normalizers:
                    line = normalizer.normalize(line)
                out_stream.write(line + "\n")
def __init__(self, path, train_files, vocab_size, overwrite=False, model_prefix='sentpiece', output_format='piece', model_type='bpe', hard_vocab_limit=True, encode_extra_options=None, decode_extra_options=None):
    """
    Set up the sentencepiece tokenizer configuration.

    The constructor only records the settings and the training arguments
    (``self.sentpiece_train_args``); ``self.sentpiece_processor`` starts out
    as ``None``. The ``overwrite`` flag is stored on the instance and not
    otherwise used here.

    Args:
      path: stored as ``self.sentpiece_path``
      train_files: files passed (comma-joined) as ``--input``
      vocab_size: value for ``--vocab_size``
      overwrite: stored as ``self.overwrite``
      model_prefix: value for ``--model_prefix``
      output_format: stored as both ``output_format`` and ``input_format``
      model_type: value for ``--model_type``
      hard_vocab_limit: value for ``--hard_vocab_limit`` (lower-cased string)
      encode_extra_options: if given, stored as an ``--extra_options=`` argument
      decode_extra_options: if given, stored as an ``--extra_options=`` argument
    """
    # Fail early if the sentencepiece module is not installed.
    import sentencepiece as spm

    # TODO: deprecate the path argument
    self.sentpiece_path = path
    self.model_prefix = model_prefix
    self.output_format = output_format
    self.input_format = output_format
    self.overwrite = overwrite

    if encode_extra_options:
        self.encode_extra_options = ['--extra_options=' + encode_extra_options]
    else:
        self.encode_extra_options = []
    if decode_extra_options:
        self.decode_extra_options = ['--extra_options=' + decode_extra_options]
    else:
        self.decode_extra_options = []

    make_parent_dir(model_prefix)

    train_options = (
        ('--input=', ','.join(train_files)),
        ('--model_prefix=', str(model_prefix)),
        ('--vocab_size=', str(vocab_size)),
        ('--hard_vocab_limit=', str(hard_vocab_limit).lower()),
        ('--model_type=', str(model_type)),
    )
    self.sentpiece_train_args = [flag + value for flag, value in train_options]

    self.sentpiece_processor = None
def run_preproc_task(self, overwrite=False):
    """Write a filtered vocabulary file for every input file.

    Each entry of ``self.specs`` maps a ``"filenum"`` (a file index, or
    ``"all"``) to the filters built by ``VocabFilterer.from_spec``. For every
    input file the whitespace-separated words are counted, the counts are
    passed through the file's filters in order (falling back to the ``"all"``
    entry when the file has no spec of its own), and the remaining words are
    written one per line.

    Args:
      overwrite: if False, files whose output already exists are skipped

    Raises:
      KeyError: if a file has neither its own spec nor an ``"all"`` spec
    """
    filters = {
        my_opts["filenum"]: VocabFilterer.from_spec(my_opts["spec"])
        for my_opts in self.specs
    }
    for i, (in_file, out_file) in enumerate(zip(self.in_files, self.out_files)):
        if not overwrite and os.path.isfile(out_file):
            continue
        make_parent_dir(out_file)
        with open(out_file, "w", encoding='utf-8') as out_stream, \
             open(in_file, "r", encoding='utf-8') as in_stream:
            vocab = {}
            for line in in_stream:
                for word in line.strip().split():
                    vocab[word] = vocab.get(word, 0) + 1
            # Look up "all" only when needed: dict.get(key, d["all"]) would
            # raise KeyError for a missing "all" entry even if the file has
            # its own spec.
            my_filters = filters[i] if i in filters else filters["all"]
            for my_filter in my_filters:
                vocab = my_filter.filter(vocab)
            for word in vocab.keys():
                out_stream.write(word + "\n")
def set_out_file(out_file):
    """
    Set the file to log to. Before calling this, logs are only passed to stdout/stderr.

    Any previously configured output file is unset first. ``out_file`` is
    truncated and the lines of ``_preamble_content`` are written to it; the
    main log handler is then opened on the same file with the default file
    mode, so log records are appended after the preamble. A second handler on
    ``yaml_logger`` writes to ``{out_file}.yaml`` (truncating it).

    Args:
      out_file: file name
    """
    unset_out_file()
    make_parent_dir(out_file)

    with open(out_file, mode="w") as f_out:
        for line in _preamble_content:
            f_out.write(f"{line}\n")

    fh = logging.FileHandler(out_file)
    fh.setLevel(settings.LOG_LEVEL_FILE)
    fh.setFormatter(MainFormatter())
    logger.addHandler(fh)

    yaml_fh = logging.FileHandler(f"{out_file}.yaml", mode='w')
    # The level only needs to be set once (it was previously set twice).
    yaml_fh.setLevel(logging.DEBUG)
    yaml_fh.setFormatter(YamlFormatter())
    yaml_logger.addHandler(yaml_fh)
def plot_speech_features(feature_matrix, file_name=None, vertical = True, ax=None, length = 8.0, dpi=100):
    """Plot speech feature matrix.

    Args:
      feature_matrix: a two-dimensional numpy array of speech features; the
                      color scale covers values between -1 and 1
      file_name: the name of the file to which we write the plot; if not
                 given, nothing is saved and the plt context is left un-closed
      vertical: if True, the matrix is transposed before plotting, so that the
                time dimension is projected onto the y axis, otherwise the x axis
      ax: if given, draw on this matplotlib axis; otherwise create a new figure
      length: figure length (if ax is not given)
      dpi: resolution used when saving to ``file_name``
    """
    # Compare against None explicitly instead of relying on the truthiness of
    # the axis object.
    if ax is None:
        plt.subplots(figsize=(1.0, length))
    if vertical:
        feature_matrix = feature_matrix.T
    if ax is None:
        plt.pcolor(feature_matrix, cmap=plt.cm.jet, vmin=-1, vmax=1)
        plt.axis('off')
    else:
        ax.pcolor(feature_matrix, cmap=plt.cm.jet, vmin=-1, vmax=1)
        ax.axis('off')
    if file_name is not None:
        util.make_parent_dir(file_name)
        plt.savefig(file_name, dpi=dpi)
        plt.close()
def __call__(self, generator, src_file=None, trg_file=None, candidate_id_file=None):
    """
    Run inference with ``generator`` and write the result to the target file.

    Depending on ``self.mode`` this generates output for every source sentence
    (optionally forced to a reference), and/or scores references:

    * ``"forced"``: generation is forced to the references read from ``self.ref_file``.
    * ``"forceddebug"``: as ``"forced"``, and additionally the loss of each
      reference is computed and compared against the forced decoding score;
      mismatches are logged as errors.
    * ``"score"``: ``self.ref_file`` is read in n-best format
      (``index ||| hypothesis``), each hypothesis is scored, and the n-best
      lines are written back with ``||| score=...`` appended. No output is generated.
    * any other mode: plain generation.

    Args:
      generator (GeneratorModel): the model to be used
      src_file (str): path of input src file to be translated (falls back to ``self.src_file``)
      trg_file (str): path of file where trg translations will be written (falls back to ``self.trg_file``)
      candidate_id_file (str): if we are doing something like retrieval where we select from fixed candidates, sometimes we want to limit our candidates to a certain subset of the full set.
                               this setting allows us to do this.

    Raises:
      RuntimeError: if the mode requires a reference file and none is configured
    """
    args = dict(src_file=src_file or self.src_file,
                trg_file=trg_file or self.trg_file,
                ref_file=self.ref_file,
                max_src_len=self.max_src_len,
                post_process=self.post_process,
                candidate_id_file=candidate_id_file,
                report_path=self.report_path,
                report_type=self.report_type,
                beam=self.beam,
                max_len=self.max_len,
                len_norm_type=self.len_norm_type,
                mode=self.mode)
    mode = args["mode"]

    is_reporting = isinstance(generator, Reportable) and args["report_path"] is not None

    # Corpus
    src_corpus = list(generator.src_reader.read_sents(args["src_file"]))

    # Get reference if it exists and is necessary
    if mode in ("forced", "forceddebug", "score"):
        if args["ref_file"] is None:
            raise RuntimeError(f"When performing {mode} decoding, must specify reference file")
        score_src_corpus = []
        ref_corpus = []
        with open(args["ref_file"], "r", encoding="utf-8") as fp:
            for line in fp:
                if mode == "score":
                    nbest = line.split("|||")
                    assert len(nbest) > 1, "When performing scoring, ref_file must have nbest format 'index ||| hypothesis'"
                    src_index = int(nbest[0].strip())
                    assert src_index < len(src_corpus), "The src_file has only {} instances, nbest file has invalid src_index {}".format(len(src_corpus), src_index)
                    # In score mode every hypothesis gets paired with the
                    # source sentence its n-best index refers to.
                    score_src_corpus.append(src_corpus[src_index])
                    trg_input = generator.trg_reader.read_sent(nbest[1].strip())
                else:
                    trg_input = generator.trg_reader.read_sent(line)
                ref_corpus.append(trg_input)
        if mode == "score":
            src_corpus = score_src_corpus
        elif self.max_len and any(len(s) > self.max_len for s in ref_corpus):
            logger.warning("Forced decoding with some targets being longer than max_len. Increase max_len to avoid unexpected behavior.")
    else:
        ref_corpus = None

    # Vocab
    src_vocab = generator.src_reader.vocab if hasattr(generator.src_reader, "vocab") else None
    trg_vocab = generator.trg_reader.vocab if hasattr(generator.trg_reader, "vocab") else None

    # Perform initialization
    generator.set_train(False)
    generator.initialize_generator(**args)
    if hasattr(generator, "set_post_processor"):
        generator.set_post_processor(self.get_output_processor())
    if hasattr(generator, "set_trg_vocab"):
        generator.set_trg_vocab(trg_vocab)
    if hasattr(generator, "set_reporting_src_vocab"):
        generator.set_reporting_src_vocab(src_vocab)
    if is_reporting:
        generator.set_report_resource("src_vocab", src_vocab)
        generator.set_report_resource("trg_vocab", trg_vocab)

    # If we're debugging, calculate the loss for each target sentence
    ref_scores = None
    if mode in ('forceddebug', 'score'):
        # The scores are looked up by sentence position below, so the batches
        # are packed in order; the batch size is arbitrary.
        some_batcher = xnmt.batcher.InOrderBatcher(32)
        batched_src, batched_ref = some_batcher.pack(src_corpus, ref_corpus)
        ref_scores = []
        for src, ref in zip(batched_src, batched_ref):
            dy.renew_cg(immediate_compute=settings.IMMEDIATE_COMPUTE, check_validity=settings.CHECK_VALIDITY)
            loss_expr = generator.calc_loss(src, ref, loss_calculator=LossCalculator())
            loss_value = loss_expr.value()
            if isinstance(loss_value, Iterable):
                ref_scores.extend(loss_value)
            else:
                ref_scores.append(loss_value)
        # Scores are the negated losses.
        ref_scores = [-x for x in ref_scores]

    # Make the parent directory if necessary
    make_parent_dir(args["trg_file"])

    # Perform generation of output
    if mode != 'score':
        # Saving the translated output to a trg file
        with open(args["trg_file"], 'wt', encoding='utf-8') as fp:
            src_ret = []
            for i, src in enumerate(src_corpus):
                # This is necessary when the batcher does some sort of pre-processing, e.g.
                # when the batcher pads to a particular number of dimensions
                if self.batcher:
                    self.batcher.add_single_batch(src_curr=[src], trg_curr=None, src_ret=src_ret, trg_ret=None)
                    src = src_ret.pop()[0]

                # Do the decoding
                if args["max_src_len"] is not None and len(src) > args["max_src_len"]:
                    output_txt = NO_DECODING_ATTEMPTED
                else:
                    dy.renew_cg(immediate_compute=settings.IMMEDIATE_COMPUTE, check_validity=settings.CHECK_VALIDITY)
                    ref_ids = ref_corpus[i] if ref_corpus is not None else None
                    output = generator.generate_output(src, i, forced_trg_ids=ref_ids)
                    # If debugging forced decoding, make sure it matches.
                    # The relative tolerance is applied by multiplication so
                    # that a reference score of zero cannot cause a
                    # ZeroDivisionError.
                    if ref_scores is not None and abs(output[0].score - ref_scores[i]) > 1e-5 * abs(ref_scores[i]):
                        logger.error(f'Forced decoding score {output[0].score} and loss {ref_scores[i]} do not match at sentence {i}')
                    output_txt = output[0].plaintext
                # Printing to trg file
                fp.write(f"{output_txt}\n")
    else:
        # Write every n-best line back together with its score.
        with open(args["trg_file"], 'wt', encoding='utf-8') as fp, \
             open(args["ref_file"], "r", encoding="utf-8") as nbest_fp:
            for nbest, score in zip(nbest_fp, ref_scores):
                fp.write("{} ||| score={}\n".format(nbest.strip(), score))
def run_preproc_task(self, overwrite=False):
    """Run the extractor on every input/output file pair.

    ``self.specs`` is used directly as the extractor; its ``extract_to`` is
    called for each pair of ``self.in_files`` / ``self.out_files``.

    Args:
      overwrite: if False, files whose output already exists are skipped
    """
    extractor = self.specs
    for in_file, out_file in zip(self.in_files, self.out_files):
        if not overwrite and os.path.isfile(out_file):
            continue
        make_parent_dir(out_file)
        extractor.extract_to(in_file, out_file)
def setUp(self):
    """Register the YAML representer for ``DummyClass`` and set the test output directory."""
    yaml.add_representer(DummyClass, xnmt.init_representer)
    out_dir = "test/tmp"
    self.out_dir = out_dir
    # "asdf" is only a placeholder file name inside the output directory;
    # presumably this makes sure the directory itself exists.
    util.make_parent_dir(f"{out_dir}/asdf")