def align(self, output=None, align_encoder_id=0, reverse=False, **kwargs):
    if len(self.filenames.test) != len(self.extensions):
        raise Exception('wrong number of input files')

    binary = self.binary and any(self.binary)

    paths = self.filenames.test or [None]
    all_lines = utils.read_lines(paths, binary=self.binary)

    for line_id, lines in enumerate(all_lines):
        token_ids = [
            sentence if vocab is None else
            utils.sentence_to_token_ids(sentence, vocab.vocab,
                                        character_level=self.character_level.get(ext))
            for ext, vocab, sentence in zip(self.extensions, self.vocabs, lines)
        ]
        _, weights = self.seq2seq_model.step(data=[token_ids], align=True, update_model=False)

        trg_vocab = self.trg_vocab[0]
        trg_token_ids = token_ids[len(self.src_ext)]
        trg_tokens = [trg_vocab.reverse[i] if i < len(trg_vocab.reverse) else utils._UNK
                      for i in trg_token_ids]

        weights = weights.squeeze()
        max_len = weights.shape[1]

        if binary:
            src_tokens = None
        else:
            src_tokens = lines[align_encoder_id].split()[:max_len - 1] + [utils._EOS]
        trg_tokens = trg_tokens[:weights.shape[0] - 1] + [utils._EOS]

        output_file = output and '{}.{}.pdf'.format(output, line_id + 1)
        utils.heatmap(src_tokens, trg_tokens, weights, output_file=output_file, reverse=reverse)

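# A minimal, standalone sketch of what a helper like `utils.heatmap` above
# presumably does: plot an attention matrix with source tokens on one axis and
# target tokens on the other. Everything here (function name, tokens,
# matplotlib usage) is illustrative, not taken from the original implementation.
import numpy as np
import matplotlib.pyplot as plt

def plot_attention(src_tokens, trg_tokens, weights, output_file=None):
    # weights: array of shape (len(trg_tokens), len(src_tokens))
    fig, ax = plt.subplots()
    ax.imshow(weights, cmap='Greys', aspect='auto')
    ax.set_xticks(range(len(src_tokens)))
    ax.set_xticklabels(src_tokens, rotation=90)
    ax.set_yticks(range(len(trg_tokens)))
    ax.set_yticklabels(trg_tokens)
    if output_file is not None:
        fig.savefig(output_file)  # e.g. 'align.1.pdf'
    else:
        plt.show()

# Example call with dummy data:
# plot_attention(['the', 'cat', '</S>'], ['le', 'chat', '</S>'],
#                np.random.rand(3, 3))
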
def decode(self, output=None, remove_unk=False, raw_output=False, max_test_size=None, **kwargs):
    utils.log('starting decoding')

    # empty `test` means that we read from standard input, which is not possible with multiple encoders
    # assert len(self.src_ext) == 1 or self.filenames.test
    # check that there is the right number of files for decoding
    # assert not self.filenames.test or len(self.filenames.test) == len(self.src_ext)

    output_file = None
    try:
        output_file = sys.stdout if output is None else open(output, 'w')

        paths = self.filenames.test or [None]
        lines = utils.read_lines(paths, binary=self.binary)

        if max_test_size:
            lines = itertools.islice(lines, max_test_size)

        if not self.filenames.test:  # interactive mode
            batch_size = 1
        else:
            batch_size = self.batch_size
            lines = list(lines)

        hypothesis_iter = self.decode_batch(lines, batch_size, remove_unk=remove_unk)
        for hypothesis, raw in hypothesis_iter:
            if raw_output:
                hypothesis = raw
            output_file.write(hypothesis + '\n')
            output_file.flush()
    finally:
        # don't close sys.stdout, only files we opened ourselves
        if output_file is not None and output_file is not sys.stdout:
            output_file.close()

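# A standalone sketch (an assumption, not the actual `utils.read_lines`) of the
# reading behaviour the decoder above relies on: with no test files, lines are
# streamed one at a time from standard input (interactive mode); otherwise the
# parallel files are read side by side, one tuple per sentence.
import sys

def read_lines_sketch(paths):
    if not paths or paths == [None]:
        for line in sys.stdin:  # interactive mode: stream, don't buffer
            yield (line.rstrip('\n'),)
    else:
        files = [open(path) for path in paths]
        try:
            for lines in zip(*files):  # one tuple per parallel sentence
                yield tuple(line.rstrip('\n') for line in lines)
        finally:
            for f in files:
                f.close()
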
def align(self, sess, output=None, wav_files=None, **kwargs):
    if len(self.src_ext) != 1:
        raise NotImplementedError
    if len(self.filenames.test) != len(self.extensions):
        raise Exception('wrong number of input files')

    for line_id, lines in enumerate(utils.read_lines(self.filenames.test, self.extensions,
                                                     self.binary_input)):
        token_ids = [
            utils.sentence_to_token_ids(sentence, vocab.vocab, character_level=char_level)
            if vocab is not None else sentence
            for vocab, sentence, char_level in zip(self.vocabs, lines, self.character_level)
        ]
        _, weights = self.seq2seq_model.step(sess, data=[token_ids], forward_only=True,
                                             align=True, update_model=False)

        trg_tokens = [self.trg_vocab.reverse[i] if i < len(self.trg_vocab.reverse) else utils._UNK
                      for i in token_ids[-1]]

        weights = weights.squeeze()[:len(trg_tokens), :len(token_ids[0])].T
        max_len = weights.shape[0]

        if self.binary_input[0]:
            src_tokens = None
        else:
            src_tokens = lines[0].split()[:max_len]

        if wav_files is not None:
            wav_file = wav_files[line_id]
        else:
            wav_file = None

        output_file = '{}.{}.svg'.format(output, line_id + 1) if output is not None else None
        utils.heatmap(src_tokens, trg_tokens, weights.T, wav_file=wav_file, output_file=output_file)

def align(self, sess, output=None, align_encoder_id=0, **kwargs):
    if self.binary and any(self.binary):
        raise NotImplementedError
    if len(self.filenames.test) != len(self.extensions):
        raise Exception('wrong number of input files')

    for line_id, lines in enumerate(utils.read_lines(self.filenames.test)):
        token_ids = [
            sentence if vocab is None else
            utils.sentence_to_token_ids(sentence, vocab.vocab,
                                        character_level=self.character_level.get(ext))
            for ext, vocab, sentence in zip(self.extensions, self.vocabs, lines)
        ]
        _, weights = self.seq2seq_model.step(sess, data=[token_ids], forward_only=True,
                                             align=True, update_model=False)

        trg_vocab = self.trg_vocab[0]  # FIXME
        trg_token_ids = token_ids[len(self.src_ext)]
        trg_tokens = [trg_vocab.reverse[i] if i < len(trg_vocab.reverse) else utils._UNK
                      for i in trg_token_ids]

        weights = weights.squeeze()
        max_len = weights.shape[1]

        utils.debug(weights)

        trg_tokens.append(utils._EOS)
        src_tokens = lines[align_encoder_id].split()[:max_len - 1] + [utils._EOS]

        output_file = '{}.{}.svg'.format(output, line_id + 1) if output is not None else None
        utils.heatmap(src_tokens, trg_tokens, weights, output_file=output_file)

def decode(self, sess, beam_size, output=None, remove_unk=False, early_stopping=True,
           use_edits=False, **kwargs):
    utils.log('starting decoding')

    # empty `test` means that we read from standard input, which is not possible with multiple encoders
    assert len(self.src_ext) == 1 or self.filenames.test
    # we can't read binary data from standard input
    assert self.filenames.test or self.src_ext[0] not in self.binary_input
    # check that there is the right number of files for decoding
    assert not self.filenames.test or len(self.filenames.test) == len(self.src_ext)

    output_file = None
    try:
        output_file = sys.stdout if output is None else open(output, 'w')
        lines = utils.read_lines(self.filenames.test, self.src_ext, self.binary_input)

        if self.filenames.test is None:  # interactive mode
            batch_size = 1
        else:
            batch_size = self.batch_size
            lines = list(lines)

        hypothesis_iter = self._decode_batch(sess, lines, batch_size, beam_size=beam_size,
                                             early_stopping=early_stopping, remove_unk=remove_unk,
                                             use_edits=use_edits)
        for hypothesis in hypothesis_iter:
            output_file.write(hypothesis + '\n')
            output_file.flush()
    finally:
        # don't close sys.stdout, only files we opened ourselves
        if output_file is not None and output_file is not sys.stdout:
            output_file.close()

def decode(self, sess, beam_size, output=None, remove_unk=False, **kwargs):
    utils.log('starting decoding')

    # empty `test` means that we read from standard input, which is not possible with multiple encoders
    assert len(self.src_ext) == 1 or self.filenames.test
    # we can't read binary data from standard input
    assert self.filenames.test or self.src_ext[0] not in self.binary_input
    # check that there is the right number of files for decoding
    assert not self.filenames.test or len(self.filenames.test) == len(self.src_ext)

    output_file = None
    try:
        output_file = sys.stdout if output is None else open(output, 'w')

        for lines in utils.read_lines(self.filenames.test, self.src_ext, self.binary_input):
            trg_sentence = self._decode_sentence(sess, lines, beam_size, remove_unk)
            output_file.write(trg_sentence + '\n')
            output_file.flush()
    finally:
        # don't close sys.stdout, only files we opened ourselves
        if output_file is not None and output_file is not sys.stdout:
            output_file.close()

def evaluate(self, sess, beam_size, score_function, on_dev=True, output=None, remove_unk=False,
             max_dev_size=None, script_dir='scripts', early_stopping=True, use_edits=False,
             **kwargs):
    """
    :param score_function: name of the scoring function used to score and rank models
      (typically 'bleu_score')
    :param on_dev: if True, evaluate the dev corpus, otherwise evaluate the test corpus
    :param output: save the hypotheses to this file
    :param remove_unk: remove the UNK symbols from the output
    :param max_dev_size: maximum number of lines to read from dev files
    :param script_dir: parameter of scoring functions
    :return: scores of each corpus to evaluate
    """
    utils.log('starting decoding')
    assert on_dev or len(self.filenames.test) == len(self.extensions)
    filenames = self.filenames.dev if on_dev else [self.filenames.test]

    # convert `output` into a list, for zip
    if isinstance(output, str):
        output = [output]
    elif output is None:
        output = [None] * len(filenames)

    scores = []

    for filenames_, output_ in zip(filenames, output):  # evaluation on multiple corpora
        lines = list(utils.read_lines(filenames_, self.extensions, self.binary_input))
        if on_dev and max_dev_size:
            lines = lines[:max_dev_size]

        hypotheses = []
        references = []

        output_file = None
        try:
            if output_ is not None:
                output_file = open(output_, 'w')

            *src_sentences, trg_sentences = zip(*lines)
            src_sentences = list(zip(*src_sentences))

            hypothesis_iter = self._decode_batch(sess, src_sentences, self.batch_size,
                                                 beam_size=beam_size, early_stopping=early_stopping,
                                                 remove_unk=remove_unk, use_edits=use_edits)
            for sources, hypothesis, reference in zip(src_sentences, hypothesis_iter, trg_sentences):
                if use_edits:
                    reference = utils.reverse_edits(sources[0], reference)

                hypotheses.append(hypothesis)
                references.append(reference.strip().replace('@@ ', ''))

                if output_file is not None:
                    output_file.write(hypothesis + '\n')
                    output_file.flush()
        finally:
            if output_file is not None:
                output_file.close()

        # default scoring function is utils.bleu_score
        score, score_summary = getattr(evaluation, score_function)(hypotheses, references,
                                                                   script_dir=script_dir)

        # print the scoring information
        score_info = []
        if self.name is not None:
            score_info.append(self.name)
        score_info.append('score={:.2f}'.format(score))
        if score_summary:
            score_info.append(score_summary)

        utils.log(' '.join(map(str, score_info)))
        scores.append(score)

    return scores

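# How the `*src_sentences, trg_sentences = zip(*lines)` transposition above
# works, shown on toy data: `lines` holds one tuple per sentence with the
# target side last, and zip(*lines) turns this row-major layout into one
# column per extension. The strings here are made up for illustration.
lines = [('src1-a', 'src2-a', 'trg-a'),
         ('src1-b', 'src2-b', 'trg-b')]
*src_columns, trg_sentences = zip(*lines)
src_sentences = list(zip(*src_columns))  # back to one tuple of sources per sentence
assert src_sentences == [('src1-a', 'src2-a'), ('src1-b', 'src2-b')]
assert trg_sentences == ('trg-a', 'trg-b')
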
def evaluate(self, score_functions, on_dev=True, output=None, remove_unk=False, max_dev_size=None,
             raw_output=False, fix_edits=True, max_test_size=None, post_process_script=None,
             unk_replace=False, **kwargs):
    """
    Decode a dev or test set, and perform evaluation with respect to the gold standard, using the
    provided scoring functions. If `output` is defined, also save the decoding output to this file.
    When evaluating development data (`on_dev` set to True), several dev sets can be specified
    (`dev_prefix` parameter in configuration files), and a score is computed for each of them.

    :param score_functions: names of the scoring functions used to score and rank models
      (typically 'bleu')
    :param on_dev: if True, evaluate the dev corpus, otherwise evaluate the test corpus
    :param output: save the hypotheses to this file
    :param remove_unk: remove the UNK symbols from the output
    :param max_dev_size: maximum number of lines to read from dev files
    :param max_test_size: maximum number of lines to read from test files
    :param raw_output: save raw decoder output (don't do post-processing like UNK deletion or
      subword concatenation). The evaluation is still done with the post-processed output.
    :param fix_edits: when predicting edit operations, pad shorter hypotheses with KEEP symbols.
    :return: scores of each corpus to evaluate
    """
    utils.log('starting evaluation')

    if on_dev:
        filenames = self.filenames.dev
    else:
        filenames = [self.filenames.test]

    # convert `output` into a list, for zip
    if isinstance(output, str):
        output = [output]
    elif output is None:
        output = [None] * len(filenames)

    scores = []

    # evaluation on multiple corpora
    for dev_id, (filenames_, output_, prefix) in enumerate(zip(filenames, output, self.dev_prefix)):
        if self.ref_ext is not None:
            filenames_ = filenames_[:len(self.src_ext)] + filenames_[-1:]

        if self.dev_batches:
            dev_batches = self.dev_batches[dev_id]
            dev_loss = sum(self.seq2seq_model.step(batch, update_model=False).loss * len(batch)
                           for batch in dev_batches)
            dev_loss /= sum(map(len, dev_batches))
        else:
            # TODO
            dev_loss = 0

        src_lines = list(utils.read_lines(filenames_[:len(self.src_ext)],
                                          binary=self.binary[:len(self.src_ext)]))
        trg_lines = list(utils.read_lines([filenames_[len(self.src_ext)]]))

        # group multiple references per source sentence
        assert len(trg_lines) % len(src_lines) == 0
        ref_count = len(trg_lines) // len(src_lines)
        references = []
        for i in range(len(src_lines)):
            ref = trg_lines[i * ref_count:(i + 1) * ref_count]
            ref = [ref_[0].strip().replace('@@ ', '').replace('@@', '') for ref_ in ref]
            references.append(ref)

        if on_dev and max_dev_size:
            max_size = max_dev_size
        elif not on_dev and max_test_size:
            max_size = max_test_size
        else:
            max_size = len(src_lines)

        src_lines = src_lines[:max_size]
        references = references[:max_size]

        hypotheses = []

        output_file = None
        try:
            if output_ is not None:
                output_file = open(output_, 'w')

            hypothesis_iter = self.decode_batch(src_lines, self.batch_size, remove_unk=remove_unk,
                                                fix_edits=fix_edits, unk_replace=unk_replace)

            if post_process_script is not None:
                # pipe the hypotheses through an external post-processing script; keep the result
                # in fresh names so that `hypotheses` isn't filled twice by the loop below
                texts, raws = zip(*hypothesis_iter)
                data = '\n'.join(texts).encode()
                data = Popen([post_process_script], stdout=PIPE,
                             stdin=PIPE).communicate(input=data)[0].decode()
                hypothesis_iter = zip(data.splitlines(), raws)

            for hypothesis, raw in hypothesis_iter:
                hypotheses.append(hypothesis)
                if output_file is not None:
                    if raw_output:
                        hypothesis = raw
                    output_file.write(hypothesis + '\n')
                    output_file.flush()
        finally:
            if output_file is not None:
                output_file.close()

        scores_ = []
        summary = None

        for score_function in score_functions:
            try:
                if score_function != 'bleu':
                    references_ = [ref[0] for ref in references]
                else:
                    references_ = references

                if score_function == 'loss':
                    score = dev_loss
                    reversed_ = True
                else:
                    fun = getattr(evaluation, 'corpus_' + score_function)
                    try:
                        reversed_ = fun.reversed
                    except AttributeError:
                        reversed_ = False
                    score, score_summary = fun(hypotheses, references_)
                    summary = summary or score_summary

                scores_.append((score_function, score, reversed_))
            except Exception:  # a scoring function may fail without aborting the evaluation
                pass

        score_info = ['{}={:.2f}'.format(key, value) for key, value, _ in scores_]
        score_info.insert(0, prefix)
        if summary:
            score_info.append(summary)
        if self.name is not None:
            score_info.insert(0, self.name)

        utils.log(' '.join(map(str, score_info)))

        # main score (used to rank models)
        _, score, reversed_ = scores_[0]
        scores.append(-score if reversed_ else score)

    return scores

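# A sketch of the scoring-function contract assumed by the dispatch above:
# `evaluation` exposes functions named 'corpus_<metric>' returning a
# (score, summary) pair, and a metric where lower is better marks itself with
# a `reversed` attribute so the caller knows to negate it. `corpus_ter` below
# is a made-up stand-in, not the project's real implementation.
def corpus_ter(hypotheses, references):
    # dummy error rate: fraction of hypotheses that differ from their reference
    errors = sum(hyp != ref for hyp, ref in zip(hypotheses, references))
    return 100.0 * errors / len(hypotheses), None

corpus_ter.reversed = True  # lower TER is better

fun = corpus_ter
reversed_ = getattr(fun, 'reversed', False)
score, summary = fun(['a b c'], ['a b d'])
main_score = -score if reversed_ else score  # negated, so higher is always better
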
def evaluate(self, score_function, on_dev=True, output=None, remove_unk=False, max_dev_size=None,
             raw_output=False, fix_edits=True, max_test_size=None, post_process_script=None,
             **kwargs):
    """
    Decode a dev or test set, and perform evaluation with respect to the gold standard, using the
    provided scoring function. If `output` is defined, also save the decoding output to this file.
    When evaluating development data (`on_dev` set to True), several dev sets can be specified
    (`dev_prefix` parameter in configuration files), and a score is computed for each of them.

    :param score_function: name of the scoring function used to score and rank models
      (typically 'bleu_score')
    :param on_dev: if True, evaluate the dev corpus, otherwise evaluate the test corpus
    :param output: save the hypotheses to this file
    :param remove_unk: remove the UNK symbols from the output
    :param max_dev_size: maximum number of lines to read from dev files
    :param max_test_size: maximum number of lines to read from test files
    :param raw_output: save raw decoder output (don't do post-processing like UNK deletion or
      subword concatenation). The evaluation is still done with the post-processed output.
    :param fix_edits: when predicting edit operations, pad shorter hypotheses with KEEP symbols.
    :return: scores of each corpus to evaluate
    """
    utils.log('starting decoding')

    if on_dev:
        filenames = self.filenames.dev
    else:
        filenames = [self.filenames.test]

    # convert `output` into a list, for zip
    if isinstance(output, str):
        output = [output]
    elif output is None:
        output = [None] * len(filenames)

    scores = []

    for filenames_, output_, prefix in zip(filenames, output, self.dev_prefix):
        # evaluation on multiple corpora
        extensions = list(self.extensions)
        if self.ref_ext is not None:
            extensions.append(self.ref_ext)

        lines = list(utils.read_lines(filenames_, binary=self.binary))
        if on_dev and max_dev_size:
            lines = lines[:max_dev_size]
        elif not on_dev and max_test_size:
            lines = lines[:max_test_size]

        hypotheses = []
        references = []

        output_file = None
        try:
            if output_ is not None:
                output_file = open(output_, 'w')

            lines_ = list(zip(*lines))
            src_sentences = list(zip(*lines_[:len(self.src_ext)]))
            trg_sentences = list(zip(*lines_[len(self.src_ext):]))

            hypothesis_iter = self.decode_batch(lines, self.batch_size, remove_unk=remove_unk,
                                                fix_edits=fix_edits)

            for sources, hypothesis, reference in zip(src_sentences, hypothesis_iter, trg_sentences):
                if self.ref_ext is not None and on_dev:
                    reference = reference[-1]
                else:
                    reference = reference[0]  # single output for now

                hypothesis, raw = hypothesis
                hypotheses.append(hypothesis)
                references.append(reference.strip().replace('@@ ', ''))

                if output_file is not None:
                    if raw_output:
                        hypothesis = raw
                    output_file.write(hypothesis + '\n')
                    output_file.flush()
        finally:
            if output_file is not None:
                output_file.close()

        if post_process_script is not None:
            data = '\n'.join(hypotheses).encode()
            data = Popen([post_process_script], stdout=PIPE,
                         stdin=PIPE).communicate(input=data)[0].decode()
            hypotheses = data.splitlines()

        # default scoring function is utils.bleu_score
        score, score_summary = getattr(evaluation, score_function)(hypotheses, references)

        # print scoring information
        score_info = [prefix, 'score={:.2f}'.format(score)]
        if score_summary:
            score_info.append(score_summary)
        if self.name is not None:
            score_info.insert(0, self.name)

        utils.log(' '.join(map(str, score_info)))
        scores.append(score)

    return scores

def evaluate(self, score_functions, on_dev=True, output=None, remove_unk=False, max_dev_size=None,
             raw_output=False, fix_edits=True, max_test_size=None, post_process_script=None,
             unk_replace=False, **kwargs):
    """
    Decode a dev or test set, and perform evaluation with respect to the gold standard, using the
    provided scoring functions. If `output` is defined, also save the decoding output to this file.
    When evaluating development data (`on_dev` set to True), several dev sets can be specified
    (`dev_prefix` parameter in configuration files), and a score is computed for each of them.

    :param score_functions: names of the scoring functions used to score and rank models
      (typically 'bleu')
    :param on_dev: if True, evaluate the dev corpus, otherwise evaluate the test corpus
    :param output: save the hypotheses to this file
    :param remove_unk: remove the UNK symbols from the output
    :param max_dev_size: maximum number of lines to read from dev files
    :param max_test_size: maximum number of lines to read from test files
    :param raw_output: save raw decoder output (don't do post-processing like UNK deletion or
      subword concatenation). The evaluation is still done with the post-processed output.
    :param fix_edits: when predicting edit operations, pad shorter hypotheses with KEEP symbols.
    :return: scores of each corpus to evaluate
    """
    utils.log('starting evaluation')

    if on_dev:
        filenames = self.filenames.dev
    else:
        filenames = [self.filenames.test]

    # convert `output` into a list, for zip
    if isinstance(output, str):
        output = [output]
    elif output is None:
        output = [None] * len(filenames)

    scores = []

    # evaluation on multiple corpora
    for dev_id, (filenames_, output_, prefix) in enumerate(zip(filenames, output, self.dev_prefix)):
        if self.dev_batches:
            dev_batches = self.dev_batches[dev_id]
            dev_loss = sum(self.seq2seq_model.step(batch, update_model=False).loss * len(batch)
                           for batch in dev_batches)
            dev_loss /= sum(map(len, dev_batches))
        else:
            # TODO
            dev_loss = 0

        extensions = list(self.extensions)
        if self.ref_ext is not None:
            extensions.append(self.ref_ext)

        lines = list(utils.read_lines(filenames_, binary=self.binary))
        if on_dev and max_dev_size:
            lines = lines[:max_dev_size]
        elif not on_dev and max_test_size:
            lines = lines[:max_test_size]

        hypotheses = []
        references = []

        output_file = None
        try:
            if output_ is not None:
                output_file = open(output_, 'w', encoding='utf-8')

            lines_ = list(zip(*lines))
            src_sentences = list(zip(*lines_[:len(self.src_ext)]))
            trg_sentences = list(zip(*lines_[len(self.src_ext):]))

            hypothesis_iter = self.decode_batch(lines, self.batch_size, remove_unk=remove_unk,
                                                fix_edits=fix_edits, unk_replace=unk_replace)

            for sources, hypothesis, reference in zip(src_sentences, hypothesis_iter, trg_sentences):
                if self.ref_ext is not None and on_dev:
                    reference = reference[-1]
                else:
                    reference = reference[0]  # single output for now

                # here `hypothesis` is an n-best list (e.g. 10 items), each a token sequence
                hypothesis, raw = hypothesis
                hypotheses.append(hypothesis)
                references.append(reference.strip().replace('@@ ', ''))

                if output_file is not None:
                    if raw_output:
                        hypothesis = raw
                    # write the source, reference and full n-best list for inspection
                    line = 'source:\t' + str(sources) + '\nref:\t' + str(reference) + '\n'
                    for item in hypothesis:
                        line += str(item) + '\n'
                    line += '\n'
                    output_file.write(line)
                    output_file.flush()
        finally:
            if output_file is not None:
                output_file.close()

        if post_process_script is not None:
            data = '\n'.join(hypotheses).encode()
            data = Popen([post_process_script], stdout=PIPE,
                         stdin=PIPE).communicate(input=data)[0].decode()
            hypotheses = data.splitlines()

        scores_ = []
        summary = None

        for score_function in score_functions:
            try:
                if score_function == 'loss':
                    score = dev_loss
                    reversed_ = True
                else:
                    fun = getattr(evaluation, 'corpus_' + score_function)
                    try:
                        reversed_ = fun.reversed
                    except AttributeError:
                        reversed_ = False
                    # score the best (first) hypothesis of each n-best list
                    first_best = [item[0] for item in hypotheses]
                    score, score_summary = fun(first_best, references)
                    summary = summary or score_summary

                scores_.append((score_function, score, reversed_))
            except Exception:  # a scoring function may fail without aborting the evaluation
                pass

        score_info = ['{}={:.2f}'.format(key, value) for key, value, _ in scores_]
        score_info.insert(0, prefix)
        if summary:
            score_info.append(summary)
        if self.name is not None:
            score_info.insert(0, self.name)

        utils.log(' '.join(map(str, score_info)))

        # main score (used to rank models)
        _, score, reversed_ = scores_[0]
        scores.append(-score if reversed_ else score)

    return scores

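# Standalone sketch of the post-processing hook used above: hypotheses are
# joined into one blob, piped through an external script, and read back line
# by line. `cat` stands in for the user-supplied `post_process_script`; with
# a real detokenizer or desubwordizer the principle is the same.
from subprocess import Popen, PIPE

hypotheses = ['hello world', 'second line']
data = '\n'.join(hypotheses).encode()
data = Popen(['cat'], stdout=PIPE, stdin=PIPE).communicate(input=data)[0].decode()
hypotheses = data.splitlines()
assert hypotheses == ['hello world', 'second line']
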
def evaluate(self, sess, beam_size, score_function, on_dev=True, output=None, remove_unk=False,
             auxiliary_score_function=None, script_dir='scripts', **kwargs):
    """
    :param score_function: name of the scoring function used to score and rank models
      (typically 'bleu_score')
    :param on_dev: if True, evaluate the dev corpus, otherwise evaluate the test corpus
    :param output: save the hypotheses to this file
    :param remove_unk: remove the UNK symbols from the output
    :param auxiliary_score_function: optional scoring function used to display a more detailed
      summary
    :param script_dir: parameter of scoring functions
    :return: scores of each corpus to evaluate
    """
    utils.log('starting decoding')
    assert on_dev or len(self.filenames.test) == len(self.extensions)
    filenames = self.filenames.dev if on_dev else [self.filenames.test]

    # convert `output` into a list, for zip
    if isinstance(output, str):
        output = [output]
    elif output is None:
        output = [None] * len(filenames)

    scores = []

    for filenames_, output_ in zip(filenames, output):  # evaluation on multiple corpora
        lines = list(utils.read_lines(filenames_, self.extensions, self.binary_input))

        hypotheses = []
        references = []

        output_file = None  # initialize before `try`, so `finally` can't hit an unbound name
        try:
            output_file = open(output_, 'w') if output_ is not None else None

            for *src_sentences, trg_sentence in lines:
                hypotheses.append(self._decode_sentence(sess, src_sentences, beam_size, remove_unk))
                references.append(trg_sentence.strip().replace('@@ ', ''))
                if output_file is not None:
                    output_file.write(hypotheses[-1] + '\n')
                    output_file.flush()
        finally:
            if output_file is not None:
                output_file.close()

        # main scoring function (used to choose which checkpoints to keep)
        # default is utils.bleu_score
        score, score_summary = getattr(utils, score_function)(hypotheses, references, script_dir)

        # optionally use an auxiliary function to get different scoring information
        if auxiliary_score_function is not None and auxiliary_score_function != score_function:
            try:
                _, score_summary = getattr(utils, auxiliary_score_function)(hypotheses, references,
                                                                            script_dir)
            except Exception:
                pass

        # print the scoring information
        score_info = []
        if self.name is not None:
            score_info.append(self.name)
        score_info.append('score={}'.format(score))
        if score_summary:
            score_info.append(score_summary)

        utils.log(' '.join(map(str, score_info)))
        scores.append(score)

    return scores