Example #1
    def align(self, output=None, align_encoder_id=0, reverse=False, **kwargs):
        if len(self.filenames.test) != len(self.extensions):
            raise Exception('wrong number of input files')

        binary = self.binary and any(self.binary)

        paths = self.filenames.test or [None]
        all_lines = utils.read_lines(paths, binary=self.binary)

        for line_id, lines in enumerate(all_lines):
            token_ids = [
                sentence if vocab is None else
                utils.sentence_to_token_ids(sentence, vocab.vocab, character_level=self.character_level.get(ext))
                for ext, vocab, sentence in zip(self.extensions, self.vocabs, lines)
            ]

            _, weights = self.seq2seq_model.step(data=[token_ids], align=True, update_model=False)

            trg_vocab = self.trg_vocab[0]
            trg_token_ids = token_ids[len(self.src_ext)]
            trg_tokens = [trg_vocab.reverse[i] if i < len(trg_vocab.reverse) else utils._UNK for i in trg_token_ids]

            weights = weights.squeeze()
            max_len = weights.shape[1]

            if binary:
                src_tokens = None
            else:
                src_tokens = lines[align_encoder_id].split()[:max_len - 1] + [utils._EOS]
            trg_tokens = trg_tokens[:weights.shape[0] - 1] + [utils._EOS]

            output_file = output and '{}.{}.pdf'.format(output, line_id + 1)

            utils.heatmap(src_tokens, trg_tokens, weights, output_file=output_file, reverse=reverse)
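
A minimal usage sketch for this variant (the `model` instance and the output prefix are assumptions; the keyword arguments are the ones shown in the signature above):

    # hypothetical call on an already-configured model whose test files,
    # extensions and vocabularies have been loaded
    model.align(output='alignments', align_encoder_id=0, reverse=False)
    # one attention heatmap per test line: alignments.1.pdf, alignments.2.pdf, ...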
Example #2
    def decode(self, output=None, remove_unk=False, raw_output=False, max_test_size=None, **kwargs):
        utils.log('starting decoding')

        # empty `test` means that we read from standard input, which is not possible with multiple encoders
        # assert len(self.src_ext) == 1 or self.filenames.test
        # check that there is the right number of files for decoding
        # assert not self.filenames.test or len(self.filenames.test) == len(self.src_ext)

        output_file = None
        try:
            output_file = sys.stdout if output is None else open(output, 'w')
            paths = self.filenames.test or [None]
            lines = utils.read_lines(paths, binary=self.binary)

            if max_test_size:
                lines = itertools.islice(lines, max_test_size)

            if not self.filenames.test:   # interactive mode
                batch_size = 1
            else:
                batch_size = self.batch_size
                lines = list(lines)

            hypothesis_iter = self.decode_batch(lines, batch_size, remove_unk=remove_unk)

            for hypothesis, raw in hypothesis_iter:
                if raw_output:
                    hypothesis = raw

                output_file.write(hypothesis + '\n')
                output_file.flush()
        finally:
            if output_file is not None:
                output_file.close()
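
A usage sketch for this `decode` variant, assuming a configured `model` instance: with `output=None` hypotheses go to standard output, and an empty `filenames.test` switches to interactive (stdin) mode, as handled in the code above.

    # hypothetical: decode at most 100 test sentences into a file,
    # stripping UNK symbols from the output
    model.decode(output='test.hyp', remove_unk=True, max_test_size=100)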
Example #3
    def align(self, sess, output=None, wav_files=None, **kwargs):
        if len(self.src_ext) != 1:
            raise NotImplementedError

        if len(self.filenames.test) != len(self.extensions):
            raise Exception('wrong number of input files')

        for line_id, lines in enumerate(
                utils.read_lines(self.filenames.test, self.extensions,
                                 self.binary_input)):
            token_ids = [
                sentence if vocab is None else
                utils.sentence_to_token_ids(sentence, vocab.vocab, character_level=char_level)
                for vocab, sentence, char_level in zip(self.vocabs, lines, self.character_level)
            ]

            _, weights = self.seq2seq_model.step(sess,
                                                 data=[token_ids],
                                                 forward_only=True,
                                                 align=True,
                                                 update_model=False)
            trg_tokens = [
                self.trg_vocab.reverse[i]
                if i < len(self.trg_vocab.reverse) else utils._UNK
                for i in token_ids[-1]
            ]

            weights = weights.squeeze()[:len(trg_tokens), :len(token_ids[0])].T
            max_len = weights.shape[0]

            if self.binary_input[0]:
                src_tokens = None
            else:
                src_tokens = lines[0].split()[:max_len]

            if wav_files is not None:
                wav_file = wav_files[line_id]
            else:
                wav_file = None

            output_file = ('{}.{}.svg'.format(output, line_id + 1)
                           if output is not None else None)
            utils.heatmap(src_tokens,
                          trg_tokens,
                          weights.T,
                          wav_file=wav_file,
                          output_file=output_file)
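
This variant takes an explicit TensorFlow session and optional per-line audio files (speech input). A sketch under the assumption of a TF1-style session and a configured `model`; the wav paths are placeholders:

    # hypothetical: align a speech test set and attach the source audio
    # to each heatmap (written as output.1.svg, output.2.svg, ...)
    with tf.Session() as sess:
        model.align(sess, output='output', wav_files=['utt1.wav', 'utt2.wav'])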
Example #4
    def align(self, sess, output=None, align_encoder_id=0, **kwargs):
        if self.binary and any(self.binary):
            raise NotImplementedError

        if len(self.filenames.test) != len(self.extensions):
            raise Exception('wrong number of input files')

        for line_id, lines in enumerate(utils.read_lines(self.filenames.test)):
            token_ids = [
                sentence if vocab is None else
                utils.sentence_to_token_ids(sentence, vocab.vocab,
                                            character_level=self.character_level.get(ext))
                for ext, vocab, sentence in zip(self.extensions, self.vocabs, lines)
            ]

            _, weights = self.seq2seq_model.step(sess,
                                                 data=[token_ids],
                                                 forward_only=True,
                                                 align=True,
                                                 update_model=False)

            trg_vocab = self.trg_vocab[0]  # FIXME
            trg_token_ids = token_ids[len(self.src_ext)]
            trg_tokens = [
                trg_vocab.reverse[i]
                if i < len(trg_vocab.reverse) else utils._UNK
                for i in trg_token_ids
            ]

            weights = weights.squeeze()
            max_len = weights.shape[1]

            utils.debug(weights)

            trg_tokens.append(utils._EOS)
            src_tokens = lines[align_encoder_id].split()[:max_len - 1] + [utils._EOS]

            output_file = ('{}.{}.svg'.format(output, line_id + 1)
                           if output is not None else None)

            utils.heatmap(src_tokens,
                          trg_tokens,
                          weights,
                          output_file=output_file)
Example #5
    def decode(self,
               sess,
               beam_size,
               output=None,
               remove_unk=False,
               early_stopping=True,
               use_edits=False,
               **kwargs):
        utils.log('starting decoding')

        # empty `test` means that we read from standard input, which is not possible with multiple encoders
        assert len(self.src_ext) == 1 or self.filenames.test
        # we can't read binary data from standard input
        assert self.filenames.test or self.src_ext[0] not in self.binary_input
        # check that there is the right number of files for decoding
        assert not self.filenames.test or len(self.filenames.test) == len(self.src_ext)

        output_file = None
        try:
            output_file = sys.stdout if output is None else open(output, 'w')

            lines = utils.read_lines(self.filenames.test, self.src_ext,
                                     self.binary_input)

            if self.filenames.test is None:  # interactive mode
                batch_size = 1
            else:
                batch_size = self.batch_size
                lines = list(lines)

            hypothesis_iter = self._decode_batch(sess,
                                                 lines,
                                                 batch_size,
                                                 beam_size=beam_size,
                                                 early_stopping=early_stopping,
                                                 remove_unk=remove_unk,
                                                 use_edits=use_edits)

            for hypothesis in hypothesis_iter:
                output_file.write(hypothesis + '\n')
                output_file.flush()
        finally:
            if output_file is not None:
                output_file.close()
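
A sketch of calling this session-based `decode` with beam search (the session setup and file name are assumptions):

    # hypothetical: beam-search decoding of the configured test files
    with tf.Session() as sess:
        model.decode(sess, beam_size=8, output='test.hyp',
                     remove_unk=True, early_stopping=True)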
Example #6
    def decode(self, sess, beam_size, output=None, remove_unk=False, **kwargs):
        utils.log('starting decoding')

        # empty `test` means that we read from standard input, which is not possible with multiple encoders
        assert len(self.src_ext) == 1 or self.filenames.test
        # we can't read binary data from standard input
        assert self.filenames.test or self.src_ext[0] not in self.binary_input
        # check that there is the right number of files for decoding
        assert not self.filenames.test or len(self.filenames.test) == len(self.src_ext)

        output_file = None
        try:
            output_file = sys.stdout if output is None else open(output, 'w')

            for lines in utils.read_lines(self.filenames.test, self.src_ext,
                                          self.binary_input):
                trg_sentence = self._decode_sentence(sess, lines, beam_size,
                                                     remove_unk)
                output_file.write(trg_sentence + '\n')
                output_file.flush()
        finally:
            if output_file is not None:
                output_file.close()
Example #7
    def evaluate(self,
                 sess,
                 beam_size,
                 score_function,
                 on_dev=True,
                 output=None,
                 remove_unk=False,
                 max_dev_size=None,
                 script_dir='scripts',
                 early_stopping=True,
                 use_edits=False,
                 **kwargs):
        """
        :param score_function: name of the scoring function used to score and rank models
          (typically 'bleu_score')
        :param on_dev: if True, evaluate the dev corpus, otherwise evaluate the test corpus
        :param output: save the hypotheses to this file
        :param remove_unk: remove the UNK symbols from the output
        :param max_dev_size: maximum number of lines to read from dev files
        :param script_dir: parameter of scoring functions
        :return: scores of each corpus to evaluate
        """
        utils.log('starting decoding')
        assert on_dev or len(self.filenames.test) == len(self.extensions)

        filenames = self.filenames.dev if on_dev else [self.filenames.test]

        # convert `output` into a list, for zip
        if isinstance(output, str):
            output = [output]
        elif output is None:
            output = [None] * len(filenames)

        scores = []

        # evaluation on multiple corpora
        for filenames_, output_ in zip(filenames, output):
            lines = list(
                utils.read_lines(filenames_, self.extensions,
                                 self.binary_input))
            if on_dev and max_dev_size:
                lines = lines[:max_dev_size]

            hypotheses = []
            references = []

            output_file = None

            try:
                if output_ is not None:
                    output_file = open(output_, 'w')

                *src_sentences, trg_sentences = zip(*lines)
                src_sentences = list(zip(*src_sentences))

                hypothesis_iter = self._decode_batch(
                    sess,
                    src_sentences,
                    self.batch_size,
                    beam_size=beam_size,
                    early_stopping=early_stopping,
                    remove_unk=remove_unk,
                    use_edits=use_edits)
                for sources, hypothesis, reference in zip(
                        src_sentences, hypothesis_iter, trg_sentences):
                    if use_edits:
                        reference = utils.reverse_edits(sources[0], reference)

                    hypotheses.append(hypothesis)
                    references.append(reference.strip().replace('@@ ', ''))

                    if output_file is not None:
                        output_file.write(hypothesis + '\n')
                        output_file.flush()

            finally:
                if output_file is not None:
                    output_file.close()

            # default scoring function is utils.bleu_score
            score, score_summary = getattr(evaluation, score_function)(
                hypotheses, references, script_dir=script_dir)

            # print the scoring information
            score_info = []
            if self.name is not None:
                score_info.append(self.name)
            score_info.append('score={:.2f}'.format(score))
            if score_summary:
                score_info.append(score_summary)

            utils.log(' '.join(map(str, score_info)))
            scores.append(score)

        return scores
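
A sketch of running this `evaluate` on the dev corpora; `model`, the session and the file name are assumptions, and 'bleu_score' follows the default suggested in the docstring:

    # hypothetical: score every dev corpus with BLEU and keep the hypotheses
    with tf.Session() as sess:
        dev_scores = model.evaluate(sess, beam_size=8, score_function='bleu_score',
                                    on_dev=True, output='dev.hyp', remove_unk=True)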
Example #8
    def evaluate(self,
                 score_functions,
                 on_dev=True,
                 output=None,
                 remove_unk=False,
                 max_dev_size=None,
                 raw_output=False,
                 fix_edits=True,
                 max_test_size=None,
                 post_process_script=None,
                 unk_replace=False,
                 **kwargs):
        """
        Decode a dev or test set, and perform evaluation with respect to gold standard, using the provided
        scoring function. If `output` is defined, also save the decoding output to this file.
        When evaluating development data (`on_dev` set to True), several dev sets can be specified (`dev_prefix` parameter
        in configuration files), and a score is computed for each of them.

        :param score_functions: list of names of the scoring functions used to score and rank models; the first
            entry provides the main score (typically 'bleu')
        :param on_dev: if True, evaluate the dev corpus, otherwise evaluate the test corpus
        :param output: save the hypotheses to this file
        :param remove_unk: remove the UNK symbols from the output
        :param max_dev_size: maximum number of lines to read from dev files
        :param max_test_size: maximum number of lines to read from test files
        :param raw_output: save raw decoder output (don't do post-processing like UNK deletion or subword
            concatenation). The evaluation is still done with the post-processed output.
        :param fix_edits: when predicting edit operations, pad shorter hypotheses with KEEP symbols.
        :return: scores of each corpus to evaluate
        """
        utils.log('starting evaluation')

        if on_dev:
            filenames = self.filenames.dev
        else:
            filenames = [self.filenames.test]

        # convert `output` into a list, for zip
        if isinstance(output, str):
            output = [output]
        elif output is None:
            output = [None] * len(filenames)

        scores = []

        # evaluation on multiple corpora
        for dev_id, (filenames_, output_, prefix) in enumerate(
                zip(filenames, output, self.dev_prefix)):
            if self.ref_ext is not None:
                filenames_ = filenames_[:len(self.src_ext)] + filenames_[-1:]

            if self.dev_batches:
                dev_batches = self.dev_batches[dev_id]
                dev_loss = sum(
                    self.seq2seq_model.step(batch, update_model=False).loss *
                    len(batch) for batch in dev_batches)
                dev_loss /= sum(map(len, dev_batches))
            else:  # TODO
                dev_loss = 0

            src_lines = list(
                utils.read_lines(filenames_[:len(self.src_ext)],
                                 binary=self.binary[:len(self.src_ext)]))
            trg_lines = list(utils.read_lines([filenames_[len(self.src_ext)]]))

            assert len(trg_lines) % len(src_lines) == 0

            references = []
            ref_count = len(trg_lines) // len(src_lines)
            for i in range(len(src_lines)):
                ref = trg_lines[i * ref_count:(i + 1) * ref_count]
                ref = [
                    ref_[0].strip().replace('@@ ', '').replace('@@', '')
                    for ref_ in ref
                ]
                references.append(ref)

            if on_dev and max_dev_size:
                max_size = max_dev_size
            elif not on_dev and max_test_size:
                max_size = max_test_size
            else:
                max_size = len(src_lines)

            src_lines = src_lines[:max_size]
            references = references[:max_size]

            hypotheses = []
            output_file = None
            try:
                if output_ is not None:
                    output_file = open(output_, 'w')

                hypothesis_iter = self.decode_batch(src_lines,
                                                    self.batch_size,
                                                    remove_unk=remove_unk,
                                                    fix_edits=fix_edits,
                                                    unk_replace=unk_replace)
                if post_process_script is not None:
                    # pipe the raw hypotheses through an external script, then
                    # re-pair them with the raw decoder output; `hypotheses` stays
                    # empty so the loop below collects each hypothesis exactly once
                    decoded, raw = zip(*hypothesis_iter)
                    data = '\n'.join(decoded).encode()
                    data = Popen([post_process_script], stdout=PIPE,
                                 stdin=PIPE).communicate(input=data)[0].decode()
                    hypothesis_iter = zip(data.splitlines(), raw)

                for i, hypothesis in enumerate(hypothesis_iter):
                    hypothesis, raw = hypothesis
                    hypotheses.append(hypothesis)
                    if output_file is not None:
                        if raw_output:
                            hypothesis = raw
                        output_file.write(hypothesis + '\n')
                        output_file.flush()
            finally:
                if output_file is not None:
                    output_file.close()

            scores_ = []
            summary = None

            for score_function in score_functions:
                try:
                    if score_function != 'bleu':
                        references_ = [ref[0] for ref in references]
                    else:
                        references_ = references

                    if score_function == 'loss':
                        score = dev_loss
                        reversed_ = True
                    else:
                        fun = getattr(evaluation, 'corpus_' + score_function)
                        try:
                            reversed_ = fun.reversed
                        except AttributeError:
                            reversed_ = False
                        score, score_summary = fun(hypotheses, references_)
                        summary = summary or score_summary

                    scores_.append((score_function, score, reversed_))
                except:
                    pass

            score_info = [
                '{}={:.2f}'.format(key, value) for key, value, _ in scores_
            ]
            score_info.insert(0, prefix)
            if summary:
                score_info.append(summary)

            if self.name is not None:
                score_info.insert(0, self.name)

            utils.log(' '.join(map(str, score_info)))

            # main score
            _, score, reversed_ = scores_[0]
            scores.append(-score if reversed_ else score)

        return scores
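
This variant scores with a list of metrics; the first entry of `score_functions` provides the main score that is returned (negated when the metric is marked as reversed, e.g. a loss). A sketch with an assumed `model` name; 'loss' is handled explicitly above, and 'bleu' is the name the multi-reference branch checks for:

    # hypothetical: rank checkpoints by BLEU and also report the dev loss
    dev_scores = model.evaluate(score_functions=['bleu', 'loss'], on_dev=True,
                                output=None, max_dev_size=2000)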
Example #9
    def evaluate(self,
                 score_function,
                 on_dev=True,
                 output=None,
                 remove_unk=False,
                 max_dev_size=None,
                 raw_output=False,
                 fix_edits=True,
                 max_test_size=None,
                 post_process_script=None,
                 **kwargs):
        """
        Decode a dev or test set, and perform evaluation with respect to gold standard, using the provided
        scoring function. If `output` is defined, also save the decoding output to this file.
        When evaluating development data (`on_dev` set to True), several dev sets can be specified (`dev_prefix` parameter
        in configuration files), and a score is computed for each of them.

        :param score_function: name of the scoring function used to score and rank models (typically 'bleu_score')
        :param on_dev: if True, evaluate the dev corpus, otherwise evaluate the test corpus
        :param output: save the hypotheses to this file
        :param remove_unk: remove the UNK symbols from the output
        :param max_dev_size: maximum number of lines to read from dev files
        :param max_test_size: maximum number of lines to read from test files
        :param raw_output: save raw decoder output (don't do post-processing like UNK deletion or subword
            concatenation). The evaluation is still done with the post-processed output.
        :param fix_edits: when predicting edit operations, pad shorter hypotheses with KEEP symbols.
        :return: scores of each corpus to evaluate
        """
        utils.log('starting decoding')

        if on_dev:
            filenames = self.filenames.dev
        else:
            filenames = [self.filenames.test]

        # convert `output` into a list, for zip
        if isinstance(output, str):
            output = [output]
        elif output is None:
            output = [None] * len(filenames)

        scores = []

        # evaluation on multiple corpora
        for filenames_, output_, prefix in zip(filenames, output, self.dev_prefix):
            extensions = list(self.extensions)
            if self.ref_ext is not None:
                extensions.append(self.ref_ext)

            lines = list(utils.read_lines(filenames_, binary=self.binary))

            if on_dev and max_dev_size:
                lines = lines[:max_dev_size]
            elif not on_dev and max_test_size:
                lines = lines[:max_test_size]

            hypotheses = []
            references = []

            output_file = None

            try:
                if output_ is not None:
                    output_file = open(output_, 'w')

                lines_ = list(zip(*lines))

                src_sentences = list(zip(*lines_[:len(self.src_ext)]))
                trg_sentences = list(zip(*lines_[len(self.src_ext):]))

                hypothesis_iter = self.decode_batch(lines,
                                                    self.batch_size,
                                                    remove_unk=remove_unk,
                                                    fix_edits=fix_edits)

                for i, (sources, hypothesis, reference) in enumerate(
                        zip(src_sentences, hypothesis_iter, trg_sentences)):
                    if self.ref_ext is not None and on_dev:
                        reference = reference[-1]
                    else:
                        reference = reference[0]  # single output for now

                    hypothesis, raw = hypothesis

                    hypotheses.append(hypothesis)
                    references.append(reference.strip().replace('@@ ', ''))

                    if output_file is not None:
                        if raw_output:
                            hypothesis = raw

                        output_file.write(hypothesis + '\n')
                        output_file.flush()

            finally:
                if output_file is not None:
                    output_file.close()

            if post_process_script is not None:
                data = '\n'.join(hypotheses).encode()
                data = Popen([post_process_script], stdout=PIPE,
                             stdin=PIPE).communicate(input=data)[0].decode()
                hypotheses = data.splitlines()

            # default scoring function is utils.bleu_score
            score, score_summary = getattr(evaluation, score_function)(hypotheses, references)

            # print scoring information
            score_info = [prefix, 'score={:.2f}'.format(score)]

            if score_summary:
                score_info.append(score_summary)

            if self.name is not None:
                score_info.insert(0, self.name)

            utils.log(' '.join(map(str, score_info)))
            scores.append(score)

        return scores
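
A sketch of the external post-processing hook in this variant: the joined hypotheses are piped through the script's stdin/stdout before scoring. The script path and `model` are assumptions; 'bleu_score' is the typical function named in the docstring:

    # hypothetical: detokenize hypotheses with an external script before scoring
    test_scores = model.evaluate('bleu_score', on_dev=False, output='test.hyp',
                                 post_process_script='scripts/postprocess.sh')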
Example #10
    def evaluate(self, score_functions, on_dev=True, output=None, remove_unk=False, max_dev_size=None,
                 raw_output=False, fix_edits=True, max_test_size=None, post_process_script=None,
                 unk_replace=False, **kwargs):
        """
        Decode a dev or test set, and perform evaluation with respect to gold standard, using the provided
        scoring function. If `output` is defined, also save the decoding output to this file.
        When evaluating development data (`on_dev` set to True), several dev sets can be specified (`dev_prefix` parameter
        in configuration files), and a score is computed for each of them.

        :param score_functions: list of names of the scoring functions used to score and rank models; the first
            entry provides the main score (typically 'bleu')
        :param on_dev: if True, evaluate the dev corpus, otherwise evaluate the test corpus
        :param output: save the hypotheses to this file
        :param remove_unk: remove the UNK symbols from the output
        :param max_dev_size: maximum number of lines to read from dev files
        :param max_test_size: maximum number of lines to read from test files
        :param raw_output: save raw decoder output (don't do post-processing like UNK deletion or subword
            concatenation). The evaluation is still done with the post-processed output.
        :param fix_edits: when predicting edit operations, pad shorter hypotheses with KEEP symbols.
        :return: scores of each corpus to evaluate
        """
        utils.log('starting evaluation')

        if on_dev:
            filenames = self.filenames.dev
        else:
            filenames = [self.filenames.test]

        # convert `output` into a list, for zip
        if isinstance(output, str):
            output = [output]
        elif output is None:
            output = [None] * len(filenames)

        scores = []
        utils.log('show output')
        utils.log(output)

        # evaluation on multiple corpora
        for dev_id, (filenames_, output_, prefix) in enumerate(zip(filenames, output, self.dev_prefix)):
            utils.log('filenames, output, self.dev_prefix')
            utils.log(filenames)
            utils.log(output)

            if self.dev_batches:
                dev_batches = self.dev_batches[dev_id]
                dev_loss = sum(self.seq2seq_model.step(batch, update_model=False).loss * len(batch)
                               for batch in dev_batches)
                dev_loss /= sum(map(len, dev_batches))
            else:  # TODO
                dev_loss = 0

            extensions = list(self.extensions)
            if self.ref_ext is not None:
                extensions.append(self.ref_ext)

            lines = list(utils.read_lines(filenames_, binary=self.binary))

            if on_dev and max_dev_size:
                lines = lines[:max_dev_size]
            elif not on_dev and max_test_size:
                lines = lines[:max_test_size]

            hypotheses = []
            references = []
            utils.log("making hypotheses")
            output_file = None
            try:
                if output_ is not None:
                    output_file = open(output_, 'w', encoding='utf-8')

                lines_ = list(zip(*lines))

                src_sentences = list(zip(*lines_[:len(self.src_ext)]))
                trg_sentences = list(zip(*lines_[len(self.src_ext):]))

                utils.log("making decode_batch")
                hypothesis_iter = self.decode_batch(lines, self.batch_size, remove_unk=remove_unk,
                                                    fix_edits=fix_edits, unk_replace=unk_replace)

                for i, (sources, hypothesis, reference) in enumerate(zip(src_sentences, hypothesis_iter,
                                                                         trg_sentences)):
                    if self.ref_ext is not None and on_dev:
                        reference = reference[-1]
                    else:
                        reference = reference[0]  # single output for now

                    hypothesis, raw = hypothesis
                    # hypothesis: [10items],each item is a "token sequence"
                    hypotheses.append(hypothesis)
                    references.append(reference.strip().replace('@@ ', ''))

                    if output_file is not None:
                        if raw_output:
                            hypothesis = raw
                        line = "source:\t" + str(sources) + "\nref:\t" + str(reference) + "\n"
                        for item in hypothesis:
                            line += str(item) + '\n'
                        line += "\n"
                        # line = hypothesis + '\n'
                        output_file.write(line)
                        output_file.flush()

            finally:
                if output_file is not None:
                    output_file.close()

            if post_process_script is not None:
                data = '\n'.join(hypotheses).encode()
                data = Popen([post_process_script], stdout=PIPE, stdin=PIPE).communicate(input=data)[0].decode()
                hypotheses = data.splitlines()

            scores_ = []
            summary = None

            for score_function in score_functions:
                try:
                    if score_function == 'loss':
                        score = dev_loss
                        reversed_ = True
                    else:
                        fun = getattr(evaluation, 'corpus_' + score_function)
                        try:
                            reversed_ = fun.reversed
                        except AttributeError:
                            reversed_ = False

                        func_arg = []
                        for item in hypotheses:
                            func_arg.append(item[0])
                        score, score_summary = fun(func_arg, references)
                        summary = summary or score_summary

                    scores_.append((score_function, score, reversed_))
                except:
                    pass

            score_info = ['{}={:.2f}'.format(key, value) for key, value, _ in scores_]
            score_info.insert(0, prefix)
            if summary:
                score_info.append(summary)

            if self.name is not None:
                score_info.insert(0, self.name)

            utils.log(' '.join(map(str, score_info)))

            # main score
            _, score, reversed_ = scores_[0]
            scores.append(-score if reversed_ else score)

        return scores
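
This fork writes the source, the reference and the full n-best list for every sentence to the output file, and scores with the first hypothesis of each list. A sketch with assumed names:

    # hypothetical: dump n-best lists for inspection while scoring with BLEU
    dev_scores = model.evaluate(score_functions=['bleu'], on_dev=True,
                                output='dev.nbest.txt', max_dev_size=500)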
Example #11
    def evaluate(self,
                 sess,
                 beam_size,
                 score_function,
                 on_dev=True,
                 output=None,
                 remove_unk=False,
                 auxiliary_score_function=None,
                 script_dir='scripts',
                 **kwargs):
        """
        :param score_function: name of the scoring function used to score and rank models
          (typically 'bleu_score')
        :param on_dev: if True, evaluate the dev corpus, otherwise evaluate the test corpus
        :param output: save the hypotheses to this file
        :param remove_unk: remove the UNK symbols from the output
        :param auxiliary_score_function: optional scoring function used to display a more
          detailed summary.
        :param script_dir: parameter of scoring functions
        :return: scores of each corpus to evaluate
        """
        utils.log('starting decoding')
        assert on_dev or len(self.filenames.test) == len(self.extensions)

        filenames = self.filenames.dev if on_dev else [self.filenames.test]

        # convert `output` into a list, for zip
        if isinstance(output, str):
            output = [output]
        elif output is None:
            output = [None] * len(filenames)

        scores = []

        # evaluation on multiple corpora
        for filenames_, output_ in zip(filenames, output):
            lines = list(
                utils.read_lines(filenames_, self.extensions,
                                 self.binary_input))

            hypotheses = []
            references = []

            output_file = None
            try:
                output_file = open(output_, 'w') if output_ is not None else None

                for *src_sentences, trg_sentence in lines:
                    hypotheses.append(
                        self._decode_sentence(sess, src_sentences, beam_size,
                                              remove_unk))
                    references.append(trg_sentence.strip().replace('@@ ', ''))
                    if output_file is not None:
                        output_file.write(hypotheses[-1] + '\n')
                        output_file.flush()

            finally:
                if output_file is not None:
                    output_file.close()

            # main scoring function (used to choose which checkpoints to keep)
            # default is utils.bleu_score
            score, score_summary = getattr(utils, score_function)(hypotheses,
                                                                  references,
                                                                  script_dir)

            # optionally use an auxiliary function to get different scoring information
            if auxiliary_score_function is not None and auxiliary_score_function != score_function:
                try:
                    _, score_summary = getattr(
                        utils, auxiliary_score_function)(hypotheses,
                                                         references,
                                                         script_dir)
                except:
                    pass

            # print the scoring information
            score_info = []
            if self.name is not None:
                score_info.append(self.name)
            score_info.append('score={}'.format(score))
            if score_summary:
                score_info.append(score_summary)

            utils.log(' '.join(map(str, score_info)))
            scores.append(score)

        return scores