예제 #1
0
파일: evaluation.py 프로젝트: ModernMT/MMT
class Evaluator:
    """Translate corpora with every configured translator and score the results.

    The translators compared are Google Translate and the MMT engine reached
    through ``node``; each one produces an ``_EvaluationResult``.
    """

    def __init__(self, node, google_key=None, use_sessions=True):
        """Build the evaluator for the engine attached to ``node``.

        Args:
            node: Object exposing an ``engine`` attribute; the engine supplies
                ``source_lang``, ``target_lang`` and the temp-dir helpers.
            google_key: Passed to ``GoogleTranslate`` as ``key``.
            use_sessions: Forwarded as second argument to ``MMTTranslator``.
        """
        self._engine = node.engine
        self._node = node

        self._heval_outputter = HumanEvaluationFileOutputter()
        self._xmlencoder = XMLEncoder()
        self._translators = [
            GoogleTranslate(self._engine.source_lang, self._engine.target_lang, key=google_key),
            # BingTranslator(source_lang, target_lang),
            MMTTranslator(self._node, use_sessions)
        ]

    def evaluate(self, corpora, heval_output=None, debug=False):
        """Translate ``corpora`` with each translator and compute the scores.

        Args:
            corpora: Non-empty sequence of corpus objects exposing
                ``get_file(lang)``.
            heval_output: Optional directory. When given it is created and the
                merged source, reference and every merged translation are
                written there through ``HumanEvaluationFileOutputter``.
            debug: When true, the 'evaluation' temp dir is kept after the run.

        Returns:
            The list of ``_EvaluationResult`` objects, one per translator, in
            translator order. A failed translator has ``error`` set and is
            skipped by the length check and by scoring; the others carry
            ``mtt``, ``parallelism``, ``translated_corpora``, ``merge`` and the
            ``pes`` / ``bleu`` score fields.

        Raises:
            IllegalArgumentException: If ``corpora`` is empty.
            TranslateError: If a merged translation does not have the same
                number of lines as the merged reference.
        """
        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')
        if heval_output is not None:
            fileutils.makedirs(heval_output, exist_ok=True)

        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        logger = _evaluate_logger()
        logger.start(corpora)

        working_dir = self._engine.get_tempdir('evaluation')

        try:
            results = []

            # Process references: XML-encode the corpora, then merge all source
            # files and all target files into one file each.
            with logger.step('Preparing corpora'):
                corpora_path = os.path.join(working_dir, 'corpora')
                corpora = self._xmlencoder.encode(corpora, corpora_path)

                reference = os.path.join(working_dir, 'reference.' + target_lang)
                source = os.path.join(working_dir, 'source.' + source_lang)
                fileutils.merge([corpus.get_file(target_lang) for corpus in corpora], reference)
                fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

                if heval_output is not None:
                    self._heval_outputter.write(lang=target_lang, input_file=reference,
                                                output_file=os.path.join(heval_output, 'reference.' + target_lang))
                    self._heval_outputter.write(lang=source_lang, input_file=source,
                                                output_file=os.path.join(heval_output, 'source.' + source_lang))

            # Translate
            for translator in self._translators:
                name = translator.name()

                with logger.step('Translating with %s' % name):
                    # Registered before translating so that a failing
                    # translator still shows up in the returned results.
                    result = _EvaluationResult(translator)
                    results.append(result)

                    translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
                    xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
                    fileutils.makedirs(translations_path, exist_ok=True)

                    try:
                        translated, mtt, parallelism = translator.translate(corpora, translations_path)
                        filename = result.id + '.' + target_lang

                        result.mtt = mtt
                        result.parallelism = parallelism
                        result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                        result.merge = os.path.join(working_dir, filename)

                        fileutils.merge([corpus.get_file(target_lang)
                                         for corpus in result.translated_corpora], result.merge)

                        if heval_output is not None:
                            self._heval_outputter.write(lang=target_lang, input_file=result.merge,
                                                        output_file=os.path.join(heval_output, filename))
                    except TranslateError as e:
                        result.error = e
                    except Exception as e:
                        # str(e) instead of e.message: the latter is empty for
                        # multi-argument exceptions and missing on Python 3,
                        # where reading it would raise inside this handler.
                        result.error = TranslateError('Unexpected ERROR: ' + (str(e) or e.__class__.__name__))

            # Check corpora length
            reference_lines = fileutils.linecount(reference)
            for result in results:
                if result.error is not None:
                    continue

                lines = fileutils.linecount(result.merge)

                if lines != reference_lines:
                    raise TranslateError('Invalid line count for translator %s: expected %d, found %d.'
                                         % (result.translator.name(), reference_lines, lines))

            # Scoring: each scorer is paired with the result attribute that
            # receives its value.
            scorers = [(MatecatScore(), 'pes'), (BLEUScore(), 'bleu')]

            for scorer, field in scorers:
                with logger.step('Calculating %s' % scorer.name()):
                    for result in results:
                        if result.error is not None:
                            continue
                        setattr(result, field, scorer.calculate(result.merge, reference))

            logger.completed(results, scorers)

            return results
        finally:
            # Keep the working files only when debugging.
            if not debug:
                self._engine.clear_tempdir('evaluation')
예제 #2
0
class Evaluator:
    """Translate corpora with every configured translator and score the results.

    The translators compared are Google Translate and the MMT engine reached
    through ``node``; each one produces an ``_EvaluationResult``.
    """

    def __init__(self, node, google_key=None, google_nmt=False):
        """Build the evaluator for the engine attached to ``node``.

        Args:
            node: Object exposing an ``engine`` attribute; the engine supplies
                ``source_lang``, ``target_lang`` and the temp-dir helpers.
            google_key: Passed to ``GoogleTranslate`` as ``key``.
            google_nmt: Passed to ``GoogleTranslate`` as ``nmt``.
        """
        self._engine = node.engine
        self._node = node

        self._heval_outputter = HumanEvaluationFileOutputter()
        self._xmlencoder = XMLEncoder()
        self._translators = [
            GoogleTranslate(self._engine.source_lang, self._engine.target_lang, key=google_key, nmt=google_nmt),
            # BingTranslator(source_lang, target_lang),
            MMTTranslator(self._node)
        ]

    def evaluate(self, corpora, heval_output=None, debug=False):
        """Translate the matching ``corpora`` with each translator and score them.

        Only corpora whose ``langs`` contain both the engine's source and
        target language take part in the evaluation.

        Args:
            corpora: Iterable of corpus objects exposing ``langs`` and
                ``get_file(lang)``.
            heval_output: Optional directory. When given it is created and the
                merged source, reference and every merged translation are
                written there through ``HumanEvaluationFileOutputter``.
            debug: When true, the 'evaluation' temp dir is kept after the run.

        Returns:
            The list of ``_EvaluationResult`` objects, one per translator, in
            translator order. A failed translator has ``error`` set and is
            skipped by the length check and by scoring; the others carry
            ``mtt``, ``parallelism``, ``translated_corpora``, ``merge`` and the
            ``pes`` / ``bleu`` score fields.

        Raises:
            IllegalArgumentException: If no corpus covers the language pair.
            TranslateError: If a merged translation does not have the same
                number of lines as the merged reference.
        """
        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        corpora = [corpus for corpus in corpora if source_lang in corpus.langs and target_lang in corpus.langs]
        if len(corpora) == 0:
            raise IllegalArgumentException('No %s > %s corpora found into specified path' % (source_lang, target_lang))

        if heval_output is not None:
            fileutils.makedirs(heval_output, exist_ok=True)

        logger = _evaluate_logger()
        logger.start(corpora)

        working_dir = self._engine.get_tempdir('evaluation')

        try:
            results = []

            # Process references: XML-encode the corpora, then merge all source
            # files and all target files into one file each.
            with logger.step('Preparing corpora'):
                corpora_path = os.path.join(working_dir, 'corpora')
                corpora = self._xmlencoder.encode(corpora, corpora_path)

                reference = os.path.join(working_dir, 'reference.' + target_lang)
                source = os.path.join(working_dir, 'source.' + source_lang)
                fileutils.merge([corpus.get_file(target_lang) for corpus in corpora], reference)
                fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

                if heval_output is not None:
                    self._heval_outputter.write(lang=target_lang, input_file=reference,
                                                output_file=os.path.join(heval_output, 'reference.' + target_lang))
                    self._heval_outputter.write(lang=source_lang, input_file=source,
                                                output_file=os.path.join(heval_output, 'source.' + source_lang))

            # Translate
            for translator in self._translators:
                name = translator.name()

                with logger.step('Translating with %s' % name):
                    # Registered before translating so that a failing
                    # translator still shows up in the returned results.
                    result = _EvaluationResult(translator)
                    results.append(result)

                    translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
                    xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
                    fileutils.makedirs(translations_path, exist_ok=True)

                    try:
                        translated, mtt, parallelism = translator.translate(corpora, translations_path)
                        filename = result.id + '.' + target_lang

                        result.mtt = mtt
                        result.parallelism = parallelism
                        result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                        result.merge = os.path.join(working_dir, filename)

                        fileutils.merge([corpus.get_file(target_lang)
                                         for corpus in result.translated_corpora], result.merge)

                        if heval_output is not None:
                            self._heval_outputter.write(lang=target_lang, input_file=result.merge,
                                                        output_file=os.path.join(heval_output, filename))
                    except TranslateError as e:
                        result.error = e
                    except Exception as e:
                        # str(e) instead of e.message: the latter is empty for
                        # multi-argument exceptions and missing on Python 3,
                        # where reading it would raise inside this handler.
                        result.error = TranslateError('Unexpected ERROR: ' + (str(e) or e.__class__.__name__))

            # Check corpora length
            reference_lines = fileutils.linecount(reference)
            for result in results:
                if result.error is not None:
                    continue

                lines = fileutils.linecount(result.merge)

                if lines != reference_lines:
                    raise TranslateError('Invalid line count for translator %s: expected %d, found %d.'
                                         % (result.translator.name(), reference_lines, lines))

            # Scoring: each scorer is paired with the result attribute that
            # receives its value.
            scorers = [(MatecatScore(), 'pes'), (BLEUScore(), 'bleu')]

            for scorer, field in scorers:
                with logger.step('Calculating %s' % scorer.name()):
                    for result in results:
                        if result.error is not None:
                            continue
                        setattr(result, field, scorer.calculate(result.merge, reference))

            logger.completed(results, scorers)

            return results
        finally:
            # Keep the working files only when debugging.
            if not debug:
                self._engine.clear_tempdir('evaluation')
예제 #3
0
class BatchTranslator:
    """Translate corpora with the MMT engine and, when possible, score them."""

    def __init__(self, node, use_sessions=True):
        """Build the translator for the engine attached to ``node``.

        Args:
            node: Object exposing an ``engine`` attribute; the engine supplies
                ``source_lang``, ``target_lang`` and the temp-dir helpers.
            use_sessions: Forwarded as second argument to ``MMTTranslator``.
        """
        self._engine = node.engine
        self._node = node

        self._xmlencoder = XMLEncoder()
        self._translator = MMTTranslator(self._node, use_sessions)

    def translate(self, corpora, dest_path=None, debug=False):
        """Translate ``corpora`` and return the BLEU score when references exist.

        Args:
            corpora: Non-empty sequence of corpus objects exposing
                ``get_file(lang)`` and ``copy(path, suffixes=...)``.
            dest_path: Optional directory. When given it is created and
                receives copies of the input corpora ('.src' / '.ref'
                suffixes) and of the translated corpora ('.hyp' suffix).
            debug: When true, the 'evaluation' temp dir is kept after the run.

        Returns:
            The value of ``BLEUScore().calculate(...)`` when at least one
            corpus has a target-language file, ``True`` when the translation
            succeeded but there is nothing to score against, and ``None`` when
            the translation failed (the error is printed). Callers should test
            the result with ``is None`` rather than by truthiness, since a
            score of zero is falsy.

        Raises:
            IllegalArgumentException: If ``corpora`` is empty.
        """
        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')

        if dest_path:
            fileutils.makedirs(dest_path, exist_ok=True)

        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        working_dir = self._engine.get_tempdir('evaluation')

        try:
            # Process references
            corpora_path = os.path.join(working_dir, 'corpora')
            corpora = self._xmlencoder.encode(corpora, corpora_path)

            reference = os.path.join(working_dir, 'reference.' + target_lang)
            source = os.path.join(working_dir, 'source.' + source_lang)

            # Keep only the corpora that actually have a target-side file;
            # get_file is called once per corpus.
            target_files = (corpus.get_file(target_lang) for corpus in corpora)
            refs = [ref for ref in target_files if ref]
            have_references = len(refs) > 0
            fileutils.merge(refs, reference)  # tolerates missing reference
            fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

            if dest_path:
                for corpus in corpora:
                    corpus.copy(dest_path, suffixes={source_lang: '.src', target_lang: '.ref', 'tmx': '.src'})

            # Translate
            translator = self._translator
            result = _EvaluationResult(translator)

            translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
            xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
            fileutils.makedirs(translations_path, exist_ok=True)

            try:
                translated, mtt, parallelism = translator.translate(corpora, translations_path)
                filename = result.id + '.' + target_lang

                result.mtt = mtt
                result.parallelism = parallelism
                result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                result.merge = os.path.join(working_dir, filename)

                fileutils.merge([corpus.get_file(target_lang)
                                 for corpus in result.translated_corpora], result.merge)

                if dest_path:
                    for corpus in result.translated_corpora:
                        corpus.copy(dest_path, suffixes={target_lang: '.hyp', 'tmx': '.hyp'})

            except TranslateError as e:
                result.error = e
            except Exception as e:
                # str(e) instead of e.message: the latter is empty for
                # multi-argument exceptions and missing on Python 3, where
                # reading it would raise inside this handler.
                result.error = TranslateError('Unexpected ERROR: ' + (str(e) or e.__class__.__name__))

            if result.error is not None:
                print(result.error)
                return None

            if not have_references:
                return True

            # bleu in range [0;1)
            return BLEUScore().calculate(result.merge, reference)
        finally:
            # Keep the working files only when debugging.
            if not debug:
                self._engine.clear_tempdir('evaluation')