Ejemplo n.º 1
0
    def count_lines(self):
        """Return the cached line count, computing it on first use.

        While ``self._lines_count`` is negative and ``self.langs`` is not
        empty, the count is obtained from the file of the first language in
        ``self.langs`` and stored in ``self._lines_count``. The computation
        runs while holding ``self._lock``, and the condition is evaluated
        again once the lock is held so that a value stored in the meantime
        is not recomputed.

        :return: the value of ``self._lines_count``; it is left unchanged
            when it is already non-negative or when ``self.langs`` is empty
        """
        def _needs_count():
            # True only while no count is cached and a language is available.
            return self._lines_count < 0 and 0 < len(self.langs)

        if _needs_count():
            with self._lock:
                if _needs_count():
                    first_file = self.get_file(self.langs[0])
                    self._lines_count = fileutils.linecount(first_file)

        return self._lines_count
Ejemplo n.º 2
0
    def count_lines(self):
        """Return the number of lines, counting them lazily on first request.

        The result is cached in ``self._lines_count``. A negative cached value
        together with a non-empty ``self.langs`` triggers the count, which is
        read from the file of the first language in ``self.langs``.

        :return: the value of ``self._lines_count`` after the optional update
        """
        # Fast path: a count is already cached, or there is no language to
        # count, so the lock is never touched.
        if not (self._lines_count < 0 < len(self.langs)):
            return self._lines_count

        with self._lock:
            # Evaluate the condition again now that the lock is held; the
            # value may have been stored while this call was waiting.
            still_missing = self._lines_count < 0 < len(self.langs)
            if still_missing:
                path = self.get_file(self.langs[0])
                self._lines_count = fileutils.linecount(path)

        return self._lines_count
Ejemplo n.º 3
0
    def evaluate(self, corpora, heval_output=None, debug=False):
        """Translate the corpora with every configured translator and score the results.

        Only corpora that contain both the engine's source and target language
        are used. They are encoded through ``self._xmlencoder`` and merged into
        a single source file and a single reference file inside the engine's
        ``'evaluation'`` temporary directory. Each translator in
        ``self._translators`` then translates the corpora; its output is
        encoded, merged, checked against the reference line count and scored
        with ``MatecatScore`` (stored in the ``pes`` attribute of the result)
        and ``BLEUScore`` (stored in ``bleu``).

        :param corpora: iterable of corpus objects exposing ``langs`` and
            ``get_file(lang)``
        :param heval_output: optional directory path; when not ``None`` it is
            created if needed and the reference, the source and every
            translator's merged output are written into it through
            ``self._heval_outputter``
        :param debug: when true, the ``'evaluation'`` temporary directory is
            not cleared on exit
        :return: list with one ``_EvaluationResult`` per translator; a result
            whose translation failed carries the failure in ``error`` and is
            skipped by the line-count check and by scoring
        :raises IllegalArgumentException: if no corpus contains both languages
        :raises TranslateError: if a translator's merged output does not have
            the same number of lines as the reference
        """
        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        corpora = [corpus for corpus in corpora if source_lang in corpus.langs and target_lang in corpus.langs]
        if not corpora:
            raise IllegalArgumentException('No %s > %s corpora found into specified path' % (source_lang, target_lang))

        if heval_output is not None:
            fileutils.makedirs(heval_output, exist_ok=True)

        logger = _evaluate_logger()
        logger.start(corpora)

        working_dir = self._engine.get_tempdir('evaluation')

        try:
            results = []

            # Process references
            with logger.step('Preparing corpora'):
                corpora_path = os.path.join(working_dir, 'corpora')
                corpora = self._xmlencoder.encode(corpora, corpora_path)

                reference = os.path.join(working_dir, 'reference.' + target_lang)
                source = os.path.join(working_dir, 'source.' + source_lang)
                fileutils.merge([corpus.get_file(target_lang) for corpus in corpora], reference)
                fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

                if heval_output is not None:
                    self._heval_outputter.write(lang=target_lang, input_file=reference,
                                                output_file=os.path.join(heval_output, 'reference.' + target_lang))
                    self._heval_outputter.write(lang=source_lang, input_file=source,
                                                output_file=os.path.join(heval_output, 'source.' + source_lang))

            # Translate
            for translator in self._translators:
                name = translator.name()

                with logger.step('Translating with %s' % name):
                    # The result is registered before translating so that a
                    # failing translator still appears in the returned list.
                    result = _EvaluationResult(translator)
                    results.append(result)

                    translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
                    xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
                    fileutils.makedirs(translations_path, exist_ok=True)

                    try:
                        translated, mtt, parallelism = translator.translate(corpora, translations_path)
                        filename = result.id + '.' + target_lang

                        result.mtt = mtt
                        result.parallelism = parallelism
                        result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                        result.merge = os.path.join(working_dir, filename)

                        fileutils.merge([corpus.get_file(target_lang)
                                         for corpus in result.translated_corpora], result.merge)

                        if heval_output is not None:
                            self._heval_outputter.write(lang=target_lang, input_file=result.merge,
                                                        output_file=os.path.join(heval_output, filename))
                    except TranslateError as e:
                        result.error = e
                    except Exception as e:
                        # str(e) instead of e.message: the 'message' attribute
                        # is missing on Python 3 exceptions and empty for
                        # multi-argument exceptions on Python 2, which would
                        # hide the real cause of the failure.
                        result.error = TranslateError('Unexpected ERROR: ' + str(e))

            # Check corpora length
            reference_lines = fileutils.linecount(reference)
            for result in results:
                if result.error is not None:
                    continue

                lines = fileutils.linecount(result.merge)

                if lines != reference_lines:
                    raise TranslateError('Invalid line count for translator %s: expected %d, found %d.'
                                         % (result.translator.name(), reference_lines, lines))

            # Scoring
            scorers = [(MatecatScore(), 'pes'), (BLEUScore(), 'bleu')]

            for scorer, field in scorers:
                with logger.step('Calculating %s' % scorer.name()):
                    for result in results:
                        if result.error is not None:
                            continue
                        setattr(result, field, scorer.calculate(result.merge, reference))

            logger.completed(results, scorers)

            return results
        finally:
            # Keep the temporary files only when debugging.
            if not debug:
                self._engine.clear_tempdir('evaluation')
Ejemplo n.º 4
0
    def evaluate(self, corpora, heval_output=None, debug=False):
        """Translate the corpora with every configured translator and score the results.

        The corpora are encoded through ``self._xmlencoder`` and merged into a
        single source file and a single reference file inside the engine's
        ``'evaluation'`` temporary directory. Each translator in
        ``self._translators`` then translates the corpora; its output is
        encoded, merged, checked against the reference line count and scored
        with ``MatecatScore`` (stored in the ``pes`` attribute of the result)
        and ``BLEUScore`` (stored in ``bleu``).

        :param corpora: non-empty collection of corpora; it must support
            ``len()`` and is passed to ``self._xmlencoder.encode``
        :param heval_output: optional directory path; when not ``None`` it is
            created if needed and the reference, the source and every
            translator's merged output are written into it through
            ``self._heval_outputter``
        :param debug: when true, the ``'evaluation'`` temporary directory is
            not cleared on exit
        :return: list with one ``_EvaluationResult`` per translator; a result
            whose translation failed carries the failure in ``error`` and is
            skipped by the line-count check and by scoring
        :raises IllegalArgumentException: if ``corpora`` is empty
        :raises TranslateError: if a translator's merged output does not have
            the same number of lines as the reference
        """
        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')
        if heval_output is not None:
            fileutils.makedirs(heval_output, exist_ok=True)

        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        logger = _evaluate_logger()
        logger.start(corpora)

        working_dir = self._engine.get_tempdir('evaluation')

        try:
            results = []

            # Process references
            with logger.step('Preparing corpora'):
                corpora_path = os.path.join(working_dir, 'corpora')
                corpora = self._xmlencoder.encode(corpora, corpora_path)

                reference = os.path.join(working_dir, 'reference.' + target_lang)
                source = os.path.join(working_dir, 'source.' + source_lang)
                fileutils.merge([corpus.get_file(target_lang) for corpus in corpora], reference)
                fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

                if heval_output is not None:
                    self._heval_outputter.write(lang=target_lang, input_file=reference,
                                                output_file=os.path.join(heval_output, 'reference.' + target_lang))
                    self._heval_outputter.write(lang=source_lang, input_file=source,
                                                output_file=os.path.join(heval_output, 'source.' + source_lang))

            # Translate
            for translator in self._translators:
                name = translator.name()

                with logger.step('Translating with %s' % name):
                    # The result is registered before translating so that a
                    # failing translator still appears in the returned list.
                    result = _EvaluationResult(translator)
                    results.append(result)

                    translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
                    xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
                    fileutils.makedirs(translations_path, exist_ok=True)

                    try:
                        translated, mtt, parallelism = translator.translate(corpora, translations_path)
                        filename = result.id + '.' + target_lang

                        result.mtt = mtt
                        result.parallelism = parallelism
                        result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                        result.merge = os.path.join(working_dir, filename)

                        fileutils.merge([corpus.get_file(target_lang)
                                         for corpus in result.translated_corpora], result.merge)

                        if heval_output is not None:
                            self._heval_outputter.write(lang=target_lang, input_file=result.merge,
                                                        output_file=os.path.join(heval_output, filename))
                    except TranslateError as e:
                        result.error = e
                    except Exception as e:
                        # str(e) instead of e.message: the 'message' attribute
                        # is missing on Python 3 exceptions and empty for
                        # multi-argument exceptions on Python 2, which would
                        # hide the real cause of the failure.
                        result.error = TranslateError('Unexpected ERROR: ' + str(e))

            # Check corpora length
            reference_lines = fileutils.linecount(reference)
            for result in results:
                if result.error is not None:
                    continue

                lines = fileutils.linecount(result.merge)

                if lines != reference_lines:
                    raise TranslateError('Invalid line count for translator %s: expected %d, found %d.'
                                         % (result.translator.name(), reference_lines, lines))

            # Scoring
            scorers = [(MatecatScore(), 'pes'), (BLEUScore(), 'bleu')]

            for scorer, field in scorers:
                with logger.step('Calculating %s' % scorer.name()):
                    for result in results:
                        if result.error is not None:
                            continue
                        setattr(result, field, scorer.calculate(result.merge, reference))

            logger.completed(results, scorers)

            return results
        finally:
            # Keep the temporary files only when debugging.
            if not debug:
                self._engine.clear_tempdir('evaluation')