class Evaluator:
    def __init__(self, node, google_key=None, google_nmt=False):
        self._engine = node.engine
        self._node = node
        self._heval_outputter = HumanEvaluationFileOutputter()
        self._xmlencoder = XMLEncoder()

        self._translators = [
            GoogleTranslate(self._engine.source_lang, self._engine.target_lang, key=google_key, nmt=google_nmt),
            # BingTranslator(source_lang, target_lang),
            MMTTranslator(self._node)
        ]

    def evaluate(self, corpora, heval_output=None, debug=False):
        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        corpora = [corpus for corpus in corpora if source_lang in corpus.langs and target_lang in corpus.langs]
        if len(corpora) == 0:
            raise IllegalArgumentException('No %s > %s corpora found into specified path' %
                                           (source_lang, target_lang))

        if heval_output is not None:
            fileutils.makedirs(heval_output, exist_ok=True)

        logger = _evaluate_logger()
        logger.start(corpora)

        working_dir = self._engine.get_tempdir('evaluation')

        try:
            results = []

            # Process references
            with logger.step('Preparing corpora') as _:
                corpora_path = os.path.join(working_dir, 'corpora')
                corpora = self._xmlencoder.encode(corpora, corpora_path)

                reference = os.path.join(working_dir, 'reference.' + target_lang)
                source = os.path.join(working_dir, 'source.' + source_lang)
                fileutils.merge([corpus.get_file(target_lang) for corpus in corpora], reference)
                fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

                if heval_output is not None:
                    self._heval_outputter.write(lang=target_lang, input_file=reference,
                                                output_file=os.path.join(heval_output, 'reference.' + target_lang))
                    self._heval_outputter.write(lang=source_lang, input_file=source,
                                                output_file=os.path.join(heval_output, 'source.' + source_lang))

            # Translate
            for translator in self._translators:
                name = translator.name()

                with logger.step('Translating with %s' % name) as _:
                    result = _EvaluationResult(translator)
                    results.append(result)

                    translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
                    xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
                    fileutils.makedirs(translations_path, exist_ok=True)

                    try:
                        translated, mtt, parallelism = translator.translate(corpora, translations_path)
                        filename = result.id + '.' + target_lang

                        result.mtt = mtt
                        result.parallelism = parallelism
                        result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                        result.merge = os.path.join(working_dir, filename)

                        fileutils.merge([corpus.get_file(target_lang)
                                         for corpus in result.translated_corpora], result.merge)

                        if heval_output is not None:
                            self._heval_outputter.write(lang=target_lang, input_file=result.merge,
                                                        output_file=os.path.join(heval_output, filename))
                    except TranslateError as e:
                        result.error = e
                    except Exception as e:
                        result.error = TranslateError('Unexpected ERROR: ' + str(e.message))

            # Check corpora length
            reference_lines = fileutils.linecount(reference)
            for result in results:
                if result.error is not None:
                    continue

                lines = fileutils.linecount(result.merge)
                if lines != reference_lines:
                    raise TranslateError('Invalid line count for translator %s: expected %d, found %d.'
                                         % (result.translator.name(), reference_lines, lines))

            # Scoring
            scorers = [(MatecatScore(), 'pes'), (BLEUScore(), 'bleu')]

            for scorer, field in scorers:
                with logger.step('Calculating %s' % scorer.name()) as _:
                    for result in results:
                        if result.error is not None:
                            continue
                        setattr(result, field, scorer.calculate(result.merge, reference))

            logger.completed(results, scorers)

            return results
        finally:
            if not debug:
                self._engine.clear_tempdir('evaluation')
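# --- Hedged usage sketch (illustration only, not part of the original module) ---
# Assumes a cluster `node` exposing `engine`, and an already-loaded list of
# parallel `corpora` objects implementing the `langs`/`get_file()` interface used
# above; how corpora are loaded is out of scope here. `_example_evaluate` is a
# hypothetical helper name.
def _example_evaluate(node, corpora):
    evaluator = Evaluator(node, google_key=None, google_nmt=False)
    results = evaluator.evaluate(corpora, heval_output=None, debug=False)

    for result in results:
        if result.error is not None:
            print('%s failed: %s' % (result.translator.name(), result.error))
        else:
            # 'pes' and 'bleu' are set by the scorers at the end of evaluate()
            print('%s: BLEU=%.4f PES=%.4f' % (result.translator.name(), result.bleu, result.pes))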
class BatchTranslator:
    def __init__(self, node, use_sessions=True):
        self._engine = node.engine
        self._node = node
        self._xmlencoder = XMLEncoder()
        self._translator = MMTTranslator(self._node, use_sessions)

    def translate(self, corpora, dest_path=None, debug=False):
        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')

        if dest_path:
            fileutils.makedirs(dest_path, exist_ok=True)

        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        working_dir = self._engine.get_tempdir('evaluation')
        have_references = False

        try:
            results = []

            # Process references
            corpora_path = os.path.join(working_dir, 'corpora')
            corpora = self._xmlencoder.encode(corpora, corpora_path)

            reference = os.path.join(working_dir, 'reference.' + target_lang)
            source = os.path.join(working_dir, 'source.' + source_lang)

            refs = [corpus.get_file(target_lang) for corpus in corpora if corpus.get_file(target_lang)]
            have_references = len(refs) > 0
            fileutils.merge(refs, reference)  # tolerates missing reference
            fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

            if dest_path:
                for corpus in corpora:
                    corpus.copy(dest_path, suffixes={source_lang: '.src', target_lang: '.ref', 'tmx': '.src'})

            # Translate
            translator = self._translator
            name = translator.name()

            result = _EvaluationResult(translator)
            results.append(result)

            translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
            xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
            fileutils.makedirs(translations_path, exist_ok=True)

            try:
                translated, mtt, parallelism = translator.translate(corpora, translations_path)
                filename = result.id + '.' + target_lang

                result.mtt = mtt
                result.parallelism = parallelism
                result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                result.merge = os.path.join(working_dir, filename)

                fileutils.merge([corpus.get_file(target_lang)
                                 for corpus in result.translated_corpora], result.merge)

                if dest_path:
                    for corpus in result.translated_corpora:
                        corpus.copy(dest_path, suffixes={target_lang: '.hyp', 'tmx': '.hyp'})
            except TranslateError as e:
                result.error = e
            except Exception as e:
                result.error = TranslateError('Unexpected ERROR: ' + str(e.message))

            if result.error is None:
                if have_references:
                    scorer = BLEUScore()
                    # bleu in range [0;1)
                    bleu = scorer.calculate(result.merge, reference)
                    return bleu
                else:
                    return True
            else:
                print(result.error)
                return None
        finally:
            if not debug:
                self._engine.clear_tempdir('evaluation')
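# --- Hedged usage sketch (illustration only, not part of the original module) ---
# Assumes the same `node` and `corpora` objects as above. As written,
# BatchTranslator.translate() returns a BLEU score in [0;1) when references are
# available, True when they are not, and None on error.
# `_example_batch_translate` is a hypothetical helper name.
def _example_batch_translate(node, corpora, output_dir):
    translator = BatchTranslator(node, use_sessions=True)
    outcome = translator.translate(corpora, dest_path=output_dir, debug=False)

    if outcome is None:
        print('translation failed')
    elif outcome is True:
        print('translation completed (no references, BLEU not computed)')
    else:
        print('translation completed, BLEU = %.4f' % outcome)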