def count_lines(self):
    """Return the line count stored in ``self._lines_count``.

    A negative ``self._lines_count`` is treated as "not computed yet". In
    that case, and only when ``self.langs`` is non-empty, the value is
    filled in with ``fileutils.linecount`` of the file returned by
    ``self.get_file`` for the first language in ``self.langs``.
    """

    def _still_uncounted():
        # True while the count is negative and there is at least one language.
        return self._lines_count < 0 < len(self.langs)

    if _still_uncounted():
        with self._lock:
            # Test again while holding the lock: the value may have been
            # stored between the first test and the lock acquisition.
            if _still_uncounted():
                first_lang_file = self.get_file(self.langs[0])
                self._lines_count = fileutils.linecount(first_lang_file)

    return self._lines_count
def count_lines(self):
    """Return ``self._lines_count``, computing it first if it is negative.

    The count is taken with ``fileutils.linecount`` from the file that
    ``self.get_file`` returns for ``self.langs[0]``. Nothing is computed
    when ``self.langs`` is empty; the stored value is returned unchanged.
    """
    # Fast path: a non-negative count is already stored, or there is no
    # language to count lines for.
    if not (self._lines_count < 0 and 0 < len(self.langs)):
        return self._lines_count

    with self._lock:
        # The same condition is evaluated again under the lock so the file
        # is not counted if the value was stored in the meantime.
        if self._lines_count < 0 and 0 < len(self.langs):
            path = self.get_file(self.langs[0])
            self._lines_count = fileutils.linecount(path)

    return self._lines_count
def evaluate(self, corpora, heval_output=None, debug=False):
    """Translate the corpora with every translator and score each output.

    Args:
        corpora: corpora to evaluate. Only those whose ``langs`` contain both
            the engine's source and target language are used.
        heval_output: optional directory. When not None it is created with
            ``fileutils.makedirs(..., exist_ok=True)`` and the merged source,
            the merged reference and each translator's merged output are
            written into it through ``self._heval_outputter``.
        debug: when true, the 'evaluation' temp directory is not cleared at
            the end.

    Returns:
        A list with one ``_EvaluationResult`` per entry of
        ``self._translators``. A result whose translation failed carries the
        failure in ``result.error`` and is skipped by the line-count check
        and by the scorers.

    Raises:
        IllegalArgumentException: no corpus has both languages.
        TranslateError: a translator that did not fail produced a merged
            output whose line count differs from the reference.
    """
    target_lang = self._engine.target_lang
    source_lang = self._engine.source_lang

    corpora = [corpus for corpus in corpora
               if source_lang in corpus.langs and target_lang in corpus.langs]

    if not corpora:
        raise IllegalArgumentException('No %s > %s corpora found into specified path' % (source_lang, target_lang))

    if heval_output is not None:
        fileutils.makedirs(heval_output, exist_ok=True)

    logger = _evaluate_logger()
    logger.start(corpora)

    working_dir = self._engine.get_tempdir('evaluation')

    try:
        results = []

        # Process references
        with logger.step('Preparing corpora'):
            corpora_path = os.path.join(working_dir, 'corpora')
            corpora = self._xmlencoder.encode(corpora, corpora_path)

            reference = os.path.join(working_dir, 'reference.' + target_lang)
            source = os.path.join(working_dir, 'source.' + source_lang)
            fileutils.merge([corpus.get_file(target_lang) for corpus in corpora], reference)
            fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

            if heval_output is not None:
                self._heval_outputter.write(lang=target_lang, input_file=reference,
                                            output_file=os.path.join(heval_output, 'reference.' + target_lang))
                self._heval_outputter.write(lang=source_lang, input_file=source,
                                            output_file=os.path.join(heval_output, 'source.' + source_lang))

        # Translate
        for translator in self._translators:
            name = translator.name()

            with logger.step('Translating with %s' % name):
                # The result is registered before translating so that a
                # failing translator still appears in the returned list.
                result = _EvaluationResult(translator)
                results.append(result)

                translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
                xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
                fileutils.makedirs(translations_path, exist_ok=True)

                try:
                    translated, mtt, parallelism = translator.translate(corpora, translations_path)
                    filename = result.id + '.' + target_lang

                    result.mtt = mtt
                    result.parallelism = parallelism
                    result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                    result.merge = os.path.join(working_dir, filename)

                    fileutils.merge([corpus.get_file(target_lang) for corpus in result.translated_corpora],
                                    result.merge)

                    if heval_output is not None:
                        self._heval_outputter.write(lang=target_lang, input_file=result.merge,
                                                    output_file=os.path.join(heval_output, filename))
                except TranslateError as e:
                    result.error = e
                except Exception as e:
                    # str(e) instead of e.message: the attribute is missing on
                    # Python 3 exceptions (the handler itself would raise) and
                    # is empty for multi-argument exceptions on Python 2.
                    result.error = TranslateError('Unexpected ERROR: ' + str(e))

        # Check corpora length
        reference_lines = fileutils.linecount(reference)
        for result in results:
            if result.error is not None:
                continue

            lines = fileutils.linecount(result.merge)

            if lines != reference_lines:
                raise TranslateError('Invalid line count for translator %s: expected %d, found %d.'
                                     % (result.translator.name(), reference_lines, lines))

        # Scoring
        scorers = [(MatecatScore(), 'pes'), (BLEUScore(), 'bleu')]

        for scorer, field in scorers:
            with logger.step('Calculating %s' % scorer.name()):
                for result in results:
                    if result.error is not None:
                        continue
                    # Each score is stored on the result under the attribute
                    # name paired with its scorer ('pes' or 'bleu').
                    setattr(result, field, scorer.calculate(result.merge, reference))

        logger.completed(results, scorers)

        return results
    finally:
        if not debug:
            self._engine.clear_tempdir('evaluation')
def evaluate(self, corpora, heval_output=None, debug=False):
    """Translate the corpora with every translator and score each output.

    Args:
        corpora: non-empty collection of corpora to evaluate.
            NOTE(review): the corpora are not filtered by language here, so
            each one is presumably expected to provide a file for both the
            engine's source and target language; confirm against callers.
        heval_output: optional directory. When not None it is created with
            ``fileutils.makedirs(..., exist_ok=True)`` and the merged source,
            the merged reference and each translator's merged output are
            written into it through ``self._heval_outputter``.
        debug: when true, the 'evaluation' temp directory is not cleared at
            the end.

    Returns:
        A list with one ``_EvaluationResult`` per entry of
        ``self._translators``. A result whose translation failed carries the
        failure in ``result.error`` and is skipped by the line-count check
        and by the scorers.

    Raises:
        IllegalArgumentException: ``corpora`` is empty.
        TranslateError: a translator that did not fail produced a merged
            output whose line count differs from the reference.
    """
    if len(corpora) == 0:
        raise IllegalArgumentException('empty corpora')

    if heval_output is not None:
        fileutils.makedirs(heval_output, exist_ok=True)

    target_lang = self._engine.target_lang
    source_lang = self._engine.source_lang

    logger = _evaluate_logger()
    logger.start(corpora)

    working_dir = self._engine.get_tempdir('evaluation')

    try:
        results = []

        # Process references
        with logger.step('Preparing corpora'):
            corpora_path = os.path.join(working_dir, 'corpora')
            corpora = self._xmlencoder.encode(corpora, corpora_path)

            reference = os.path.join(working_dir, 'reference.' + target_lang)
            source = os.path.join(working_dir, 'source.' + source_lang)
            fileutils.merge([corpus.get_file(target_lang) for corpus in corpora], reference)
            fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

            if heval_output is not None:
                self._heval_outputter.write(lang=target_lang, input_file=reference,
                                            output_file=os.path.join(heval_output, 'reference.' + target_lang))
                self._heval_outputter.write(lang=source_lang, input_file=source,
                                            output_file=os.path.join(heval_output, 'source.' + source_lang))

        # Translate
        for translator in self._translators:
            name = translator.name()

            with logger.step('Translating with %s' % name):
                # The result is registered before translating so that a
                # failing translator still appears in the returned list.
                result = _EvaluationResult(translator)
                results.append(result)

                translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
                xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
                fileutils.makedirs(translations_path, exist_ok=True)

                try:
                    translated, mtt, parallelism = translator.translate(corpora, translations_path)
                    filename = result.id + '.' + target_lang

                    result.mtt = mtt
                    result.parallelism = parallelism
                    result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                    result.merge = os.path.join(working_dir, filename)

                    fileutils.merge([corpus.get_file(target_lang) for corpus in result.translated_corpora],
                                    result.merge)

                    if heval_output is not None:
                        self._heval_outputter.write(lang=target_lang, input_file=result.merge,
                                                    output_file=os.path.join(heval_output, filename))
                except TranslateError as e:
                    result.error = e
                except Exception as e:
                    # str(e) instead of e.message: the attribute is missing on
                    # Python 3 exceptions (the handler itself would raise) and
                    # is empty for multi-argument exceptions on Python 2.
                    result.error = TranslateError('Unexpected ERROR: ' + str(e))

        # Check corpora length
        reference_lines = fileutils.linecount(reference)
        for result in results:
            if result.error is not None:
                continue

            lines = fileutils.linecount(result.merge)

            if lines != reference_lines:
                raise TranslateError('Invalid line count for translator %s: expected %d, found %d.'
                                     % (result.translator.name(), reference_lines, lines))

        # Scoring
        scorers = [(MatecatScore(), 'pes'), (BLEUScore(), 'bleu')]

        for scorer, field in scorers:
            with logger.step('Calculating %s' % scorer.name()):
                for result in results:
                    if result.error is not None:
                        continue
                    # Each score is stored on the result under the attribute
                    # name paired with its scorer ('pes' or 'bleu').
                    setattr(result, field, scorer.calculate(result.merge, reference))

        logger.completed(results, scorers)

        return results
    finally:
        if not debug:
            self._engine.clear_tempdir('evaluation')