Example #1
    def load(name):
        if os.sep in name:
            raise IllegalArgumentException('Invalid engine name: "%s"' % name)

        config_path = Engine._get_config_path(name)

        if not os.path.isfile(config_path):
            raise IllegalArgumentException("Engine '%s' not found" % name)

        # parse the source language and target language from the configuration file
        def _get_child(root, child_name):
            elements = root.getElementsByTagName(child_name)
            return elements[0] if len(elements) > 0 else None

        languages = []

        config_root = minidom.parse(config_path).documentElement
        engine_el = _get_child(config_root, 'engine')
        lang_el = _get_child(engine_el, 'languages')

        if lang_el is not None:
            for pair_el in lang_el.getElementsByTagName('pair'):
                source_lang = pair_el.getAttribute('source')
                target_lang = pair_el.getAttribute('target')
                languages.append((source_lang, target_lang))
        else:
            source_lang = engine_el.getAttribute('source-language')
            target_lang = engine_el.getAttribute('target-language')
            languages.append((source_lang, target_lang))

        return Engine(name, languages)
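For reference, the loader above accepts two configuration layouts: a 'languages' element containing one or more 'pair' children, or legacy 'source-language'/'target-language' attributes on the 'engine' element. The following standalone sketch exercises the same parsing logic on made-up XML strings (the config content is illustrative and not taken from a real ModernMT engine):

    # Minimal sketch of the two config layouts handled by load(); the XML
    # strings below are illustrative only.
    from xml.dom import minidom

    MULTI_PAIR_CONFIG = """<node><engine>
        <languages>
            <pair source="en" target="it"/>
            <pair source="en" target="fr"/>
        </languages>
    </engine></node>"""

    LEGACY_CONFIG = '<node><engine source-language="en" target-language="it"/></node>'

    def parse_languages(xml_text):
        root = minidom.parseString(xml_text).documentElement
        engine_el = root.getElementsByTagName('engine')[0]
        lang_els = engine_el.getElementsByTagName('languages')

        if lang_els:
            return [(pair.getAttribute('source'), pair.getAttribute('target'))
                    for pair in lang_els[0].getElementsByTagName('pair')]
        return [(engine_el.getAttribute('source-language'),
                 engine_el.getAttribute('target-language'))]

    print(parse_languages(MULTI_PAIR_CONFIG))  # e.g. [('en', 'it'), ('en', 'fr')]
    print(parse_languages(LEGACY_CONFIG))      # e.g. [('en', 'it')]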
Example #2
    def get_memory_id_by_name(self, name):
        try:
            return int(name)
        except ValueError:
            memories = self.api.get_all_memories()
            ids = [m['id'] for m in memories if m['name'] == name]

            if len(ids) == 0:
                raise IllegalArgumentException('unable to find memory "%s"' % name)
            elif len(ids) > 1:
                raise IllegalArgumentException(
                    'ambiguous memory name "%s", choose one of the following ids: %s' % (name, str(ids)))
            else:
                return ids[0]
Example #3
    def __init__(self,
                 name,
                 source_lang,
                 target_lang,
                 roots,
                 debug=False,
                 steps=None,
                 split_trainingset=True):

        # the builder already prepares a basic, "empty" engine
        self._engine = MMTEngine(name, source_lang, target_lang)
        self._roots = roots
        self._debug = debug
        self._split_trainingset = split_trainingset

        self._temp_dir = None
        self._checkpoint_path = None

        self._scheduled_steps = None
        self._passed_steps = None

        # if no steps are passed, all training steps must be performed.
        # else, only perform the passed training steps (if they are all legal)
        if steps is None:
            self._scheduled_steps = self.DEFAULT_TRAINING_STEPS
        else:
            unknown_steps = [
                step for step in steps
                if step not in self.DEFAULT_TRAINING_STEPS
            ]
            if len(unknown_steps) > 0:
                raise IllegalArgumentException('Unknown training steps: ' +
                                               str(unknown_steps))
            self._scheduled_steps = steps
        print
Example #4
    def load(name):
        # determine the configuration file path from the engine name
        config_path = Engine._get_config_path(name)

        if not os.path.isfile(config_path):
            raise IllegalArgumentException("Engine '%s' not found" % name)

        # parse the source language and target language from the configuration file
        engine_el = minidom.parse(
            config_path).documentElement.getElementsByTagName("engine")[0]
        engine_type = engine_el.getAttribute('type')
        source_lang = engine_el.getAttribute('source-language')
        target_lang = engine_el.getAttribute('target-language')

        # create and return a new engine of the appropriate type with that name, source language and target language

        if engine_type == 'neural':
            from cli.mmt.neural import NeuralEngine
            return NeuralEngine(name,
                                source_lang,
                                target_lang,
                                bpe_symbols=None)
        else:
            from cli.mmt.phrasebased import PhraseBasedEngine
            return PhraseBasedEngine(name, source_lang, target_lang)
Example #5
    def import_corpus(self,
                      domain_id,
                      corpus,
                      callback=None,
                      refresh_rate_in_seconds=1):
        if type(corpus) == TMXCorpus:
            job = self.api.import_into_domain(domain_id, tmx=corpus.get_tmx())
        elif type(corpus) == FileParallelCorpus:
            source_file = corpus.get_file(self.engine.source_lang)
            target_file = corpus.get_file(self.engine.target_lang)
            job = self.api.import_into_domain(domain_id,
                                              source_file=source_file,
                                              target_file=target_file)
        else:
            raise IllegalArgumentException('Invalid corpus type: ' +
                                           str(type(corpus)))

        if callback is not None:
            callback(job)

        while job['progress'] != 1.0:
            time.sleep(refresh_rate_in_seconds)
            job = self.api.get_import_job(job['id'])

            if callback is not None:
                callback(job)
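The callback argument above is invoked once per poll with the job dict returned by the API; only its 'id' and 'progress' fields are used by the loop. A minimal sketch of a progress-printing callback one could pass in (the caller in the last comment is hypothetical):

    import sys

    def print_progress(job):
        # 'progress' is assumed to be a float in [0, 1], as in the polling loop above
        sys.stdout.write('\rimport job %s: %.0f%%' % (job['id'], job['progress'] * 100))
        sys.stdout.flush()

    # hypothetical usage: cli.import_corpus(domain_id, corpus, callback=print_progress)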
Example #6
    def nmt_tune(self,
                 corpora,
                 debug=False,
                 listener=None,
                 max_lines=None,
                 lr_delta=0.1,
                 max_epochs=10,
                 gpus=None):
        target_lang = self.engine.target_lang
        source_lang = self.engine.source_lang

        corpora = [
            corpus for corpus in corpora
            if source_lang in corpus.langs and target_lang in corpus.langs
        ]
        if len(corpora) == 0:
            raise IllegalArgumentException(
                'No %s > %s corpora found in the specified path' %
                (source_lang, target_lang))

        if listener is None:
            listener = self.TuneListener()

        listener.on_tuning_begin(corpora, self, 2)

        working_dir = self.engine.get_tempdir('tuning')
        log_file = self.engine.get_logfile('nmt_tune')

        try:
            content = []

            with listener.step('Corpora pre-processing'):
                validation_corpora_path = os.path.join(working_dir,
                                                       'valid_set')
                validation_corpora, _ = self.engine.training_preprocessor.process(
                    corpora, validation_corpora_path)

                for corpus in validation_corpora:
                    with corpus.reader([source_lang, target_lang]) as reader:
                        for source, target in reader:
                            content.append((source.strip(), target.strip()))

                if 0 < max_lines < len(content):
                    random.shuffle(content)
                    content = content[:max_lines]

            with listener.step('Tuning'):
                bleu_score = self.engine.tune(content,
                                              working_dir,
                                              lr_delta=lr_delta,
                                              max_epochs=max_epochs,
                                              log_file=log_file,
                                              gpus=gpus)

            listener.on_tuning_end(self, bleu_score)
        finally:
            if not debug:
                self.engine.clear_tempdir("tuning")
Example #7
    def rename_domain(self, domain, name):
        try:
            domain = int(domain)
        except ValueError:
            domains = self.api.get_all_domains()
            ids = [d['id'] for d in domains if d['name'] == domain]

            if len(ids) == 0:
                raise IllegalArgumentException('unable to find domain "' +
                                               domain + '"')
            elif len(ids) > 1:
                raise IllegalArgumentException(
                    'ambiguous domain name "' + domain +
                    '", choose one of the following ids: ' + str(ids))
            else:
                domain = ids[0]

        return self.api.rename_domain(domain, name)
Example #8
    def __parse_context_map(text):
        context = []

        try:
            for score in text.split(','):
                name, value = score.split(':', 2)
                value = float(value)

                context.append({'id': name, 'score': value})
        except ValueError:
            raise IllegalArgumentException('invalid context weights map: ' +
                                           text)

        return context
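The text expected by this parser is a comma-separated list of id:score entries. A standalone sketch of the same parsing on an illustrative input, using maxsplit 1 and a plain ValueError in place of the project's IllegalArgumentException:

    def parse_context_map(text):
        context = []
        for entry in text.split(','):
            name, value = entry.split(':', 1)  # split on the first ':' only
            context.append({'id': name, 'score': float(value)})
        return context

    print(parse_context_map('europarl:0.75,jrc:0.25'))
    # e.g. [{'id': 'europarl', 'score': 0.75}, {'id': 'jrc', 'score': 0.25}]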
Example #9
        def __init__(self, plan, filtered_steps=None):
            self._plan = plan
            self._passed_steps = []

            all_steps = self.all_steps()

            if filtered_steps is not None:
                self._scheduled_steps = filtered_steps

                unknown_steps = [step for step in self._scheduled_steps if step not in all_steps]
                if len(unknown_steps) > 0:
                    raise IllegalArgumentException('Unknown training steps: ' + str(unknown_steps))
            else:
                self._scheduled_steps = all_steps
Example #10
    def __parse_context_vector(text):
        context = []

        try:
            for score in text.split(','):
                id, value = score.split(':', 2)
                value = float(value)

                context.append({'domain': int(id), 'score': value})
        except ValueError:
            raise IllegalArgumentException('invalid context weights map: ' +
                                           text)

        return context
Example #11
    def load(name):
        # determine the configuration file path from the engine name
        config_path = MMTEngine._get_config_path(name)

        if not os.path.isfile(config_path):
            raise IllegalArgumentException("Engine '%s' not found" % name)

        # parse the source language and target language from the configuration file
        engine_el = minidom.parse(
            config_path).documentElement.getElementsByTagName("engine")[0]
        source_lang = engine_el.getAttribute('source-language')
        target_lang = engine_el.getAttribute('target-language')

        # create and return a new engine with that name, source language and target language
        return MMTEngine(name, source_lang, target_lang)
Example #12
    def create_domain(self,
                      tmx=None,
                      source_file=None,
                      target_file=None,
                      name=None):
        if source_file is not None and target_file is not None:
            params = {
                'source_local_file': source_file,
                'target_local_file': target_file
            }
        elif tmx is not None:
            params = {'tmx_local_file': tmx}
        else:
            raise IllegalArgumentException('missing corpus for domain')

        if name is not None:
            params['name'] = name

        return self._post('domains', params=params)
Example #13
    def build(self, roots, debug=False, steps=None, split_trainingset=True):
        self._temp_dir = self._engine.get_tempdir('training', ensure=True)

        source_lang = self._engine.source_lang
        target_lang = self._engine.target_lang

        bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(
            source_lang, target_lang, roots=roots)

        if len(bilingual_corpora) == 0:
            raise IllegalArgumentException(
                'your project does not include %s-%s data.' %
                (source_lang.upper(), target_lang.upper()))

        if steps is None:
            steps = self._engine.training_steps
        else:
            unknown_steps = [
                step for step in steps
                if step not in self._engine.training_steps
            ]
            if len(unknown_steps) > 0:
                raise IllegalArgumentException('Unknown training steps: ' +
                                               str(unknown_steps))

        shutil.rmtree(self._engine.path, ignore_errors=True)
        os.makedirs(self._engine.path)

        # Check disk space constraints
        free_space_on_disk = fileutils.df(self._engine.path)[2]
        corpus_size_on_disk = 0
        for root in roots:
            corpus_size_on_disk += fileutils.du(root)
        free_memory = fileutils.free()

        recommended_mem = self.__GB * corpus_size_on_disk / (
            350 * self.__MB)  # 1G RAM every 350M on disk
        recommended_disk = 10 * corpus_size_on_disk

        if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
            if free_memory < recommended_mem:
                print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                      (recommended_mem / self.__GB, free_memory / self.__GB)
            if free_space_on_disk < recommended_disk:
                print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                      (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
            print

        logger = _builder_logger(
            len(steps) + 1, self._engine.get_logfile('training'))

        try:
            logger.start(self._engine, bilingual_corpora, monolingual_corpora)

            unprocessed_bicorpora = bilingual_corpora
            unprocessed_monocorpora = monolingual_corpora

            # TM draft-translations cleanup
            if 'tm_cleanup' in steps:
                with logger.step('TMs clean-up') as _:
                    unprocessed_bicorpora = self._engine.cleaner.clean(
                        unprocessed_bicorpora,
                        self._get_tempdir('clean_tms'),
                        log=logger.stream)

            cleaned_bicorpora = unprocessed_bicorpora
            processed_bicorpora = unprocessed_bicorpora
            processed_monocorpora = unprocessed_monocorpora

            # Preprocessing
            if 'preprocess' in steps:
                with logger.step('Corpora preprocessing') as _:
                    unprocessed_bicorpora, unprocessed_monocorpora = self._engine.db.generate(
                        unprocessed_bicorpora,
                        unprocessed_monocorpora,
                        self._get_tempdir('training_corpora'),
                        log=logger.stream)

                    processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                        unprocessed_bicorpora + unprocessed_monocorpora,
                        self._get_tempdir('preprocessed'),
                        (self._engine.data_path
                         if split_trainingset else None),
                        log=logger.stream)

                    cleaned_bicorpora = self._engine.training_preprocessor.clean(
                        processed_bicorpora,
                        self._get_tempdir('clean_corpora'))

            # Training Context Analyzer
            if 'context_analyzer' in steps:
                with logger.step('Context Analyzer training') as _:
                    self._engine.analyzer.create_index(unprocessed_bicorpora,
                                                       log=logger.stream)

            # Aligner
            if 'aligner' in steps:
                with logger.step('Aligner training') as _:
                    working_dir = self._get_tempdir('aligner')
                    self._engine.aligner.build(cleaned_bicorpora,
                                               working_dir,
                                               log=logger.stream)

            # Training Translation Model
            if 'tm' in steps:
                with logger.step('Translation Model training') as _:
                    working_dir = self._get_tempdir('tm')
                    self._engine.pt.train(cleaned_bicorpora,
                                          self._engine.aligner,
                                          working_dir,
                                          log=logger.stream)

            # Training Adaptive Language Model
            if 'lm' in steps:
                with logger.step('Language Model training') as _:
                    working_dir = self._get_tempdir('lm')
                    self._engine.lm.train(processed_bicorpora +
                                          processed_monocorpora,
                                          target_lang,
                                          working_dir,
                                          log=logger.stream)

            # Writing config file
            with logger.step('Writing config files') as _:
                self._engine.write_configs()

            logger.completed()
        except:
            logger.error()
            raise
        finally:
            logger.close()
            if not debug:
                self._engine.clear_tempdir('training')
Example #14
    def tune(self,
             corpora=None,
             debug=False,
             context_enabled=True,
             random_seeds=False,
             max_iterations=25,
             early_stopping_value=None):
        if corpora is None:
            corpora = BilingualCorpus.list(
                os.path.join(self.engine.data_path,
                             TrainingPreprocessor.DEV_FOLDER_NAME))

        target_lang = self.engine.target_lang
        source_lang = self.engine.source_lang

        corpora = [
            corpus for corpus in corpora
            if source_lang in corpus.langs and target_lang in corpus.langs
        ]
        if len(corpora) == 0:
            raise IllegalArgumentException(
                'No %s > %s corpora found in the specified path' %
                (source_lang, target_lang))

        source_corpora = [
            BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(),
                                          [source_lang]) for corpus in corpora
        ]
        reference_corpora = [
            BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(),
                                          [target_lang]) for corpus in corpora
        ]

        cmdlogger = _tuning_logger(4)
        cmdlogger.start(self, corpora)

        working_dir = self.engine.get_tempdir('tuning')
        mert_wd = os.path.join(working_dir, 'mert')

        try:
            # Tokenization
            tokenizer = Tokenizer(target_lang)
            tokenized_output = os.path.join(working_dir, 'reference_corpora')
            fileutils.makedirs(tokenized_output, exist_ok=True)

            with cmdlogger.step('Corpora tokenization') as _:
                reference_corpora = tokenizer.process_corpora(
                    reference_corpora, tokenized_output)

            # Create merged corpus
            with cmdlogger.step('Merging corpus') as _:
                # source
                source_merged_corpus = os.path.join(working_dir,
                                                    'corpus.' + source_lang)

                with open(source_merged_corpus, 'wb') as out:
                    for corpus in source_corpora:
                        out.write(corpus.get_file(source_lang) + '\n')

                # target
                target_merged_corpus = os.path.join(working_dir,
                                                    'corpus.' + target_lang)
                fileutils.merge([
                    corpus.get_file(target_lang)
                    for corpus in reference_corpora
                ], target_merged_corpus)

            # Run MERT algorithm
            with cmdlogger.step('Tuning') as _:
                # Start MERT
                decoder_flags = ['--port', str(self.api.port)]

                if self.api.root is not None:
                    decoder_flags += ['--root', self.api.root]

                if not context_enabled:
                    decoder_flags.append('--skip-context-analysis')
                    decoder_flags.append('1')

                fileutils.makedirs(mert_wd, exist_ok=True)

                with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                    command = [
                        self._mert_script, source_merged_corpus,
                        target_merged_corpus, self._mert_i_script,
                        runtime_moses_ini.name, '--threads',
                        str(multiprocessing.cpu_count()), '--mertdir',
                        cli.BIN_DIR, '--mertargs',
                        '\'--binary --sctype BLEU\'', '--working-dir', mert_wd,
                        '--nbest', '100', '--decoder-flags',
                        '"' + ' '.join(decoder_flags) + '"', '--nonorm',
                        '--closest', '--no-filter-phrase-table'
                    ]

                    if early_stopping_value is not None:
                        command += [
                            '--bleuscorer', self._scorer_script,
                            '--bleuscorer-flags "-nt" --early-stopping-value %d'
                            % early_stopping_value
                        ]

                    if not random_seeds:
                        command.append('--predictable-seeds')
                    if max_iterations > 0:
                        command.append('--maximum-iterations={num}'.format(
                            num=max_iterations))

                    with open(self.engine.get_logfile('mert'), 'wb') as log:
                        shell.execute(' '.join(command),
                                      stdout=log,
                                      stderr=log)

            # Read optimized configuration
            with cmdlogger.step('Applying changes') as _:
                bleu_score = 0
                weights = {}
                found_weights = False

                with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                    for line in moses_ini:
                        line = line.strip()

                        if len(line) == 0:
                            continue
                        elif found_weights:
                            tokens = line.split()
                            weights[tokens[0].rstrip('=')] = [
                                float(val) for val in tokens[1:]
                            ]
                        elif line.startswith('# BLEU'):
                            bleu_score = float(line.split()[2])
                        elif line == '[weight]':
                            found_weights = True

                _ = self.api.update_features(weights)

            cmdlogger.completed(bleu_score)
        finally:
            if not debug:
                self.engine.clear_tempdir("tuning")
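The 'Applying changes' step above extracts the BLEU score and the feature weights from the MERT-generated moses.ini. The following standalone sketch runs the same parsing state machine on a made-up moses.ini excerpt:

    # Made-up moses.ini excerpt, parsed exactly as in the 'Applying changes' step.
    SAMPLE_MOSES_INI = """# BLEU 0.4312 on dev set
    [weight]
    LM0= 0.5 0.2
    TM0= 0.1 0.3 0.4
    """

    bleu_score = 0
    weights = {}
    found_weights = False

    for line in SAMPLE_MOSES_INI.splitlines():
        line = line.strip()

        if len(line) == 0:
            continue
        elif found_weights:
            tokens = line.split()
            weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
        elif line.startswith('# BLEU'):
            bleu_score = float(line.split()[2])
        elif line == '[weight]':
            found_weights = True

    print(bleu_score)  # 0.4312
    print(weights)     # e.g. {'LM0': [0.5, 0.2], 'TM0': [0.1, 0.3, 0.4]}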
Example #15
    def translate(self, corpora, dest_path=None, debug=False):
        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')

        if dest_path:
            fileutils.makedirs(dest_path, exist_ok=True)

        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        working_dir = self._engine.get_tempdir('evaluation')
        have_references = False

        try:
            results = []

            # Process references
            corpora_path = os.path.join(working_dir, 'corpora')
            corpora = self._xmlencoder.encode(corpora, corpora_path)

            reference = os.path.join(working_dir, 'reference.' + target_lang)
            source = os.path.join(working_dir, 'source.' + source_lang)
            refs = [corpus.get_file(target_lang) for corpus in corpora if corpus.get_file(target_lang)]
            have_references = len(refs) > 0
            fileutils.merge(refs, reference)  # tolerates missing reference
            fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

            if dest_path:
                for corpus in corpora:
                    corpus.copy(dest_path, suffixes={source_lang: '.src', target_lang: '.ref', 'tmx': '.src'})

            # Translate
            translator = self._translator
            name = translator.name()

            result = _EvaluationResult(translator)
            results.append(result)

            translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
            xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
            fileutils.makedirs(translations_path, exist_ok=True)

            try:
                translated, mtt, parallelism = translator.translate(corpora, translations_path)
                filename = result.id + '.' + target_lang

                result.mtt = mtt
                result.parallelism = parallelism
                result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                result.merge = os.path.join(working_dir, filename)

                fileutils.merge([corpus.get_file(target_lang)
                                 for corpus in result.translated_corpora], result.merge)

                if dest_path:
                    for corpus in result.translated_corpora:
                        corpus.copy(dest_path, suffixes={target_lang: '.hyp', 'tmx': '.hyp'})

            except TranslateError as e:
                result.error = e
            except Exception as e:
                result.error = TranslateError('Unexpected ERROR: ' + str(e.message))

            if result.error is None:
                if have_references:
                    scorer = BLEUScore()
                    # bleu in range [0;1)
                    bleu = scorer.calculate(result.merge, reference)
                    return bleu
                else:
                    return True
            else:
                print(result.error)
                return None
        finally:
            if not debug:
                self._engine.clear_tempdir('evaluation')
Example #16
    def evaluate(self, corpora, heval_output=None, debug=False):
        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        corpora = [corpus for corpus in corpora if source_lang in corpus.langs and target_lang in corpus.langs]
        if len(corpora) == 0:
            raise IllegalArgumentException('No %s > %s corpora found in the specified path' % (source_lang, target_lang))

        if heval_output is not None:
            fileutils.makedirs(heval_output, exist_ok=True)

        logger = _evaluate_logger()
        logger.start(corpora)

        working_dir = self._engine.get_tempdir('evaluation')

        try:
            results = []

            # Process references
            with logger.step('Preparing corpora') as _:
                corpora_path = os.path.join(working_dir, 'corpora')
                corpora = self._xmlencoder.encode(corpora, corpora_path)

                reference = os.path.join(working_dir, 'reference.' + target_lang)
                source = os.path.join(working_dir, 'source.' + source_lang)
                fileutils.merge([corpus.get_file(target_lang) for corpus in corpora], reference)
                fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

                if heval_output is not None:
                    self._heval_outputter.write(lang=target_lang, input_file=reference,
                                                output_file=os.path.join(heval_output, 'reference.' + target_lang))
                    self._heval_outputter.write(lang=source_lang, input_file=source,
                                                output_file=os.path.join(heval_output, 'source.' + source_lang))

            # Translate
            for translator in self._translators:
                name = translator.name()

                with logger.step('Translating with %s' % name) as _:
                    result = _EvaluationResult(translator)
                    results.append(result)

                    translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
                    xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
                    fileutils.makedirs(translations_path, exist_ok=True)

                    try:
                        translated, mtt, parallelism = translator.translate(corpora, translations_path)
                        filename = result.id + '.' + target_lang

                        result.mtt = mtt
                        result.parallelism = parallelism
                        result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                        result.merge = os.path.join(working_dir, filename)

                        fileutils.merge([corpus.get_file(target_lang)
                                         for corpus in result.translated_corpora], result.merge)

                        if heval_output is not None:
                            self._heval_outputter.write(lang=target_lang, input_file=result.merge,
                                                        output_file=os.path.join(heval_output, filename))
                    except TranslateError as e:
                        result.error = e
                    except Exception as e:
                        result.error = TranslateError('Unexpected ERROR: ' + str(e.message))

            # Check corpora length
            reference_lines = fileutils.linecount(reference)
            for result in results:
                if result.error is not None:
                    continue

                lines = fileutils.linecount(result.merge)

                if lines != reference_lines:
                    raise TranslateError('Invalid line count for translator %s: expected %d, found %d.'
                                         % (result.translator.name(), reference_lines, lines))

            # Scoring
            scorers = [(MatecatScore(), 'pes'), (BLEUScore(), 'bleu')]

            for scorer, field in scorers:
                with logger.step('Calculating %s' % scorer.name()) as _:
                    for result in results:
                        if result.error is not None:
                            continue
                        setattr(result, field, scorer.calculate(result.merge, reference))

            logger.completed(results, scorers)

            return results
        finally:
            if not debug:
                self._engine.clear_tempdir('evaluation')
Example #17
    def _build(self, resume=False):

        self._temp_dir = self._engine.get_tempdir('training',
                                                  ensure=(not resume))
        self._checkpoint_path = os.path.join(self._temp_dir, 'checkpoint.json')
        self._passed_steps = []

        if resume:
            self.load_checkpoint()
        else:
            self.save_checkpoint()

        # initialize the checkpoint manager
        source_lang = self._engine.source_lang
        target_lang = self._engine.target_lang

        # split the corpora found under the given roots into bilingual and monolingual lists
        bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(
            source_lang, target_lang, roots=self._roots)
        # if no bilingual corpora are found, it is not possible to train the translation system
        if len(bilingual_corpora) == 0:
            raise IllegalArgumentException(
                'your project does not include %s-%s data.' %
                (source_lang.upper(), target_lang.upper()))

        # if no engine folder exists, or if we are not resuming a previous training,
        # (re)create the engine folder from scratch
        if not os.path.isdir(self._engine.path) or not resume:
            shutil.rmtree(self._engine.path, ignore_errors=True)
            os.makedirs(self._engine.path)

        # Check if all requirements are fulfilled before launching engine training
        self._check_constraints()

        # Create a new logger for the building activities,
        # passing it the number of steps to perform (plus one mandatory extra step)
        # and the name of the log file to create
        logger = _builder_logger(
            len(self._scheduled_steps) + 1,
            self._engine.get_logfile('training'))
        delete_on_exit = not self._debug
        # Start the engine building (training) phases
        try:
            # tell the logger that the engine training has started
            logger.start(self._engine, bilingual_corpora, monolingual_corpora)

            # ~~~~~~~~~~~~~~~~~~~~~ RUN ALL STEPS ~~~~~~~~~~~~~~~~~~~~~
            # Note: if resume is true, a step is only run if it was not in the previous attempt

            # run tm_cleanup step on the bilingual_corpora if required.
            # Obtain cleaned bicorpora
            cleaned_bicorpora = self._run_step('tm_cleanup',
                                               self._step_tm_cleanup,
                                               logger=logger,
                                               values=[bilingual_corpora],
                                               delete_on_exit=delete_on_exit)

            # run __db_map step (always: user can't skip it)
            # on the cleaned bicorpora and the original monocorpora;
            # obtain base bicorpora and base monocorpora
            base_bicorpora, base_monocorpora = self._run_step(
                '__db_map',
                self._step_init,
                forced=True,
                values=[cleaned_bicorpora, monolingual_corpora],
                delete_on_exit=delete_on_exit)

            # run preprocess step if required.
            # Return processed bi and mono corpora and cleaned bicorpora
            processed_bicorpora, processed_monocorpora, cleaned_bicorpora = \
                self._run_step('preprocess',
                               self._step_preprocess,
                               logger=logger,
                               values=[base_bicorpora, base_monocorpora, base_bicorpora],
                               delete_on_exit=delete_on_exit)

            # run context_analyzer step on base_bicorpora if required.
            _ = self._run_step('context_analyzer',
                               self._step_context_analyzer,
                               logger=logger,
                               values=[base_bicorpora],
                               delete_on_exit=delete_on_exit)

            # run aligner step on cleaned_bicorpora if required.
            _ = self._run_step('aligner',
                               self._step_aligner,
                               logger=logger,
                               values=[cleaned_bicorpora],
                               delete_on_exit=delete_on_exit)

            # run tm step on cleaned_bicorpora if required.
            _ = self._run_step('tm',
                               self._step_tm,
                               logger=logger,
                               values=[cleaned_bicorpora],
                               delete_on_exit=delete_on_exit)

            # run lm step on the joint list of processed_bicorpora and processed_monocorpora
            _ = self._run_step(
                'lm',
                self._step_lm,
                logger=logger,
                values=[processed_bicorpora + processed_monocorpora],
                delete_on_exit=delete_on_exit)

            # Writing config file
            with logger.step('Writing config files') as _:
                self._engine.write_configs()

            # tell the logger that the engine training has completed
            logger.completed()

            # if this is not debug mode, then the training temporary folder must be deleted
            if not self._debug:
                self._engine.clear_tempdir('training')
        except:
            logger.error()
            raise
        finally:
            logger.close()
Example #18
    def evaluate(self, corpora, heval_output=None, debug=False):
        corpora = [
            corpus for corpus in corpora if self._source_lang in corpus.langs
            and self._target_lang in corpus.langs
        ]
        if len(corpora) == 0:
            raise IllegalArgumentException(
                'No %s > %s corpora found in the specified path' %
                (self._source_lang, self._target_lang))

        print '\n============== EVALUATION ==============\n'
        print 'Testing on %d lines:\n' % sum(
            [corpus.count_lines() for corpus in corpora])

        if heval_output is not None:
            osutils.makedirs(heval_output, exist_ok=True)

        step_logger = _StepLogger()
        human_eval_outputter = HumanEvaluationFileOutputter() if heval_output is not None else None

        working_dir = self._engine.get_tempdir('evaluation')

        try:
            # Process references
            with step_logger.step('Preparing corpora') as _:
                source = os.path.join(working_dir,
                                      'source.' + self._source_lang)
                osutils.concat(
                    [corpus.get_file(self._source_lang) for corpus in corpora],
                    source)

                reference = os.path.join(working_dir,
                                         'reference.' + self._target_lang)
                osutils.concat(
                    [corpus.get_file(self._target_lang) for corpus in corpora],
                    reference + '.tmp')
                XMLEncoder().encode_file(reference + '.tmp', reference)
                os.remove(reference + '.tmp')

                if human_eval_outputter is not None:
                    human_eval_outputter.write(lang=self._target_lang,
                                               input_file=reference,
                                               output_file=os.path.join(
                                                   heval_output, 'reference.' +
                                                   self._target_lang))
                    human_eval_outputter.write(lang=self._source_lang,
                                               input_file=source,
                                               output_file=os.path.join(
                                                   heval_output, 'source.' +
                                                   self._source_lang))

                total_line_count = osutils.lc(reference)

            # Translate
            entries = []
            for translator in self._translators:
                with step_logger.step('Translating with %s' %
                                      translator.name) as _:
                    entry = self._translate_with(translator, corpora,
                                                 working_dir, total_line_count)
                    entries.append(entry)

                    if entry.error is None and human_eval_outputter is not None:
                        human_eval_file = os.path.join(
                            heval_output,
                            os.path.basename(entry.translation_file))
                        human_eval_outputter.write(
                            lang=self._target_lang,
                            input_file=entry.translation_file,
                            output_file=human_eval_file)

            # Scoring
            for scorer in self._scorers:
                with step_logger.step('Calculating %s' % scorer.name()) as _:
                    for entry in entries:
                        if entry.error is not None:
                            continue
                        try:
                            entry.scores[scorer] = scorer.calculate(
                                entry.translation_file, reference)
                        except Exception as e:
                            entry.scores[scorer] = str(e)

            # Print results
            print '\n=============== RESULTS ================\n'

            for scorer in self._scorers:
                print scorer.name() + ':'

                for i, entry in enumerate(
                        sorted(entries,
                               key=lambda x: x.scores[scorer]
                               if x.error is None else 0,
                               reverse=True)):
                    if entry.error is None:
                        value = entry.scores[scorer]
                        if isinstance(value, basestring):
                            text = value
                        else:
                            text = '%.2f' % (value * 100)
                            if i == 0:
                                text += ' (Winner)'
                    else:
                        text = str(entry.error)

                    print '  %s: %s' % (entry.translator.name.ljust(20), text)
                print

            print 'Translation Speed:'
            for entry in sorted(entries,
                                key=lambda x: x.translation_time
                                if x.error is None else float('inf')):
                if entry.error is None:
                    text = '%.2fs per sentence' % entry.translation_time
                else:
                    text = str(entry.error)

                print '  %s: %s' % (entry.translator.name.ljust(20), text)
            print
        finally:
            if not debug:
                self._engine.clear_tempdir('evaluation')