Beispiel #1
0
    def _prepare_training_data(self, opt, skip=False):
        """List training/validation corpora and run the OpenNMT preprocessor.

        :param opt: options object carrying corpora paths, BPE settings and the
            training directory
        :param skip: when True, assume preprocessing already ran and do nothing
        """
        training_corpora = BilingualCorpus.list(opt.training_corpora)
        valid_corpora = BilingualCorpus.list(opt.valid_corpora)
        self._logger.log(self._log_level, 'training corpora:%s' % repr(training_corpora))
        self._logger.log(self._log_level, 'validation corpora:%s' % repr(valid_corpora))

        if not skip:
            # NOTE(review): opt.training_dir is passed both positionally and as
            # working_dir= — confirm the preprocessor really expects it twice.
            self.onmt_preprocessor.process(training_corpora, valid_corpora, opt.training_dir,
                                           bpe_symbols=opt.bpe_symbols, max_vocab_size=opt.bpe_max_vocab_size,
                                           working_dir=opt.training_dir)
Beispiel #2
0
    def encode(self, corpora, dest_folder):
        """Encode every file of every corpus into dest_folder, then list the result."""
        if not os.path.isdir(dest_folder):
            fileutils.makedirs(dest_folder, exist_ok=True)

        for corpus in corpora:
            for language in corpus.langs:
                in_file = corpus.get_file(language)
                out_file = BilingualCorpus.make_parallel(
                    corpus.name, dest_folder, [language]).get_file(language)
                self.encode_file(in_file, out_file, delete_nl=True)

        return BilingualCorpus.list(dest_folder)
Beispiel #3
0
    def process_corpora(self, corpora, output_folder):
        """Process each corpus file into output_folder and return the resulting corpora."""
        fileutils.makedirs(output_folder, exist_ok=True)

        for corpus in corpora:
            dest_corpus = BilingualCorpus.make_parallel(
                corpus.name, output_folder, corpus.langs)

            for language in corpus.langs:
                self.process_file(corpus.get_file(language),
                                  dest_corpus.get_file(language),
                                  language)

        return BilingualCorpus.list(output_folder)
Beispiel #4
0
    def clean(self, corpora, output_path, log=None):
        """Run the Java cleaner over the corpora folders; return the cleaned corpora.

        :param corpora: input corpora (their parent folders are passed to the tool)
        :param output_path: destination folder for the cleaned corpora
        :param log: stream for the subprocess output (defaults to devnull)
        """
        if log is None:
            log = shell.DEVNULL

        # Size the JVM heap at 90% of physical memory
        # (page size * page count = total bytes, converted to MB).
        mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
        mem_mb = mem_bytes / (1024. ** 2)
        extended_heap_mb = int(mem_mb * 90 / 100)

        args = ['-s', self._source_lang, '-t', self._target_lang,
                '--output', output_path, '--input']
        # De-duplicate parent folders: many corpora typically share one root.
        args.extend({corpus.get_folder() for corpus in corpora})

        command = mmt_javamain(self._java_mainclass,
                               args=args,
                               max_heap_mb=extended_heap_mb)
        shell.execute(command, stdout=log, stderr=log)

        return BilingualCorpus.list(output_path)
Beispiel #5
0
    def start(self, node, corpora, debug=False):
        """Start on the given node; default to the engine's dev corpora when none given."""
        if corpora is None:
            dev_folder = os.path.join(node.engine.data_path,
                                      TrainingPreprocessor.DEV_FOLDER_NAME)
            corpora = BilingualCorpus.list(dev_folder)

        self._start_fn(node, corpora, self, debug)
Beispiel #6
0
    def train(self, corpora, aligner, working_dir='.', log=None):
        """Train the model: symlink corpora into the working folder, align them,
        export the lexical model, then invoke the native build binary.
        """
        if log is None:
            log = shell.DEVNULL

        # Always rebuild the model folder from scratch.
        shutil.rmtree(self._model, ignore_errors=True)
        fileutils.makedirs(self._model, exist_ok=True)

        corpora_path = os.path.join(working_dir, 'corpora')
        lex_model_path = os.path.join(working_dir, 'model.tlex')

        if not os.path.isdir(corpora_path):
            fileutils.makedirs(corpora_path, exist_ok=True)

        # Populate the training folder with symlinks to the original files.
        linked_corpora = []
        for corpus in corpora:
            linked = BilingualCorpus.make_parallel(
                corpus.name, corpora_path, (self._source_lang, self._target_lang))

            os.symlink(corpus.get_file(self._source_lang),
                       linked.get_file(self._source_lang))
            os.symlink(corpus.get_file(self._target_lang),
                       linked.get_file(self._target_lang))

            linked_corpora.append(linked)

        # Word-align the corpora and export the lexical translation model.
        aligner.align(linked_corpora, corpora_path, log=log)
        aligner.export(lex_model_path)

        # Build the final model from the aligned data.
        command = [self._build_bin, '--lex', lex_model_path, '--input', corpora_path,
                   '--model', self._model, '-s', self._source_lang,
                   '-t', self._target_lang, '-v', self._vb.model]
        shell.execute(command, stdout=log, stderr=log)
Beispiel #7
0
    def _clean_tms(self, args, skip=False, log=None):
        """Clean the bilingual corpora into a temp folder; on skip, just list it."""
        folder = self._get_tempdir('clean_corpora')

        if not skip:
            args.bilingual_corpora = self._engine.cleaner.clean(
                args.bilingual_corpora, folder, log=log)
        else:
            args.bilingual_corpora = BilingualCorpus.list(folder)
Beispiel #8
0
    def _preprocess(self, args, skip=False, log=None):
        """Preprocess all corpora; on skip, recover results from the temp folder.

        :raises CorpusNotFoundInFolderException: if there is no input corpus
        """
        folder = self._get_tempdir('preprocessed_corpora')

        if skip:
            bicorpora, monocorpora = BilingualCorpus.splitlist(
                self._engine.source_lang,
                self._engine.target_lang,
                roots=folder)
        else:
            all_corpora = args.bilingual_corpora + args.monolingual_corpora
            if not all_corpora:
                raise CorpusNotFoundInFolderException(
                    "Could not find any valid %s -> %s segments in your input."
                    % (self._engine.source_lang, self._engine.target_lang))

            # data_path enables the train/dev/test split when configured.
            data_path = self._engine.data_path if self._split_trainingset else None
            bicorpora, monocorpora = self._engine.training_preprocessor.process(
                all_corpora,
                folder,
                data_path=data_path,
                vb_path=self._engine.vocabulary_path,
                log=log)

        args.processed_bilingual_corpora = bicorpora
        args.processed_monolingual_corpora = monocorpora
Beispiel #9
0
    def process(self,
                corpora,
                output_path,
                test_data_path=None,
                dev_data_path=None,
                log=None):
        """Run the Java preprocessor over the corpora folders; return the result.

        Optional dev/test destinations make the tool extract those splits.
        """
        if log is None:
            log = osutils.DEVNULL

        args = ['-s', self._source_lang, '-t', self._target_lang,
                '--output', output_path, '--input']
        # De-duplicate parent folders before passing them as inputs.
        args.extend({corpus.get_folder() for corpus in corpora})

        if dev_data_path is not None:
            args += ['--dev', dev_data_path]
        if test_data_path is not None:
            args += ['--test', test_data_path]

        command = mmt_javamain(self._java_main, args)
        osutils.shell_exec(command, stdout=log, stderr=log)

        return BilingualCorpus.list(self._source_lang, self._target_lang,
                                    output_path)
Beispiel #10
0
    def build(self, corpora, working_dir='.', log=None):
        """Merge all corpora into one parallel corpus and run the build binary on it."""
        if log is None:
            log = shell.DEVNULL

        # Always rebuild the model folder from scratch.
        shutil.rmtree(self._model, ignore_errors=True)
        fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        merged = BilingualCorpus.make_parallel(
            'merge', working_dir, (self._source_lang, self._target_lang))

        # Concatenate each language side of every corpus into the merged files.
        for lang in (self._source_lang, self._target_lang):
            fileutils.merge([corpus.get_file(lang) for corpus in corpora],
                            merged.get_file(lang))

        command = [
            self._build_bin,
            '-s', merged.get_file(self._source_lang),
            '-t', merged.get_file(self._target_lang),
            '-m', self._model,
            '-I', '4'
        ]
        shell.execute(command, stdout=log, stderr=log)
Beispiel #11
0
    def _clean_tms(self, args, skip=False, log=None):
        """Clean args.corpora into the temp folder; on skip, just list it."""
        folder = self._get_tempdir('clean_corpora')

        if not skip:
            args.corpora = self._cleaner.clean(args.corpora, folder, log=log)
        else:
            args.corpora = BilingualCorpus.list(self.source_lang,
                                                self.target_lang, folder)
Beispiel #12
0
    def _reduce_train(self, args, skip=False, log=None):
        """Cap the training set at self._max_training_words words; on skip, reuse the temp folder."""
        folder = self._get_tempdir('reduced_corpora')

        if not skip:
            args.bilingual_corpora = self._engine.training_preprocessor.reduce(
                args.bilingual_corpora, folder,
                self._max_training_words, log=log)
        else:
            args.bilingual_corpora = BilingualCorpus.list(folder)
Beispiel #13
0
    def translate(self, corpora, output):
        """
        Translate the given corpora in parallel processing fashion.
        :param corpora: list of ParallelCorpus
        :param output:  path to output directory
        :return: ([ParallelCorpus, ...], time_per_sentence, parallelism)
        """
        pool = multithread.Pool(self._threads)

        try:
            translations = []
            start_time = datetime.now()

            # Submit every source line of every corpus to the worker pool,
            # remembering which output file each job's result belongs to.
            for corpus in corpora:
                self._before_translate(corpus)

                with open(corpus.get_file(self.source_lang)) as source:
                    output_path = os.path.join(output, corpus.name + '.' + self.target_lang)

                    for line in source:
                        translation = pool.apply_async(self._get_translation, (line, corpus))
                        translations.append((translation, output_path))

                self._after_translate(corpus)

            elapsed_time = 0
            translation_count = 0

            path = None
            stream = None

            # Collect results in submission order; jobs for the same corpus are
            # contiguous, so switch the output stream whenever the path changes.
            for translation_job, output_path in translations:
                translation, elapsed = translation_job.get()

                if output_path != path:
                    if stream is not None:
                        stream.close()

                    stream = open(output_path, 'wb')
                    path = output_path

                # NOTE(review): a str newline is written to a binary-mode stream;
                # this works on Python 2 but would fail on Python 3 — confirm
                # the target runtime.
                stream.write(translation.encode('utf-8'))
                stream.write('\n')

                elapsed_time += elapsed
                translation_count += 1

            if stream is not None:
                stream.close()

            end_time = datetime.now()
            total_time = end_time - start_time

            # time_per_sentence = mean per-job decode time; parallelism = sum of
            # per-job times over wall-clock duration.
            return BilingualCorpus.list(output), (elapsed_time / translation_count), (
                elapsed_time / total_time.total_seconds())
        finally:
            pool.terminate()
Beispiel #14
0
    def _create_db(self, args, skip=False):
        """Insert bilingual corpora into the engine DB and symlink everything
        into the training folder; on skip, recover the lists from that folder.
        """
        folder = self._get_tempdir('training_corpora')

        if skip:
            bilingual, monolingual = BilingualCorpus.splitlist(
                self._engine.source_lang, self._engine.target_lang, roots=folder)
        else:
            memories = self._engine.db.insert(args.bilingual_corpora)

            # Bilingual corpora are renamed after their DB memory id.
            bilingual = [memory.corpus.symlink(folder, name=str(memory.id))
                         for memory in memories]
            monolingual = [corpus.symlink(folder)
                           for corpus in args.monolingual_corpora]

        args.bilingual_corpora = bilingual
        args.monolingual_corpora = monolingual
Beispiel #15
0
    def _prepare_data(self, args, skip=False, log=None):
        """Prepare the neural training data folder ('neural_train_data').

        Training corpora: the preprocessed ones when available, otherwise the
        raw args.corpora. Validation corpora: the preprocessed ones, otherwise
        whatever is found in self._validation_path.

        :param args: pipeline state; writes args.prepared_data_path
        :param skip: when True, only set the path (data assumed already built)
        :param log: stream passed to the decoder's data preparation
        """
        args.prepared_data_path = self._get_tempdir('neural_train_data')

        if not skip:
            # Fixed: was filter(None, [...])[0], which is Python-3 incompatible
            # (filter() returns a non-subscriptable iterator there) and opaque.
            train_corpora = args.processed_train_corpora or args.corpora
            eval_corpora = args.processed_valid_corpora or BilingualCorpus.list(
                self.source_lang, self.target_lang, self._validation_path)
            self._decoder.prepare_data(train_corpora,
                                       eval_corpora,
                                       args.prepared_data_path,
                                       log=log,
                                       bpe_symbols=self._bpe_symbols,
                                       fromModel=self._fromModel)
Beispiel #16
0
    def _preprocess(self, args, skip=False, log=None):
        """Preprocess training corpora (optionally extracting dev/test splits)
        and the validation corpora into the 'preprocessed_corpora' temp folder.

        :param args: pipeline state; reads args.corpora, writes
            args.processed_train_corpora and args.processed_valid_corpora
        :param skip: when True, recover previous results from the temp folder
        :param log: stream passed to the underlying preprocessor
        :raises CorpusNotFoundInFolderException: if no input or no validation
            corpora can be found
        """
        preprocessed_folder = self._get_tempdir('preprocessed_corpora')
        train_folder = os.path.join(preprocessed_folder, 'train')
        valid_folder = os.path.join(preprocessed_folder, 'validation')
        raw_valid_folder = os.path.join(preprocessed_folder,
                                        'extracted_validation')

        if skip:
            # Resume: a previous run already populated the folders.
            args.processed_train_corpora = BilingualCorpus.list(
                self.source_lang, self.target_lang, train_folder)
            args.processed_valid_corpora = BilingualCorpus.list(
                self.source_lang, self.target_lang, valid_folder)
        else:
            if not args.corpora:
                raise CorpusNotFoundInFolderException(
                    'Could not find any valid %s > %s segments in your input.'
                    % (self.source_lang, self.target_lang))

            # When splitting the training set, the preprocessor itself extracts
            # dev/test portions into these destinations.
            test_data_path = self._engine.test_data_path if self._split_train else None
            dev_data_path = raw_valid_folder if self._split_train else None
            args.processed_train_corpora = self._training_preprocessor.process(
                args.corpora,
                train_folder,
                log=log,
                test_data_path=test_data_path,
                dev_data_path=dev_data_path)
            # Validation data: prefer the freshly extracted dev split, falling
            # back to the engine's static validation path.
            valid_corpora = BilingualCorpus.list(
                self.source_lang, self.target_lang, dev_data_path
                or self._validation_path)

            if not valid_corpora:
                raise CorpusNotFoundInFolderException(
                    'Could not find any valid %s > %s segments for validation.'
                    % (self.source_lang, self.target_lang))

            args.processed_valid_corpora = self._training_preprocessor.process(
                valid_corpora, valid_folder, log=log)
Beispiel #17
0
    def reduce(self, corpora, output_path, word_limit, log=None):
        """Reduce the corpora to at most word_limit words with the Java reducer;
        return the reduced corpora.
        """
        if log is None:
            log = shell.DEVNULL

        args = ['-s', self._source_lang, '-t', self._target_lang,
                '--words', str(word_limit), '--output', output_path, '--input']
        # De-duplicate parent folders before passing them as inputs.
        args.extend({corpus.get_folder() for corpus in corpora})

        command = mmt_javamain(self._reduce_mainclass, args=args)
        shell.execute(command, stdout=log, stderr=log)

        return BilingualCorpus.list(output_path)
Beispiel #18
0
    def _prepare_training_data(self, args, skip=False, delete_on_exit=False):
        """Build the OpenNMT training folder from the (processed) bilingual
        corpora plus freshly preprocessed validation corpora.

        :param args: pipeline state; writes args.onmt_training_path
        :param skip: when True, only set the path (data assumed already built)
        :param delete_on_exit: remove the intermediate processed validation
            folder once the NMT preprocessor has consumed it
        """
        args.onmt_training_path = self._get_tempdir('onmt_training')

        if not skip:
            processed_valid_path = os.path.join(args.onmt_training_path, 'processed_valid')

            validation_corpora = BilingualCorpus.list(self._valid_corpora_path)
            validation_corpora, _ = self._engine.training_preprocessor.process(validation_corpora, processed_valid_path)

            # Fixed: was filter(None, [...])[0], which is Python-3 incompatible
            # (filter() returns a non-subscriptable iterator there) and opaque.
            corpora = args.processed_bilingual_corpora or args.bilingual_corpora

            self._engine.nmt_preprocessor.process(corpora, validation_corpora, args.onmt_training_path,
                                                  checkpoint=self._checkpoint)

            if delete_on_exit:
                shutil.rmtree(processed_valid_path, ignore_errors=True)
Beispiel #19
0
    def encode(self, corpora, dest_folder):
        """Encode each corpus file into dest_folder; return the encoded corpora."""
        if not os.path.isdir(dest_folder):
            osutils.makedirs(dest_folder, exist_ok=True)

        encoded = []
        for corpus in corpora:
            dest_corpus = BilingualCorpus.make_parallel(corpus.name,
                                                        dest_folder,
                                                        corpus.langs)

            for language in corpus.langs:
                self.encode_file(corpus.get_file(language),
                                 dest_corpus.get_file(language),
                                 delete_nl=True)

            encoded.append(dest_corpus)

        return encoded
Beispiel #20
0
    def clean(self, corpora, output_path, log=None):
        """Clean the corpora with the Java cleaner; return the cleaned corpora."""
        if log is None:
            log = osutils.DEVNULL

        args = ['-s', self._source_lang, '-t', self._target_lang,
                '--output', output_path, '--input']
        # De-duplicate parent folders before passing them as inputs.
        args.extend({corpus.get_folder() for corpus in corpora})

        # Give the JVM 90% of the available physical memory.
        heap_mb = int(osutils.mem_size() * 90 / 100)

        command = mmt_javamain(self._java_main,
                               args=args,
                               max_heap_mb=heap_mb)
        osutils.shell_exec(command, stdout=log, stderr=log)

        return BilingualCorpus.list(self._source_lang, self._target_lang,
                                    output_path)
Beispiel #21
0
    def process(self,
                corpora,
                output_path,
                data_path=None,
                log=None,
                vb_path=None):
        """Run the Java training preprocessor and split the output into
        bilingual and monolingual corpora.

        :param data_path: when given, dev/test splits are written under its
            DEV/TEST subfolders
        :param vb_path: optional vocabulary path forwarded to the tool
        """
        if log is None:
            log = shell.DEVNULL

        args = ['-s', self._source_lang, '-t', self._target_lang,
                '--output', output_path]

        if vb_path:
            args += ['-v', vb_path]

        args.append('--input')
        # De-duplicate parent folders before passing them as inputs.
        args.extend({corpus.get_folder() for corpus in corpora})

        if data_path is not None:
            args += ['--dev',
                     os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME),
                     '--test',
                     os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME)]

        command = mmt_javamain(self._process_mainclass, args)
        shell.execute(command, stdout=log, stderr=log)

        return BilingualCorpus.splitlist(self._source_lang,
                                         self._target_lang,
                                         roots=output_path)
Beispiel #22
0
    def phrase_based_tune(self, corpora, debug=False, listener=None,
                          context_enabled=True, random_seeds=False, max_iterations=25, early_stopping_value=None):
        """Tune the phrase-based decoder feature weights with MERT.

        :param corpora: candidate tuning corpora; only those containing both
            engine languages are used
        :param debug: when True, keep the 'tuning' temp folder on exit
        :param listener: progress observer (defaults to a fresh TuneListener)
        :param context_enabled: when False, context analysis is skipped
        :param random_seeds: when False, MERT runs with predictable seeds
        :param max_iterations: MERT iteration cap (<= 0 means no cap)
        :param early_stopping_value: optional BLEU-based early-stopping value
        :raises IllegalArgumentException: if no usable corpora are found
        """
        target_lang = self.engine.target_lang
        source_lang = self.engine.source_lang

        # Keep only corpora that actually contain both language sides.
        corpora = [corpus for corpus in corpora if source_lang in corpus.langs and target_lang in corpus.langs]
        if len(corpora) == 0:
            raise IllegalArgumentException('No %s > %s corpora found into specified path' % (source_lang, target_lang))

        # Monolingual views of the same corpora: source side and reference side.
        source_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [source_lang])
                          for corpus in corpora]
        reference_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [target_lang])
                             for corpus in corpora]

        if listener is None:
            listener = self.TuneListener()

        listener.on_tuning_begin(corpora, self, 4)

        working_dir = self.engine.get_tempdir('tuning')
        mert_wd = os.path.join(working_dir, 'mert')

        try:
            # Tokenization
            tokenizer = Tokenizer(source_lang=source_lang, target_lang=target_lang,
                                  print_placeholders=False, print_tags=False)
            tokenized_output = os.path.join(working_dir, 'reference_corpora')
            fileutils.makedirs(tokenized_output, exist_ok=True)

            with listener.step('Corpora tokenization') as _:
                reference_corpora = tokenizer.process_corpora(corpora=reference_corpora, output_folder=tokenized_output)

            # Create merged corpus
            with listener.step('Merging corpus') as _:
                # source
                source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)

                # NOTE(review): this writes each source file *path* plus a
                # newline, while the target side below merges file *contents*
                # via fileutils.merge — confirm the MERT script expects a list
                # of paths here; otherwise this looks like a bug.
                with open(source_merged_corpus, 'wb') as out:
                    for corpus in source_corpora:
                        out.write(corpus.get_file(source_lang) + '\n')

                # target
                target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
                fileutils.merge([corpus.get_file(target_lang) for corpus in reference_corpora], target_merged_corpus)

            # Run MERT algorithm
            with listener.step('Tuning') as _:
                # Start MERT
                decoder_flags = ['--port', str(self.api.port), '--source', source_lang, '--target', target_lang]

                if self.api.root is not None:
                    decoder_flags += ['--root', self.api.root]

                if not context_enabled:
                    decoder_flags.append('--skip-context-analysis')
                    decoder_flags.append('1')

                fileutils.makedirs(mert_wd, exist_ok=True)

                with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                    command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                               self._mert_i_script, runtime_moses_ini.name, '--threads',
                               str(multiprocessing.cpu_count()), '--mertdir', cli.BIN_DIR,
                               '--mertargs', '\'--binary --sctype BLEU\'', '--working-dir', mert_wd, '--nbest', '100',
                               '--decoder-flags', '"' + ' '.join(decoder_flags) + '"', '--nonorm', '--closest',
                               '--no-filter-phrase-table']

                    if early_stopping_value is not None:
                        command += ['--bleuscorer', self._scorer_script,
                                    '--bleuscorer-flags "-nt" --early-stopping-value %d' % early_stopping_value]

                    # '--predictable-seeds' makes MERT deterministic; omitted
                    # only when the caller explicitly asks for random seeds.
                    if not random_seeds:
                        command.append('--predictable-seeds')
                    if max_iterations > 0:
                        command.append('--maximum-iterations={num}'.format(num=max_iterations))

                    with open(self.engine.get_logfile('mert'), 'wb') as log:
                        shell.execute(' '.join(command), stdout=log, stderr=log)

            # Read optimized configuration
            with listener.step('Applying changes') as _:
                bleu_score = 0
                weights = {}
                found_weights = False

                # Parse the moses.ini produced by MERT: pick up the '# BLEU'
                # score line and every feature-weight line after '[weight]'.
                with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                    for line in moses_ini:
                        line = line.strip()

                        if len(line) == 0:
                            continue
                        elif found_weights:
                            tokens = line.split()
                            weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                        elif line.startswith('# BLEU'):
                            bleu_score = float(line.split()[2])
                        elif line == '[weight]':
                            found_weights = True

                _ = self.api.update_features(weights)

            listener.on_tuning_end(self, bleu_score)
        finally:
            if not debug:
                self.engine.clear_tempdir("tuning")
Beispiel #23
0
    def _build(self, resume, listener):
        """Run every scheduled training step for the engine.

        :param resume: when True, reload the previous checkpoint and skip the
            steps already completed in the prior attempt
        :param listener: optional observer notified of training/step events
        :raises CorpusNotFoundInFolderException: if no bilingual corpora exist
        """
        self._temp_dir = self._engine.get_tempdir('training',
                                                  ensure=(not resume))

        # The checkpoint records which steps already completed.
        checkpoint_path = os.path.join(self._temp_dir, 'checkpoint.json')
        if resume:
            self._schedule.load(checkpoint_path)
        else:
            self._schedule.store(checkpoint_path)

        source_lang = self._engine.source_lang
        target_lang = self._engine.target_lang

        # separate bilingual and monolingual corpora in separate lists, reading them from roots
        bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(
            source_lang, target_lang, roots=self._roots)

        # if no bilingual corpora are found, it is not possible to train the translation system
        if len(bilingual_corpora) == 0:
            raise CorpusNotFoundInFolderException(
                'Could not find %s-%s corpora in path %s' %
                (source_lang.upper(), target_lang.upper(), ', '.join(
                    self._roots)))

        # if no old engines (i.e. engine folders) can be found, create a new one from scratch
        # if we are not trying to resume an old one, create from scratch anyway
        if not os.path.isdir(self._engine.path) or not resume:
            shutil.rmtree(self._engine.path, ignore_errors=True)
            os.makedirs(self._engine.path)

        # Create a new logger for the building activities,
        # appending to the existing log when resuming.
        log_file = self._engine.get_logfile('training', append=resume)
        log_stream = open(log_file, 'ab' if resume else 'wb')
        logging.basicConfig(
            format='%(asctime)-15s [%(levelname)s] - %(message)s',
            level=logging.DEBUG,
            stream=log_stream)
        logger = logging.getLogger('EngineBuilder')

        # Start the engine building (training) phases
        try:
            logger.log(
                logging.INFO,
                'Training started: engine=%s, bilingual=%d, monolingual=%d, langpair=%s-%s'
                % (self._engine.name, len(bilingual_corpora),
                   len(monolingual_corpora), self._engine.source_lang,
                   self._engine.target_lang))

            if listener:
                listener.on_training_begin(self._schedule.visible_steps(),
                                           self._engine, bilingual_corpora,
                                           monolingual_corpora)

            # Check if all requirements are fulfilled before actual engine training
            try:
                self._check_constraints()
            except EngineBuilder.HWConstraintViolated as e:
                # HW violations are reported but do not abort training.
                if listener:
                    listener.on_hw_constraint_violated(e.cause)

            args = EngineBuilder.__Args()
            args.bilingual_corpora = bilingual_corpora
            args.monolingual_corpora = monolingual_corpora

            # ~~~~~~~~~~~~~~~~~~~~~ RUN ALL STEPS ~~~~~~~~~~~~~~~~~~~~~
            # Note: if resume is true, a step is only run if it was not in the previous attempt

            step_index = 1

            for method in self._schedule:
                # Completed steps still invoke method() with skip=True so each
                # step can restore its outputs into args.
                skip = self._schedule.is_completed(method.id)

                if listener and not method.is_hidden():
                    listener.on_step_begin(method.id, method.name)

                logger.log(
                    logging.INFO, 'Training step "%s" (%d/%d) started' %
                    (method.id, step_index, len(self._schedule)))

                start_time = time.time()
                method(self,
                       args,
                       skip=skip,
                       log=log_stream,
                       delete_on_exit=self._delete_on_exit)
                elapsed_time = time.time() - start_time

                if listener and not method.is_hidden():
                    listener.on_step_end(method.id, method.name)

                logger.log(
                    logging.INFO, 'Training step "%s" completed in %d s' %
                    (method.id, int(elapsed_time)))

                # Persist progress so a later --resume can pick up from here.
                self._schedule.step_completed(method.id)
                self._schedule.store(checkpoint_path)

                step_index += 1

            if listener:
                listener.on_training_end(self._engine)

            if self._delete_on_exit:
                self._engine.clear_tempdir('training')
        except:
            # NOTE(review): bare except also catches SystemExit and
            # KeyboardInterrupt — consider 'except Exception' unless that is
            # intentional (it does re-raise either way).
            logger.exception('Unexpected exception')
            raise
        finally:
            log_stream.close()
Beispiel #24
0
def main_sweep(argv):
    """Sweep the suffix-array 'sample' size and print a BLEU score per setting.

    For each sample size the engine is reconfigured and restarted, then the
    test corpora are evaluated; output is 'sample bleu' rows on stdout.

    :param argv: command-line arguments (engine name, optional corpora path)
    :raises RuntimeError: if the evaluator reports an error for the MMT engine
    """
    parser = argparse.ArgumentParser(
        description=
        'Sweep SA sample size and measure BLEU scores at various settings.')
    parser.add_argument(
        '-e',
        '--engine',
        dest='engine',
        help='the engine name, \'default\' will be used if absent',
        default=None)
    parser.add_argument(
        '--path',
        dest='corpora_path',
        metavar='CORPORA',
        default=None,
        help=
        'the path to the test corpora (default is the automatically splitted sample)'
    )
    args = parser.parse_args(argv)

    # Sample sizes to sweep over.
    samples = [
        int(e) for e in
        '10 20 50 70 80 90 100 110 120 150 200 350 500 800 1000 2000 5000'.
        split()
    ]

    injector = dependency.DependencyInjector()
    #injector.read_args(args)
    engine = MMTEngine(args.engine)
    injector.inject(engine)

    node = ClusterNode(engine, api_port=DEFAULT_MMT_API_PORT)

    # more or less copy-pasted from mmt evaluate:

    evaluator = Evaluator(node, google_key='1234', use_sessions=True)

    # Fall back to the automatically split test sample when no path is given.
    corpora = BilingualCorpus.list(args.corpora_path) if args.corpora_path is not None \
        else BilingualCorpus.list(os.path.join(node.engine.data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    lines = 0
    for corpus in corpora:
        lines += corpus.count_lines()

    # end copy-paste

    print('sample bleu')

    for sample in samples:
        # Reconfigure and restart the engine with the new sample size.
        node.engine.set_config_option('suffixarrays', 'sample', sample)
        injector.read_config(
            node.engine.config
        )  # to get engine.set() to affect MosesFeatures -> moses.ini
        injector.inject(node.engine)
        node.engine.write_configs()
        node.restart()

        scores = evaluator.evaluate(corpora=corpora,
                                    heval_output=None,
                                    debug=False)

        # Only the MMT engine's result is of interest here.
        engine_scores = [r for r in scores if r.id == 'MMT'][0]

        if engine_scores.error:
            raise RuntimeError(engine_scores.error)

        bleu = engine_scores.bleu
        print(sample, '%.2f' % (bleu * 100))
Beispiel #25
0
    def _build(self, resume):
        """Run every scheduled training step, reporting progress on stdout.

        Python 2 variant (uses print statements) of the engine build loop.

        :param resume: when True, reload the previous checkpoint and skip the
            steps already completed in the prior attempt
        :raises CorpusNotFoundInFolderException: if no corpora are found
        """
        self._temp_dir = self._engine.get_tempdir('training',
                                                  ensure=(not resume))

        # The checkpoint records which steps already completed.
        checkpoint_path = os.path.join(self._temp_dir, 'checkpoint.json')
        if resume:
            self._schedule.load(checkpoint_path)
        else:
            self._schedule.store(checkpoint_path)

        corpora = BilingualCorpus.list(self.source_lang, self.target_lang,
                                       self.roots)

        if len(corpora) == 0:
            raise CorpusNotFoundInFolderException(
                'Could not find %s > %s corpora in path %s' %
                (self.source_lang, self.target_lang, ', '.join(self.roots)))

        # if no old engines (i.e. engine folders) can be found, create a new one from scratch
        # if we are not trying to resume an old one, create from scratch anyway
        if not os.path.isdir(self._engine.path) or not resume:
            shutil.rmtree(self._engine.path, ignore_errors=True)
            os.makedirs(self._engine.path)

        # Create a new logger for the building activities,
        # appending to the existing log when resuming.
        log_file = self._engine.get_logfile('training', append=resume)
        log_stream = open(log_file, 'ab' if resume else 'wb')
        logging.basicConfig(
            format='%(asctime)-15s [%(levelname)s] - %(message)s',
            level=logging.DEBUG,
            stream=log_stream)
        logger = logging.getLogger('EngineBuilder')

        # Start the engine building (training) phases
        steps_count = len(self._schedule.visible_steps())
        log_line_len = 70

        try:
            logger.log(
                logging.INFO,
                'Training started: engine=%s, corpora=%d, lang_pair=%s-%s' %
                (self._engine.name, len(corpora), self.source_lang,
                 self.target_lang))

            print '\n=========== TRAINING STARTED ===========\n'
            print 'ENGINE:  %s' % self._engine.name
            print 'CORPORA: %d corpora' % len(corpora)
            print 'LANGS:   %s > %s' % (self.source_lang, self.target_lang)
            print

            # Check if all requirements are fulfilled before actual engine training
            try:
                self._check_constraints()
            except EngineBuilder.HWConstraintViolated as e:
                # HW violations only print a warning; training continues.
                print '\033[91mWARNING\033[0m: %s\n' % e.cause

            args = EngineBuilder.__Args()
            args.corpora = corpora

            # ~~~~~~~~~~~~~~~~~~~~~ RUN ALL STEPS ~~~~~~~~~~~~~~~~~~~~~
            # Note: if resume is true, a step is only run if it was not in the previous attempt

            step_index = 1

            for method in self._schedule:
                if not method.is_hidden():
                    # Trailing comma keeps 'DONE' on the same console line.
                    print('INFO: (%d of %d) %s... ' %
                          (step_index, steps_count,
                           method.name)).ljust(log_line_len),

                # Completed steps still invoke method() with skip=True so each
                # step can restore its outputs into args.
                skip = self._schedule.is_completed(method.id)
                self._step_start_time = time.time()

                logger.log(
                    logging.INFO, 'Training step "%s" (%d/%d) started' %
                    (method.id, step_index, len(self._schedule)))

                start_time = time.time()
                method(self,
                       args,
                       skip=skip,
                       log=log_stream,
                       delete_on_exit=self._delete_on_exit)
                elapsed_time_str = self._pretty_print_time(time.time() -
                                                           start_time)

                if not method.is_hidden():
                    step_index += 1
                    print 'DONE (in %s)' % elapsed_time_str

                logger.log(
                    logging.INFO, 'Training step "%s" completed in %s' %
                    (method.id, elapsed_time_str))

                # Persist progress so a later --resume can pick up from here.
                self._schedule.step_completed(method.id)
                self._schedule.store(checkpoint_path)

            print '\n=========== TRAINING SUCCESS ===========\n'
            print 'You can now start, stop or check the status of the server with command:'
            print '\t./mmt start|stop|status ' + ('' if self._engine.name
                                                  == 'default' else '-e %s' %
                                                  self._engine.name)
            print

            if self._delete_on_exit:
                self._engine.clear_tempdir('training')
        except Exception:
            logger.exception('Unexpected exception')
            raise
        finally:
            log_stream.close()