Example #1
    def process(self,
                source,
                target,
                input_paths,
                output_path,
                data_path=None):
        args = ['-s', source, '-t', target, '--output', output_path, '--input']

        for root in input_paths:
            args.append(root)

        # When a data path is supplied, also pass the dev/test split folders
        if data_path is not None:
            args.append('--dev')
            args.append(
                os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
            args.append('--test')
            args.append(
                os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

        # Build and run the Java command, silencing stdin/stdout/stderr
        command = mmt_javamain(self._java_mainclass, args)
        shell.execute(command,
                      stdin=shell.DEVNULL,
                      stdout=shell.DEVNULL,
                      stderr=shell.DEVNULL)

        return ParallelCorpus.splitlist(source, target, roots=output_path)
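For illustration, a minimal sketch of how this `process` method might be called; the instance name, language codes, and paths below are hypothetical, not part of the original code:

    # Hypothetical usage: 'preprocessor' is assumed to be an instance of the
    # class that defines process(). The splitlist call at the end returns the
    # processed corpora as a (bilingual, monolingual) pair.
    bilingual, monolingual = preprocessor.process(
        'en', 'it',                       # source and target language codes
        ['/data/corpora/train'],          # input corpus roots
        '/data/corpora/preprocessed',     # output directory
        data_path='/data/engine')         # also emits --dev/--test splits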
Example #2
    def clean(self, source, target, input_paths, output_path):
        args = ['-s', source, '-t', target, '--output', output_path, '--input']

        for root in input_paths:
            args.append(root)

        command = mmt_javamain(self._java_mainclass, args)
        shell.execute(command,
                      stdin=shell.DEVNULL,
                      stdout=shell.DEVNULL,
                      stderr=shell.DEVNULL)

        # splitlist returns (bilingual, monolingual); keep only the bilingual corpora
        return ParallelCorpus.splitlist(source, target, roots=output_path)[0]
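A hypothetical call, following the same pattern as Example #1 (the instance name and paths are invented for illustration):

    # Hypothetical usage: 'cleaner' is assumed to be an instance of the class
    # that defines clean(); the result is the list of cleaned bilingual corpora.
    cleaned_bicorpora = cleaner.clean('en', 'it',
                                      ['/data/corpora/train'],
                                      '/data/corpora/clean_tms')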
Example #3
    def build(self, roots, debug=False, steps=None, split_trainingset=True):
        self._temp_dir = self._engine.get_tempdir('training', ensure=True)

        source_lang = self._engine.source_lang
        target_lang = self._engine.target_lang

        bilingual_corpora, monolingual_corpora = ParallelCorpus.splitlist(source_lang, target_lang, roots=roots)

        if len(bilingual_corpora) == 0:
            raise IllegalArgumentException(
                'your project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper()))

        if steps is None:
            steps = self._engine.training_steps
        else:
            unknown_steps = [step for step in steps if step not in self._engine.training_steps]
            if len(unknown_steps) > 0:
                raise IllegalArgumentException('Unknown training steps: ' + str(unknown_steps))

        cmdlogger = _builder_logger(len(steps) + 1)
        cmdlogger.start(self._engine, bilingual_corpora, monolingual_corpora)

        shutil.rmtree(self._engine.path, ignore_errors=True)
        os.makedirs(self._engine.path)

        # Check disk space constraints
        free_space_on_disk = fileutils.df(self._engine.path)[2]
        corpus_size_on_disk = 0
        for root in roots:
            corpus_size_on_disk += fileutils.du(root)
        free_memory = fileutils.free()

        recommended_mem = self.__GB * corpus_size_on_disk / (350 * self.__MB)  # 1G RAM every 350M on disk
        recommended_disk = 10 * corpus_size_on_disk

        if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
            if free_memory < recommended_mem:
                print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                      (recommended_mem / self.__GB, free_memory / self.__GB)
            if free_space_on_disk < recommended_disk:
                print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                      (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
            print

        try:
            corpora_roots = roots

            unprocessed_bicorpora = bilingual_corpora
            unprocessed_mocorpora = monolingual_corpora

            # TM cleanup
            if 'tm_cleanup' in steps:
                with cmdlogger.step('TMs clean-up') as _:
                    cleaned_output = self._get_tempdir('clean_tms')
                    self._engine.cleaner.clean(source_lang, target_lang, roots, cleaned_output)

                    for corpus in monolingual_corpora:
                        cfile = corpus.get_file(target_lang)
                        link = os.path.join(cleaned_output, os.path.basename(cfile))
                        os.symlink(cfile, link)

                    corpora_roots = [cleaned_output]
                    unprocessed_bicorpora, unprocessed_mocorpora = ParallelCorpus.splitlist(source_lang, target_lang,
                                                                                            roots=corpora_roots)

            # Preprocessing
            processed_bicorpora = unprocessed_bicorpora
            processed_mocorpora = unprocessed_mocorpora

            if 'preprocess' in steps:
                with cmdlogger.step('Corpora preprocessing') as _:
                    preprocessor_output = self._get_tempdir('preprocessed')
                    processed_bicorpora, processed_mocorpora = self._engine.training_preprocessor.process(
                        source_lang, target_lang, corpora_roots, preprocessor_output,
                        (self._engine.data_path if split_trainingset else None)
                    )

            # Training Context Analyzer
            if 'context_analyzer' in steps:
                with cmdlogger.step('Context Analyzer training') as _:
                    log_file = self._engine.get_logfile('training.context')
                    self._engine.analyzer.create_index(unprocessed_bicorpora, source_lang, log_file=log_file)

            # Training Adaptive Language Model (on the target side of all bilingual corpora)
            if 'lm' in steps:
                with cmdlogger.step('Language Model training') as _:
                    working_dir = self._get_tempdir('lm')
                    log_file = self._engine.get_logfile('training.lm')
                    self._engine.lm.train(processed_bicorpora + processed_mocorpora, target_lang, working_dir, log_file)

            # Training Translation Model
            if 'tm' in steps:
                with cmdlogger.step('Translation Model training') as _:
                    working_dir = self._get_tempdir('tm')
                    log_file = self._engine.get_logfile('training.tm')
                    self._engine.pt.train(processed_bicorpora, self._engine.aligner, working_dir, log_file)

            # Writing config file
            with cmdlogger.step('Writing config files') as _:
                self._engine.write_configs()

            cmdlogger.completed()
        finally:
            if not debug:
                self._engine.clear_tempdir('training')
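The resource check in this method encodes a simple heuristic: roughly 1 GB of RAM for every 350 MB of corpus on disk, and free disk space of ten times the corpus size. A quick sketch of that arithmetic, assuming the private __MB/__GB class constants are the usual byte multipliers:

    # Assumed byte multipliers, mirroring the private __MB/__GB class constants.
    MB = 1 << 20
    GB = 1 << 30

    corpus_size_on_disk = 700 * MB  # example: a 700 MB corpus
    recommended_mem = GB * corpus_size_on_disk // (350 * MB)  # -> 2 GB of RAM
    recommended_disk = 10 * corpus_size_on_disk               # -> 7000 MB (~6.8 GB) of disk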