예제 #1
0
File: engine.py — Project: argakon/MMT
    def build(self, corpora, working_dir='.', log=None):
        """Build the model from scratch on the concatenation of *corpora*.

        The model folder is wiped and recreated, all corpora are merged into a
        single parallel corpus under *working_dir*, and the external build
        binary is run on the merged files.

        :param corpora: iterable of BilingualCorpus to train on
        :param working_dir: scratch folder for the merged corpus
        :param log: file-like object for the build tool output (defaults to
            shell.DEVNULL, i.e. output is discarded)
        """
        log = shell.DEVNULL if log is None else log

        # Always start from an empty model folder.
        shutil.rmtree(self._model, ignore_errors=True)
        fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        langs = (self._source_lang, self._target_lang)
        merged = BilingualCorpus.make_parallel('merge', working_dir, langs)

        # Concatenate every corpus into one file per language (source first).
        for lang in langs:
            fileutils.merge([corpus.get_file(lang) for corpus in corpora],
                            merged.get_file(lang))

        command = [self._build_bin,
                   '-s', merged.get_file(self._source_lang),
                   '-t', merged.get_file(self._target_lang),
                   '-m', self._model,
                   '-I', '4']
        shell.execute(command, stdout=log, stderr=log)
예제 #2
0
    def train(self, corpora, aligner, working_dir='.', log=None):
        """Train the model using *aligner* to word-align the training corpora.

        Resets the model folder, symlinks each corpus into a private training
        folder, runs the aligner over it, exports the lexical model, and
        finally invokes the external build binary.

        :param corpora: iterable of BilingualCorpus to train on
        :param aligner: aligner object exposing align() and export()
        :param working_dir: scratch folder for symlinked corpora and lex model
        :param log: file-like object for subprocess output (default: discarded)
        """
        if log is None:
            log = shell.DEVNULL

        # Always start from an empty model folder.
        shutil.rmtree(self._model, ignore_errors=True)
        fileutils.makedirs(self._model, exist_ok=True)

        corpora_folder = os.path.join(working_dir, 'corpora')
        lex_model = os.path.join(working_dir, 'model.tlex')

        if not os.path.isdir(corpora_folder):
            fileutils.makedirs(corpora_folder, exist_ok=True)

        langs = (self._source_lang, self._target_lang)

        # Prepare training folder: symlink both sides of every corpus so the
        # aligner sees them all under one directory.
        train_corpora = []
        for corpus in corpora:
            linked = BilingualCorpus.make_parallel(corpus.name, corpora_folder, langs)
            for lang in langs:
                os.symlink(corpus.get_file(lang), linked.get_file(lang))
            train_corpora.append(linked)

        # Align corpora and export the lexical translation model.
        aligner.align(train_corpora, corpora_folder, log=log)
        aligner.export(lex_model)

        # Build models
        command = [self._build_bin,
                   '--lex', lex_model,
                   '--input', corpora_folder,
                   '--model', self._model,
                   '-s', self._source_lang,
                   '-t', self._target_lang,
                   '-v', self._vb.model]
        shell.execute(command, stdout=log, stderr=log)
예제 #3
0
    def encode(self, corpora, dest_folder):
        """Encode every file of every corpus into *dest_folder*.

        Each (corpus, language) file is passed through encode_file() with
        newline deletion enabled.

        :return: the list of corpora found in *dest_folder* after encoding
        """
        if not os.path.isdir(dest_folder):
            fileutils.makedirs(dest_folder, exist_ok=True)

        for corpus in corpora:
            for lang in corpus.langs:
                encoded = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])
                self.encode_file(corpus.get_file(lang), encoded.get_file(lang),
                                 delete_nl=True)

        return BilingualCorpus.list(dest_folder)
예제 #4
0
    def process_corpora(self, corpora, output_folder):
        """Run process_file() on every language file of every corpus.

        Output files mirror the input corpora structure under *output_folder*.

        :return: the list of corpora found in *output_folder* after processing
        """
        fileutils.makedirs(output_folder, exist_ok=True)

        for corpus in corpora:
            processed = BilingualCorpus.make_parallel(corpus.name, output_folder,
                                                      corpus.langs)
            for lang in corpus.langs:
                self.process_file(corpus.get_file(lang),
                                  processed.get_file(lang), lang)

        return BilingualCorpus.list(output_folder)
예제 #5
0
    def encode(self, corpora, dest_folder):
        """Encode every corpus into *dest_folder* and return the new corpora.

        Each language file is passed through encode_file() with newline
        deletion enabled.

        :return: list of the encoded BilingualCorpus objects, in input order
        """
        if not os.path.isdir(dest_folder):
            osutils.makedirs(dest_folder, exist_ok=True)

        encoded_corpora = []
        for corpus in corpora:
            encoded = BilingualCorpus.make_parallel(corpus.name, dest_folder,
                                                    corpus.langs)
            for lang in corpus.langs:
                self.encode_file(corpus.get_file(lang), encoded.get_file(lang),
                                 delete_nl=True)
            encoded_corpora.append(encoded)

        return encoded_corpora
예제 #6
0
File: cluster.py — Project: sadeghieh/MMT
    def phrase_based_tune(self, corpora, debug=False, listener=None,
                          context_enabled=True, random_seeds=False, max_iterations=25, early_stopping_value=None):
        """Tune the phrase-based decoder feature weights with MERT.

        Pipeline: filter corpora to the engine's language pair, tokenize the
        references, merge source and reference files into two parallel corpus
        files, run the Moses MERT script against the running API, then parse
        the optimized moses.ini and push the new feature weights to the API.

        :param corpora: candidate tuning corpora; only those containing both
            the engine's source and target language are used
        :param debug: if True, keep the temporary 'tuning' folder on exit
        :param listener: optional TuneListener receiving progress callbacks
        :param context_enabled: if False, pass '--skip-context-analysis 1'
            in the decoder flags
        :param random_seeds: if False, run MERT with '--predictable-seeds'
        :param max_iterations: cap on MERT iterations (<= 0 means uncapped)
        :param early_stopping_value: optional BLEU early-stopping threshold
        :raises IllegalArgumentException: if no corpus covers the pair
        """
        target_lang = self.engine.target_lang
        source_lang = self.engine.source_lang

        # Keep only corpora that cover the engine's language pair.
        corpora = [corpus for corpus in corpora if source_lang in corpus.langs and target_lang in corpus.langs]
        if len(corpora) == 0:
            raise IllegalArgumentException('No %s > %s corpora found into specified path' % (source_lang, target_lang))

        source_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [source_lang])
                          for corpus in corpora]
        reference_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [target_lang])
                             for corpus in corpora]

        if listener is None:
            listener = self.TuneListener()

        # 4 = number of steps reported below (tokenization, merging,
        # tuning, applying changes).
        listener.on_tuning_begin(corpora, self, 4)

        working_dir = self.engine.get_tempdir('tuning')
        mert_wd = os.path.join(working_dir, 'mert')

        try:
            # Tokenization of the reference side only (sources are fed to the
            # decoder untokenized).
            tokenizer = Tokenizer(source_lang=source_lang, target_lang=target_lang,
                                  print_placeholders=False, print_tags=False)
            tokenized_output = os.path.join(working_dir, 'reference_corpora')
            fileutils.makedirs(tokenized_output, exist_ok=True)

            with listener.step('Corpora tokenization') as _:
                reference_corpora = tokenizer.process_corpora(corpora=reference_corpora, output_folder=tokenized_output)

            # Create merged corpus
            with listener.step('Merging corpus') as _:
                # source: concatenate the CONTENT of all source files.
                # BUG FIX: the previous code wrote the file *paths* (one path
                # per corpus) into the merged file instead of merging the file
                # contents, so the source side could never be line-parallel
                # with the merged references below — which MERT requires.
                source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)
                fileutils.merge([corpus.get_file(source_lang) for corpus in source_corpora],
                                source_merged_corpus)

                # target
                target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
                fileutils.merge([corpus.get_file(target_lang) for corpus in reference_corpora], target_merged_corpus)

            # Run MERT algorithm
            with listener.step('Tuning') as _:
                # Start MERT: the decoder flags point the wrapper at the
                # running translation API.
                decoder_flags = ['--port', str(self.api.port), '--source', source_lang, '--target', target_lang]

                if self.api.root is not None:
                    decoder_flags += ['--root', self.api.root]

                if not context_enabled:
                    decoder_flags.append('--skip-context-analysis')
                    decoder_flags.append('1')

                fileutils.makedirs(mert_wd, exist_ok=True)

                with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                    command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                               self._mert_i_script, runtime_moses_ini.name, '--threads',
                               str(multiprocessing.cpu_count()), '--mertdir', cli.BIN_DIR,
                               '--mertargs', '\'--binary --sctype BLEU\'', '--working-dir', mert_wd, '--nbest', '100',
                               '--decoder-flags', '"' + ' '.join(decoder_flags) + '"', '--nonorm', '--closest',
                               '--no-filter-phrase-table']

                    if early_stopping_value is not None:
                        command += ['--bleuscorer', self._scorer_script,
                                    '--bleuscorer-flags "-nt" --early-stopping-value %d' % early_stopping_value]

                    if not random_seeds:
                        command.append('--predictable-seeds')
                    if max_iterations > 0:
                        command.append('--maximum-iterations={num}'.format(num=max_iterations))

                    # The command is joined and run through the shell because
                    # several arguments carry embedded quoting.
                    with open(self.engine.get_logfile('mert'), 'wb') as log:
                        shell.execute(' '.join(command), stdout=log, stderr=log)

            # Read optimized configuration: scan mert's moses.ini for the
            # '# BLEU <score>' comment and the '[weight]' section.
            with listener.step('Applying changes') as _:
                bleu_score = 0
                weights = {}
                found_weights = False

                with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                    for line in moses_ini:
                        line = line.strip()

                        if len(line) == 0:
                            continue
                        elif found_weights:
                            # Lines after [weight] look like 'Name= 0.1 0.2 …'
                            tokens = line.split()
                            weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                        elif line.startswith('# BLEU'):
                            bleu_score = float(line.split()[2])
                        elif line == '[weight]':
                            found_weights = True

                _ = self.api.update_features(weights)

            listener.on_tuning_end(self, bleu_score)
        finally:
            # Keep the scratch folder only when debugging.
            if not debug:
                self.engine.clear_tempdir("tuning")