Ejemplo n.º 1
0
    def process(self,
                source,
                target,
                input_paths,
                output_path,
                data_path=None):
        """Run the configured Java main class over ``input_paths``.

        The command line is ``-s source -t target --output output_path
        --input <paths...>``. When ``data_path`` is given, ``--dev`` and
        ``--test`` are appended, pointing at the dev/test folders below it.
        The child's stdin, stdout and stderr are all sent to ``shell.DEVNULL``.

        :param source: value passed with ``-s``
        :param target: value passed with ``-t``
        :param input_paths: iterable of values listed after ``--input``
        :param output_path: value passed with ``--output``
        :param data_path: optional parent folder of the dev and test folders
        :return: whatever ``ParallelCorpus.splitlist`` yields for ``output_path``
        """
        cli_args = ['-s', source, '-t', target, '--output', output_path, '--input']
        cli_args.extend(input_paths)

        if data_path is not None:
            dev_dir = os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME)
            test_dir = os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME)
            cli_args.extend(['--dev', dev_dir, '--test', test_dir])

        shell.execute(mmt_javamain(self._java_mainclass, cli_args),
                      stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

        return ParallelCorpus.splitlist(source, target, roots=output_path)
Ejemplo n.º 2
0
    def _clean_file(self, source, dest, langs):
        """Run the Perl cleaner script on one corpus, writing next to ``dest``.

        The script receives the extension-less path of the ``langs[0]`` file of
        ``source`` and of ``dest``, the two language codes, and the configured
        ratio / min / max values. Its stdout and stderr are discarded.

        :param source: corpus object exposing ``get_file(lang)``
        :param dest: corpus object exposing ``root`` and ``get_file(lang)``
        :param langs: sequence holding at least two language codes
        """
        # Make sure the destination folder exists before the script writes to it.
        if not os.path.isdir(dest.root):
            fileutils.makedirs(dest.root, exist_ok=True)

        # Only the path stem is handed over; the language codes are passed separately.
        input_prefix, _ = os.path.splitext(source.get_file(langs[0]))
        output_prefix, _ = os.path.splitext(dest.get_file(langs[0]))

        cleaner_cmd = ['perl', self._cleaner_script, '-ratio', str(self._ratio)]
        cleaner_cmd += [input_prefix, langs[0], langs[1], output_prefix]
        cleaner_cmd += [str(self._min), str(self._max)]

        shell.execute(cleaner_cmd, stdout=shell.DEVNULL, stderr=shell.DEVNULL)
Ejemplo n.º 3
0
    def align(self, corpus, langs, model_dir, working_dir='.', log_file=None):
        """Align ``corpus`` in both directions and encode the result as a BAL file.

        Steps (every file is written inside ``working_dir``):

        1. the two sides of the corpus are joined line by line as
           ``source ||| target`` into ``corpus.<l1>-<l2>.aligned``;
        2. ``self._align_bin`` is run twice on that file, once plain and once
           with ``-r``, producing the ``.fwd`` and ``.bwd`` files;
        3. ``_FastAlignBALEncoder`` turns both outputs into the ``.bal`` file.

        :param corpus: object exposing ``get_file(lang)``
        :param langs: sequence whose first two items are the language codes
        :param model_dir: folder holding the ``model.align.fwd`` / ``model.align.bwd``
            paths handed to the binary through ``-p``
        :param working_dir: folder for intermediate and output files
        :param log_file: optional path; the binary's stderr is appended to it
        :return: path of the generated ``.bal`` file
        """
        WordAligner.align(self, corpus, langs, working_dir, log_file)

        l1 = langs[0]
        l2 = langs[1]

        # All artefacts share the same "<working_dir>/corpus.<l1>-<l2>" stem.
        prefix = os.path.join(working_dir, 'corpus.' + l1 + '-' + l2)
        fwd_file = prefix + '.fwd'
        bwd_file = prefix + '.bwd'
        bal_file = prefix + '.bal'
        aligned_file_path = prefix + '.aligned'

        corpus_l1 = corpus.get_file(l1)
        corpus_l2 = corpus.get_file(l2)

        log = shell.DEVNULL
        # Set only after open() succeeds: if it raises, `log` is still the
        # shared shell.DEVNULL object, which must not be closed from here.
        log_opened = False

        try:
            if log_file is not None:
                log = open(log_file, 'a')
                log_opened = True

            with open(corpus_l1) as source_corpus, \
                    open(corpus_l2) as target_corpus, \
                    open(aligned_file_path, 'w') as aligned_file:
                # NOTE: zip() stops at the shorter input, so extra lines on one
                # side are dropped without any warning.
                for x, y in zip(source_corpus, target_corpus):
                    aligned_file.write(x.strip() + ' ||| ' + y.strip() + '\n')

            cpus = multiprocessing.cpu_count()
            env = os.environ.copy()
            env['LD_LIBRARY_PATH'] = scripts.LIB_DIR

            # Options shared by both directions; the model path follows '-p'.
            common = [self._align_bin, '-d', '-v', '-o', '-n', str(cpus), '-B', '-p']

            # Forward alignments
            fwd_model = os.path.join(model_dir, 'model.align.fwd')
            command = common + [fwd_model, '-i', aligned_file_path]
            with open(fwd_file, 'w') as stdout:
                shell.execute(command, stdout=stdout, stderr=log, env=env)

            # Backward alignments ('-r' is the only difference)
            bwd_model = os.path.join(model_dir, 'model.align.bwd')
            command = common + [bwd_model, '-r', '-i', aligned_file_path]
            with open(bwd_file, 'w') as stdout:
                shell.execute(command, stdout=stdout, stderr=log, env=env)

        finally:
            if log_opened:
                log.close()

        encoder = _FastAlignBALEncoder(corpus, langs, fwd_file, bwd_file)
        encoder.encode(bal_file)

        return bal_file
Ejemplo n.º 4
0
    def clean(self, source, target, input_paths, output_path):
        """Run the configured Java main class over ``input_paths``.

        The command line is ``-s source -t target --output output_path
        --input <paths...>``; the child's stdin, stdout and stderr are all
        sent to ``shell.DEVNULL``.

        :param source: value passed with ``-s``
        :param target: value passed with ``-t``
        :param input_paths: iterable of values listed after ``--input``
        :param output_path: value passed with ``--output``
        :return: first item of ``ParallelCorpus.splitlist`` for ``output_path``
        """
        cli_args = ['-s', source, '-t', target, '--output', output_path, '--input']
        cli_args.extend(input_paths)

        shell.execute(mmt_javamain(self._java_mainclass, cli_args),
                      stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

        split = ParallelCorpus.splitlist(source, target, roots=output_path)
        return split[0]
Ejemplo n.º 5
0
    def calculate(self, document, reference):
        """Score ``document`` against ``reference`` with ``opt/mmt-bleu.perl``.

        The Perl script located in the ``opt`` folder next to this module gets
        ``reference`` as its argument and the content of ``document`` on stdin.

        :param document: path of the file streamed to the script
        :param reference: value passed as the script's single argument
        :return: the script's captured output converted with ``float``
        """
        scorer = os.path.abspath(
            os.path.join(__file__, os.pardir, 'opt', 'mmt-bleu.perl'))

        with open(document) as hypothesis:
            score_text, _ = shell.execute(['perl', scorer, reference],
                                          stdin=hypothesis)

        return float(score_text)
Ejemplo n.º 6
0
    def __process_file(self,
                       source,
                       dest,
                       lang,
                       print_tags=True,
                       print_placeholders=False,
                       original_spacing=False):
        """Pipe ``source`` through the command for ``lang`` and write ``dest``.

        The command comes from ``self.__get_command``; it reads ``source`` on
        stdin, writes to ``dest`` on stdout, and its stderr is discarded.
        The folder that will contain ``dest`` is created when missing.

        :param source: path of the input file
        :param dest: path of the output file
        :param lang: forwarded to ``__get_command``
        :param print_tags: forwarded to ``__get_command``
        :param print_placeholders: forwarded to ``__get_command``
        :param original_spacing: forwarded to ``__get_command``
        """
        pipeline = self.__get_command(lang, print_tags, print_placeholders,
                                      original_spacing)

        target_dir = os.path.abspath(os.path.join(dest, os.pardir))
        if not os.path.isdir(target_dir):
            fileutils.makedirs(target_dir, exist_ok=True)

        with open(source) as reader, open(dest, 'w') as writer:
            shell.execute(pipeline,
                          stdin=reader,
                          stdout=writer,
                          stderr=shell.DEVNULL)
Ejemplo n.º 7
0
Archivo: lm.py Proyecto: FrancescoE/MMT
    def train(self, corpora, lang, working_dir=".", log_file=None):
        """Build an ARPA language model from ``corpora`` and binarize it.

        The ``lang`` file of every corpus is merged into ``<working_dir>/merge``.
        That file is streamed to ``self._lmplz_bin``, whose stdout becomes
        ``<working_dir>/lm.arpa``. ``self._bbinary_bin`` is then run with the
        ARPA file and ``self._model`` as arguments.

        :param corpora: iterable of objects exposing ``get_file(lang)``
        :param lang: language whose files are merged
        :param working_dir: folder for intermediate files
        :param log_file: ``None``, a path given as ``str`` (truncated, then
            closed on exit), or any other object, which is used directly as
            the log stream and left open
        """
        LanguageModel.train(self, corpora, lang, working_dir, log_file)

        log = shell.DEVNULL
        # True only once this call has opened the log itself. If open() raises,
        # `log` is still the shared shell.DEVNULL object and must not be closed.
        owns_log = False

        try:
            if log_file is not None:
                if isinstance(log_file, str):
                    log = open(log_file, "w")
                    owns_log = True
                else:
                    log = log_file

            # Collapse all corpora into a single text file
            merged_corpus = os.path.join(working_dir, "merge")
            fileutils.merge([corpus.get_file(lang) for corpus in corpora], merged_corpus)

            # Create language model in ARPA format
            arpa_file = os.path.join(working_dir, "lm.arpa")
            arpa_command = [
                self._lmplz_bin,
                "--discount_fallback",
                "-o",
                str(self._order),
                "-S",
                str(self.get_mem_percent()) + "%",
                "-T",
                working_dir,
            ]
            # Pruning is requested only for orders above 2.
            if self._order > 2 and self.prune:
                arpa_command += ["--prune", "0", "0", "1"]

            with open(merged_corpus) as stdin:
                with open(arpa_file, "w") as stdout:
                    shell.execute(arpa_command, stdin=stdin, stdout=stdout, stderr=log)

            # Binarize ARPA file
            binarize_command = [self._bbinary_bin, arpa_file, self._model]
            shell.execute(binarize_command, stdout=log, stderr=log)
        finally:
            if owns_log:
                log.close()
Ejemplo n.º 8
0
    def train(self, corpora, lang, working_dir='.', log_file=None):
        """Train the language model: merge corpora, build ARPA, binarize.

        1. The ``lang`` file of each corpus is merged into ``<working_dir>/merge``.
        2. ``self._lmplz_bin`` reads that file on stdin and its stdout is saved
           as ``<working_dir>/lm.arpa``.
        3. ``self._bbinary_bin`` is run with the ARPA file and ``self._model``.

        :param corpora: iterable of objects exposing ``get_file(lang)``
        :param lang: language whose files are merged
        :param working_dir: folder for intermediate files
        :param log_file: ``None``, a ``str`` path (opened in ``'w'`` mode and
            closed here), or an already usable stream that is left open
        """
        LanguageModel.train(self, corpora, lang, working_dir, log_file)

        log = shell.DEVNULL
        # Only a log opened by this call may be closed in `finally`; when
        # open() fails, `log` still refers to the shared shell.DEVNULL.
        close_log = False

        try:
            if log_file is not None:
                if isinstance(log_file, str):
                    log = open(log_file, 'w')
                    close_log = True
                else:
                    log = log_file

            # Collapse all corpora into a single text file
            merged_corpus = os.path.join(working_dir, 'merge')
            fileutils.merge([corpus.get_file(lang) for corpus in corpora],
                            merged_corpus)

            # Create language model in ARPA format
            arpa_file = os.path.join(working_dir, 'lm.arpa')
            arpa_command = [
                self._lmplz_bin, '--discount_fallback', '-o',
                str(self._order), '-S',
                str(self.get_mem_percent()) + '%', '-T', working_dir
            ]
            # Pruning is requested only for orders above 2.
            if self._order > 2 and self.prune:
                arpa_command += ['--prune', '0', '0', '1']

            with open(merged_corpus) as stdin:
                with open(arpa_file, 'w') as stdout:
                    shell.execute(arpa_command,
                                  stdin=stdin,
                                  stdout=stdout,
                                  stderr=log)

            # Binarize ARPA file
            binarize_command = [self._bbinary_bin, arpa_file, self._model]
            shell.execute(binarize_command, stdout=log, stderr=log)
        finally:
            if close_log:
                log.close()
Ejemplo n.º 9
0
    def create_index(self, corpora, lang, log_file=None):
        """Run the Java indexer over the root folders of ``corpora``.

        The distinct ``root`` values of the corpora are passed after ``-c``;
        ``self._index`` (created when missing) is passed with ``-i`` and
        ``lang`` with ``-l``.

        :param corpora: iterable of objects exposing ``root``
        :param lang: value passed with ``-l``
        :param log_file: optional path; truncated, then receives the child's
            stdout and stderr
        """
        # A set removes duplicates when several corpora share one folder.
        source_paths = {corpus.root for corpus in corpora}

        fileutils.makedirs(self._index, exist_ok=True)

        args = ['-l', lang, '-i', self._index, '-c']
        args.extend(source_paths)

        command = mmt_javamain(self._java_mainclass, args)

        log = shell.DEVNULL
        # Set only after open() succeeds: on failure `log` is still the shared
        # shell.DEVNULL object, which must not be closed from here.
        log_opened = False

        try:
            if log_file is not None:
                log = open(log_file, 'w')
                log_opened = True

            shell.execute(command, stdout=log, stderr=log)
        finally:
            if log_opened:
                log.close()
Ejemplo n.º 10
0
Archivo: lm.py Proyecto: FrancescoE/MMT
    def train(self, corpora, lang, working_dir=".", log_file=None):
        """Build the language model from ``corpora`` in three tool invocations.

        1. The ``lang`` file of each corpus is merged into ``<working_dir>/merge``
           and piped through ``self._addbound_bin`` into ``static_input.se``.
        2. ``self._buildlm_bin`` is run on that file with ``static_lm.arpa`` as
           its ``-o`` argument.
        3. ``self._compilelm_bin`` is run with ``static_lm.arpa.gz`` and
           ``self._model``.

        :param corpora: iterable of objects exposing ``get_file(lang)``
        :param lang: language whose files are merged
        :param working_dir: folder for intermediate files
        :param log_file: optional path; truncated, then receives the tools' stderr
        """
        LanguageModel.train(self, corpora, lang, working_dir, log_file)

        log = shell.DEVNULL
        # Set only after open() succeeds: on failure `log` is still the shared
        # shell.DEVNULL object, which must not be closed from here.
        log_opened = False

        try:
            if log_file is not None:
                log = open(log_file, "w")
                log_opened = True

            # Collapse all corpora into a single text file
            merged_corpus = os.path.join(working_dir, "merge")
            fileutils.merge([corpus.get_file(lang) for corpus in corpora], merged_corpus)
            input_se = os.path.join(working_dir, "static_input.se")
            temp = os.path.join(working_dir, "temp")
            arpa_file = os.path.join(working_dir, "static_lm.arpa")

            # Add start and end symbols
            with open(merged_corpus) as stdin:
                with open(input_se, "w") as stdout:
                    shell.execute([self._addbound_bin], stdin=stdin, stdout=stdout, stderr=log)

            # Creating lm in ARPA format
            command = [
                self._buildlm_bin,
                "-i",
                input_se,
                "-k",
                str(cpu_count()),
                "-o",
                arpa_file,
                "-n",
                str(self._order),
                "-s",
                "witten-bell",
                "-t",
                temp,
                "-l",
                "/dev/stdout",
                "-irstlm",
                self._irstlm_dir,
                "--PruneSingletons",
            ]
            shell.execute(command, stderr=log)

            # Create binary lm (the next step reads the ARPA path with '.gz' appended)
            command = [self._compilelm_bin, arpa_file + ".gz", self._model]
            shell.execute(command, stderr=log)

        finally:
            if log_opened:
                log.close()
Ejemplo n.º 11
0
    def train(self, corpora, lang, working_dir='.', log_file=None):
        """Train the language model: merge, add boundaries, build, compile.

        The ``lang`` file of each corpus is merged into ``<working_dir>/merge``
        and piped through ``self._addbound_bin`` into ``static_input.se``.
        ``self._buildlm_bin`` is then run with ``static_lm.arpa`` as its ``-o``
        argument, and ``self._compilelm_bin`` is run with ``static_lm.arpa.gz``
        and ``self._model``.

        :param corpora: iterable of objects exposing ``get_file(lang)``
        :param lang: language whose files are merged
        :param working_dir: folder for intermediate files
        :param log_file: optional path; truncated, then receives the tools' stderr
        """
        LanguageModel.train(self, corpora, lang, working_dir, log_file)

        log = shell.DEVNULL
        # Only a log opened by this call may be closed in `finally`; when
        # open() fails, `log` still refers to the shared shell.DEVNULL.
        close_log = False

        try:
            if log_file is not None:
                log = open(log_file, 'w')
                close_log = True

            # Collapse all corpora into a single text file
            merged_corpus = os.path.join(working_dir, 'merge')
            fileutils.merge([corpus.get_file(lang) for corpus in corpora],
                            merged_corpus)
            input_se = os.path.join(working_dir, 'static_input.se')
            temp = os.path.join(working_dir, 'temp')
            arpa_file = os.path.join(working_dir, 'static_lm.arpa')

            # Add start and end symbols
            with open(merged_corpus) as stdin:
                with open(input_se, 'w') as stdout:
                    shell.execute([self._addbound_bin],
                                  stdin=stdin,
                                  stdout=stdout,
                                  stderr=log)

            # Creating lm in ARPA format
            command = [
                self._buildlm_bin, '-i', input_se, '-k',
                str(cpu_count()), '-o', arpa_file, '-n',
                str(self._order), '-s', 'witten-bell', '-t', temp, '-l',
                '/dev/stdout', '-irstlm', self._irstlm_dir, '--PruneSingletons'
            ]
            shell.execute(command, stderr=log)

            # Create binary lm (the next step reads the ARPA path with '.gz' appended)
            command = [self._compilelm_bin, arpa_file + '.gz', self._model]
            shell.execute(command, stderr=log)

        finally:
            if close_log:
                log.close()
Ejemplo n.º 12
0
    def align(self, corpus, langs, model_dir, working_dir='.', log_file=None):
        """Align ``corpus`` in both directions and write a BAL file.

        Pipeline (every file is written inside ``working_dir``):

        1. ``self._plain2snt_bin`` converts both corpus sides into vocabulary
           (``.vcb``) and sentence (``.snt``) files;
        2. ``self._snt2cooc_bin`` builds one co-occurrence file per direction;
        3. for each direction a config file is rendered from the class
           template, ``self._mgiza_bin`` is run on it, and the per-part
           ``.A3.final.partNNN`` outputs are merged with ``self._merge_bin``;
        4. ``self._giza2bal_bin`` combines both merged files into ``corpus.bal``.

        ``model_dir`` is accepted but not used by this implementation.

        :param corpus: object exposing ``get_file(lang)``
        :param langs: sequence whose first two items are the language codes
        :param model_dir: unused here
        :param working_dir: folder for intermediate and output files
        :param log_file: optional path; tool output is appended to it
        :return: path of the generated ``.bal`` file
        """
        WordAligner.align(self, corpus, langs, working_dir, log_file)

        l1 = langs[0]
        l2 = langs[1]

        corpus_name = 'corpus'

        vcb1_file = os.path.join(working_dir, corpus_name + '.' + l1 + '.vcb')
        vcb2_file = os.path.join(working_dir, corpus_name + '.' + l2 + '.vcb')
        snt12_file = os.path.join(working_dir, corpus_name + '.' + l1 + '_' + l2 + '.snt')
        snt21_file = os.path.join(working_dir, corpus_name + '.' + l2 + '_' + l1 + '.snt')
        cooc12_file = os.path.join(working_dir, corpus_name + '.' + l1 + '_' + l2 + '.cooc')
        cooc21_file = os.path.join(working_dir, corpus_name + '.' + l2 + '_' + l1 + '.cooc')
        fwdc_file = os.path.join(working_dir, 'fwd.config')
        bwdc_file = os.path.join(working_dir, 'bwd.config')
        fwddict_file = os.path.join(working_dir, corpus_name + '.fwd.dict')
        bwddict_file = os.path.join(working_dir, corpus_name + '.bwd.dict')
        fwd_file = os.path.join(working_dir, corpus_name + '.fwd')
        bwd_file = os.path.join(working_dir, corpus_name + '.bwd')
        bal_file = os.path.join(working_dir, corpus_name + '.bal')

        corpus_l1 = corpus.get_file(l1)
        corpus_l2 = corpus.get_file(l2)

        log = shell.DEVNULL
        # Set only after open() succeeds: on failure `log` is still the shared
        # shell.DEVNULL object, which must not be closed from here.
        log_opened = False

        try:
            # Never fewer than 2, even on a single-core machine.
            ncpus = max(2, multiprocessing.cpu_count())

            if log_file is not None:
                log = open(log_file, 'a')
                log_opened = True

            # Translate the corpora into GIZA format
            command = [self._plain2snt_bin, corpus_l1, corpus_l2, '-vcb1', vcb1_file, '-vcb2', vcb2_file, '-snt1',
                       snt12_file, '-snt2', snt21_file]
            shell.execute(command, stdout=log, stderr=log)

            # Create the cooccurence
            command = [self._snt2cooc_bin, cooc12_file, vcb1_file, vcb2_file, snt12_file]
            shell.execute(command, stdout=log, stderr=log)

            command = [self._snt2cooc_bin, cooc21_file, vcb2_file, vcb1_file, snt21_file]
            shell.execute(command, stdout=log, stderr=log)

            # Forward alignments
            with open(fwdc_file, 'w') as config:
                config.write(self.__mgiza_config_template.format(
                    coocurrencefile=cooc12_file,
                    corpusfile=snt12_file,
                    outputfileprefix=fwddict_file,
                    sourcevocabularyfile=vcb1_file,
                    targetvocabularyfile=vcb2_file,
                    ncpus=ncpus
                ))
            command = [self._mgiza_bin, fwdc_file]
            shell.execute(command, stdout=log, stderr=log)

            # One part file per configured cpu is expected and merged.
            parts = [fwddict_file + '.A3.final.part{part:03d}'.format(part=part) for part in range(ncpus)]
            command = ['python', self._merge_bin] + parts
            with open(fwd_file, 'w') as stdout:
                shell.execute(command, stdout=stdout, stderr=log)

            # Backward alignments (vocabularies and sentence files swapped)
            with open(bwdc_file, 'w') as config:
                config.write(self.__mgiza_config_template.format(
                    coocurrencefile=cooc21_file,
                    corpusfile=snt21_file,
                    outputfileprefix=bwddict_file,
                    sourcevocabularyfile=vcb2_file,
                    targetvocabularyfile=vcb1_file,
                    ncpus=ncpus
                ))
            command = [self._mgiza_bin, bwdc_file]
            shell.execute(command, stdout=log, stderr=log)

            parts = [bwddict_file + '.A3.final.part{part:03d}'.format(part=part) for part in range(ncpus)]
            command = ['python', self._merge_bin] + parts
            with open(bwd_file, 'w') as stdout:
                shell.execute(command, stdout=stdout, stderr=log)

            # Create BAL file
            command = [self._giza2bal_bin, '-i', bwd_file, '-d', fwd_file]
            with open(bal_file, 'w') as stdout:
                shell.execute(command, stdout=stdout, stderr=log)
        finally:
            if log_opened:
                log.close()

        return bal_file
Ejemplo n.º 13
0
    def train(self, corpora, aligner, working_dir='.', log_file=None):
        """Train the model files from ``corpora`` using ``aligner``.

        Steps:

        1. clean the corpora with ``self._cleaner`` into ``working_dir/clean_corpora``;
        2. merge both language sides into a single corpus and write the
           ``.dmp`` file (one ``<name> <line count>`` row per corpus);
        3. align the merged corpus and pipe the result through ``self._symal_bin``;
        4. run the mtt-build command once per language on the merged files;
        5. create the ``.mam`` file from the symmetrized alignments and the
           ``.lex`` file with ``self._mmlexbuild_bin``.

        :param corpora: corpora handed to the cleaner
        :param aligner: object exposing ``align(corpus, langs, model_dir,
            working_dir, log_file)`` and returning the alignment file path
        :param working_dir: folder for intermediate files (created if missing)
        :param log_file: optional path; tool output is appended to it
        :raises Exception: when ``self._model`` is an existing non-empty folder
        """
        if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
            raise Exception('Model already exists at ' + self._model)

        if not os.path.isdir(self._model):
            fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        l1 = self._source_lang
        l2 = self._target_lang
        langs = (l1, l2)
        langs_suffix = l1 + '-' + l2

        mct_base = self._get_model_basename()
        dmp_file = mct_base + '.dmp'
        mam_file = mct_base + '.' + langs_suffix + '.mam'
        lex_file = mct_base + '.' + langs_suffix + '.lex'

        log = shell.DEVNULL
        # Set only after open() succeeds: on failure `log` is still the shared
        # shell.DEVNULL object, which must not be closed from here.
        log_opened = False

        try:
            if log_file is not None:
                log = open(log_file, 'a')
                log_opened = True

            # Clean corpus for training
            clean_output = os.path.join(working_dir, 'clean_corpora')
            fileutils.makedirs(clean_output, exist_ok=True)
            corpora = self._cleaner.clean(corpora, clean_output, langs)

            # Create merged corpus and domains list file (dmp)
            merged_corpus = ParallelCorpus(os.path.basename(mct_base), working_dir, langs)

            fileutils.merge([corpus.get_file(l1) for corpus in corpora], merged_corpus.get_file(l1))
            fileutils.merge([corpus.get_file(l2) for corpus in corpora], merged_corpus.get_file(l2))
            with open(dmp_file, 'w') as dmp:
                for corpus in corpora:
                    dmp.write(str(corpus.name) + ' ' + str(corpus.count_lines()) + '\n')

            # Create alignments in 'bal' file and symmetrize.
            # The aligner receives the log *path*, not the stream opened above.
            bal_file = aligner.align(merged_corpus, langs, self._model, working_dir, log_file)

            symal_file = os.path.join(working_dir, 'alignments.' + langs_suffix + '.symal')
            symal_command = [self._symal_bin, '-a=g', '-d=yes', '-f=yes', '-b=yes']
            with open(bal_file) as stdin:
                with open(symal_file, 'w') as stdout:
                    shell.execute(symal_command, stdin=stdin, stdout=stdout, stderr=log)

            # Execute mtt-build, once per language side
            for lang in langs:
                mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, lang)
                with open(merged_corpus.get_file(lang)) as stdin:
                    shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

            # Create 'mam' file
            mam_command = [self._symal2mam_bin, mam_file]
            with open(symal_file) as stdin:
                shell.execute(mam_command, stdin=stdin, stdout=log, stderr=log)

            # Create 'lex' file
            lex_command = [self._mmlexbuild_bin, mct_base + '.', l1, l2, '-o', lex_file]
            shell.execute(lex_command, stdout=log, stderr=log)
        finally:
            if log_opened:
                log.close()
Ejemplo n.º 14
0
    def tune(self, corpora=None, tokenize=True, debug=False, context_enabled=True):
        """Tune the feature weights with the MERT script and apply them.

        The corpora are optionally tokenized, a source listing and a merged
        target file are written to the engine's ``tuning`` temp folder, the
        MERT script is run through the shell, and the ``[weight]`` section of
        the resulting ``moses.ini`` is sent to ``self.api.update_features``.

        :param corpora: corpora to tune on; defaults to those listed under the
            engine's dev folder
        :param tokenize: run the engine preprocessor on the corpora first
        :param debug: when true, the temp folder is kept on exit
        :param context_enabled: when false, ``--skip-context-analysis 1`` is
            added to the decoder flags
        :raises IllegalArgumentException: if ``corpora`` is empty
        :raises IllegalStateException: if ``self.is_running()`` is false
        """
        if corpora is None:
            dev_dir = os.path.join(self.engine.data_path, TrainingPreprocessor.DEV_FOLDER_NAME)
            corpora = ParallelCorpus.list(dev_dir)

        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')

        if not self.is_running():
            raise IllegalStateException('No MMT Server running, start the engine first')

        tgt = self.engine.target_lang
        src = self.engine.source_lang

        # Tokenization adds one step to the three that always run.
        total_steps = 4 if tokenize else 3
        cmdlogger = _tuning_logger(total_steps)
        cmdlogger.start(self, corpora)

        tuning_dir = self.engine.get_tempdir('tuning')
        mert_wd = os.path.join(tuning_dir, 'mert')

        try:
            # Without tokenization the input corpora are used as they are.
            tokenized_corpora = corpora

            if tokenize:
                tokenizer_output = os.path.join(tuning_dir, 'tokenized_corpora')
                fileutils.makedirs(tokenizer_output, exist_ok=True)

                with cmdlogger.step('Corpus tokenization'):
                    tokenized_corpora = self.engine.preprocessor.process(
                        corpora, tokenizer_output,
                        print_tags=False, print_placeholders=True, original_spacing=False)

            with cmdlogger.step('Merging corpus'):
                # The "source corpus" is a listing of '<tokenized>:<original>' path pairs.
                source_listing = os.path.join(tuning_dir, 'corpus.' + src)
                with open(source_listing, 'wb') as listing:
                    # Original files are all looked up under the first corpus' root.
                    original_root = corpora[0].root

                    for corpus in tokenized_corpora:
                        tokenized_path = corpus.get_file(src)
                        original_path = os.path.join(original_root, corpus.name + '.' + src)
                        listing.write(tokenized_path + ':' + original_path + '\n')

                target_merged = os.path.join(tuning_dir, 'corpus.' + tgt)
                target_files = [corpus.get_file(tgt) for corpus in tokenized_corpora]
                fileutils.merge(target_files, target_merged)

            with cmdlogger.step('Tuning'):
                decoder_flags = ['--port', str(self.api.port)]
                if not context_enabled:
                    decoder_flags += ['--skip-context-analysis', '1']

                fileutils.makedirs(mert_wd, exist_ok=True)

                with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                    # The pieces are joined into one shell string, hence the
                    # literal quotes around multi-word option values.
                    mert_args = [self._mert_script, source_listing, target_merged,
                                 self._mert_i_script, runtime_moses_ini.name]
                    mert_args += ['--threads', str(multiprocessing.cpu_count())]
                    mert_args += ['--mertdir', os.path.join(Moses.bin_path, 'bin')]
                    mert_args += ['--mertargs', '\'--binary --sctype BLEU\'']
                    mert_args += ['--working-dir', mert_wd, '--nbest', '100']
                    mert_args += ['--decoder-flags', '"' + ' '.join(decoder_flags) + '"']
                    mert_args += ['--nonorm', '--closest', '--no-filter-phrase-table']

                    with open(self.engine.get_logfile('mert'), 'wb') as mert_log:
                        shell.execute(' '.join(mert_args), stdout=mert_log, stderr=mert_log)

            with cmdlogger.step('Applying changes'):
                bleu_score = 0
                weights = {}
                in_weights = False

                with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                    for raw_line in moses_ini:
                        entry = raw_line.strip()
                        if not entry:
                            continue

                        if in_weights:
                            # '<name>= v1 v2 ...' -> weights[name] = [v1, v2, ...]
                            fields = entry.split()
                            weights[fields[0].rstrip('=')] = [float(val) for val in fields[1:]]
                            continue

                        if entry.startswith('# BLEU'):
                            bleu_score = float(entry.split()[2])
                        elif entry == '[weight]':
                            in_weights = True

                self.api.update_features(weights)

            cmdlogger.completed(bleu_score)
        finally:
            if not debug:
                self.engine.clear_tempdir()
Ejemplo n.º 15
0
    def tune(self,
             corpora=None,
             tokenize=True,
             debug=False,
             context_enabled=True):
        """Run MERT tuning on ``corpora`` and push the new weights to the API.

        Four logged phases: optional tokenization, building the source listing
        and merged target file, running the MERT script through the shell, and
        reading ``moses.ini`` back to call ``self.api.update_features``.

        :param corpora: corpora to tune on; when ``None`` they are listed from
            the engine's dev folder
        :param tokenize: whether to preprocess the corpora before tuning
        :param debug: keep the engine temp folder when true
        :param context_enabled: when false the decoder gets
            ``--skip-context-analysis 1``
        :raises IllegalArgumentException: if there are no corpora
        :raises IllegalStateException: if the server is not running
        """
        engine = self.engine

        if corpora is None:
            corpora = ParallelCorpus.list(
                os.path.join(engine.data_path,
                             TrainingPreprocessor.DEV_FOLDER_NAME))

        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')

        if not self.is_running():
            raise IllegalStateException(
                'No MMT Server running, start the engine first')

        target_code = self.engine.target_lang
        source_code = self.engine.source_lang

        cmdlogger = _tuning_logger(4 if tokenize else 3)
        cmdlogger.start(self, corpora)

        workspace = self.engine.get_tempdir('tuning')
        mert_wd = os.path.join(workspace, 'mert')

        try:
            if tokenize:
                tokens_dir = os.path.join(workspace, 'tokenized_corpora')
                fileutils.makedirs(tokens_dir, exist_ok=True)

                with cmdlogger.step('Corpus tokenization'):
                    prepared = self.engine.preprocessor.process(
                        corpora,
                        tokens_dir,
                        print_tags=False,
                        print_placeholders=True,
                        original_spacing=False)
            else:
                # No tokenization requested: tune on the corpora as given.
                prepared = corpora

            # Create merged corpus
            with cmdlogger.step('Merging corpus'):
                source_side = os.path.join(workspace, 'corpus.' + source_code)
                with open(source_side, 'wb') as pairs:
                    # Every original file is resolved under the first corpus' root.
                    root_dir = corpora[0].root

                    for item in prepared:
                        tokenized_file = item.get_file(source_code)
                        original_file = os.path.join(
                            root_dir, item.name + '.' + source_code)
                        # One '<tokenized>:<original>' path pair per line.
                        pairs.write(tokenized_file + ':' + original_file + '\n')

                target_side = os.path.join(workspace, 'corpus.' + target_code)
                fileutils.merge(
                    [item.get_file(target_code) for item in prepared],
                    target_side)

            # Run MERT algorithm
            with cmdlogger.step('Tuning'):
                flags = ['--port', str(self.api.port)]
                if not context_enabled:
                    flags.extend(('--skip-context-analysis', '1'))

                fileutils.makedirs(mert_wd, exist_ok=True)

                with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                    # Positional arguments first, then options; the list is
                    # later joined into a single shell string.
                    pieces = [
                        self._mert_script,
                        source_side,
                        target_side,
                        self._mert_i_script,
                        runtime_moses_ini.name,
                        '--threads', str(multiprocessing.cpu_count()),
                        '--mertdir', os.path.join(Moses.bin_path, 'bin'),
                        '--mertargs', '\'--binary --sctype BLEU\'',
                        '--working-dir', mert_wd,
                        '--nbest', '100',
                        '--decoder-flags', '"' + ' '.join(flags) + '"',
                        '--nonorm',
                        '--closest',
                        '--no-filter-phrase-table',
                    ]

                    with open(self.engine.get_logfile('mert'), 'wb') as sink:
                        shell.execute(' '.join(pieces),
                                      stdout=sink,
                                      stderr=sink)

            # Read optimized configuration
            with cmdlogger.step('Applying changes'):
                bleu_score = 0
                weights = {}
                seen_weight_header = False

                with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                    for text in moses_ini:
                        text = text.strip()

                        if len(text) == 0:
                            continue

                        if seen_weight_header:
                            # Everything after '[weight]' is '<name>= <values...>'.
                            name_and_values = text.split()
                            feature = name_and_values[0].rstrip('=')
                            weights[feature] = [
                                float(val) for val in name_and_values[1:]
                            ]
                        elif text.startswith('# BLEU'):
                            bleu_score = float(text.split()[2])
                        elif text == '[weight]':
                            seen_weight_header = True

                self.api.update_features(weights)

            cmdlogger.completed(bleu_score)
        finally:
            if not debug:
                self.engine.clear_tempdir()