def build(self, corpora, working_dir='.', log=None):
    """Build the model from scratch: merge all corpora into one parallel
    corpus and run the native build binary on it.

    :param corpora: iterable of BilingualCorpus to merge and train on
    :param working_dir: scratch directory for the merged corpus
    :param log: file-like object for the subprocess output (devnull if None)
    """
    log = shell.DEVNULL if log is None else log

    # Start from an empty model directory
    shutil.rmtree(self._model, ignore_errors=True)
    fileutils.makedirs(self._model, exist_ok=True)
    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)

    # Concatenate every corpus into a single parallel corpus, one side at a time
    merged = BilingualCorpus.make_parallel('merge', working_dir,
                                           (self._source_lang, self._target_lang))
    for lang in (self._source_lang, self._target_lang):
        fileutils.merge([corpus.get_file(lang) for corpus in corpora],
                        merged.get_file(lang))

    args = [self._build_bin,
            '-s', merged.get_file(self._source_lang),
            '-t', merged.get_file(self._target_lang),
            '-m', self._model,
            '-I', '4']
    shell.execute(args, stdout=log, stderr=log)
def train(self, corpora, aligner, working_dir='.', log=None):
    """Train the model: stage the corpora via symlinks, run word alignment,
    export the lexical model, then invoke the native build tool.

    :param corpora: iterable of BilingualCorpus used as training data
    :param aligner: word aligner providing align() and export()
    :param working_dir: scratch directory for staged corpora and lexical model
    :param log: file-like object for subprocess output (devnull if None)
    """
    log = shell.DEVNULL if log is None else log

    # Reset the model directory
    shutil.rmtree(self._model, ignore_errors=True)
    fileutils.makedirs(self._model, exist_ok=True)

    corpora_path = os.path.join(working_dir, 'corpora')
    lex_path = os.path.join(working_dir, 'model.tlex')

    if not os.path.isdir(corpora_path):
        fileutils.makedirs(corpora_path, exist_ok=True)

    # Stage the training folder with symlinks to the original corpus files
    staged = []
    for corpus in corpora:
        dest = BilingualCorpus.make_parallel(corpus.name, corpora_path,
                                             (self._source_lang, self._target_lang))
        os.symlink(corpus.get_file(self._source_lang), dest.get_file(self._source_lang))
        os.symlink(corpus.get_file(self._target_lang), dest.get_file(self._target_lang))
        staged.append(dest)

    # Word-align the staged corpora and export the lexical translation model
    aligner.align(staged, corpora_path, log=log)
    aligner.export(lex_path)

    # Build the model with the native tool
    cmd = [self._build_bin,
           '--lex', lex_path,
           '--input', corpora_path,
           '--model', self._model,
           '-s', self._source_lang,
           '-t', self._target_lang,
           '-v', self._vb.model]
    shell.execute(cmd, stdout=log, stderr=log)
def encode(self, corpora, dest_folder):
    """Encode every language side of every corpus into dest_folder.

    :param corpora: iterable of BilingualCorpus to encode
    :param dest_folder: output directory (created if missing)
    :return: the corpora found in dest_folder after encoding
    """
    if not os.path.isdir(dest_folder):
        fileutils.makedirs(dest_folder, exist_ok=True)

    for corpus in corpora:
        for lang in corpus.langs:
            # Monolingual destination file for this corpus/language pair
            out_file = BilingualCorpus.make_parallel(corpus.name, dest_folder,
                                                     [lang]).get_file(lang)
            self.encode_file(corpus.get_file(lang), out_file, delete_nl=True)

    return BilingualCorpus.list(dest_folder)
def process_corpora(self, corpora, output_folder):
    """Run process_file on each language side of every corpus.

    :param corpora: iterable of BilingualCorpus to process
    :param output_folder: output directory (created if missing)
    :return: the corpora found in output_folder after processing
    """
    fileutils.makedirs(output_folder, exist_ok=True)

    for corpus in corpora:
        out_corpus = BilingualCorpus.make_parallel(corpus.name, output_folder,
                                                   corpus.langs)
        for lang in corpus.langs:
            self.process_file(corpus.get_file(lang),
                              out_corpus.get_file(lang), lang)

    return BilingualCorpus.list(output_folder)
def encode(self, corpora, dest_folder):
    """Encode each corpus file into dest_folder and return the encoded corpora.

    :param corpora: iterable of BilingualCorpus to encode
    :param dest_folder: output directory (created if missing)
    :return: list of the encoded BilingualCorpus objects, in input order
    """
    if not os.path.isdir(dest_folder):
        osutils.makedirs(dest_folder, exist_ok=True)

    encoded = []
    for corpus in corpora:
        target = BilingualCorpus.make_parallel(corpus.name, dest_folder, corpus.langs)
        for lang in corpus.langs:
            self.encode_file(corpus.get_file(lang), target.get_file(lang),
                             delete_nl=True)
        encoded.append(target)

    return encoded
def phrase_based_tune(self, corpora, debug=False, listener=None, context_enabled=True, random_seeds=False,
                      max_iterations=25, early_stopping_value=None):
    """Tune the phrase-based decoder feature weights with MERT.

    :param corpora: candidate tuning corpora; only those containing both engine languages are used
    :param debug: if True, keep the temporary tuning directory for inspection
    :param listener: optional TuneListener receiving progress callbacks (a default one is created if None)
    :param context_enabled: if False, the decoder skips context analysis during tuning
    :param random_seeds: if False, MERT runs with predictable seeds (reproducible runs)
    :param max_iterations: maximum number of MERT iterations (<= 0 means no explicit limit)
    :param early_stopping_value: optional BLEU early-stopping threshold passed to the scorer
    :raises IllegalArgumentException: if no corpus covers the engine's language pair
    """
    target_lang = self.engine.target_lang
    source_lang = self.engine.source_lang

    # Keep only corpora providing both the source and the target side
    corpora = [corpus for corpus in corpora if source_lang in corpus.langs and target_lang in corpus.langs]
    if len(corpora) == 0:
        raise IllegalArgumentException('No %s > %s corpora found into specified path' %
                                       (source_lang, target_lang))

    source_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [source_lang])
                      for corpus in corpora]
    reference_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [target_lang])
                         for corpus in corpora]

    if listener is None:
        listener = self.TuneListener()

    listener.on_tuning_begin(corpora, self, 4)

    working_dir = self.engine.get_tempdir('tuning')
    mert_wd = os.path.join(working_dir, 'mert')

    try:
        # Tokenization (references only: MERT scores hypotheses against tokenized references)
        tokenizer = Tokenizer(source_lang=source_lang, target_lang=target_lang,
                              print_placeholders=False, print_tags=False)
        tokenized_output = os.path.join(working_dir, 'reference_corpora')
        fileutils.makedirs(tokenized_output, exist_ok=True)

        with listener.step('Corpora tokenization') as _:
            reference_corpora = tokenizer.process_corpora(corpora=reference_corpora,
                                                          output_folder=tokenized_output)

        # Create merged corpus
        with listener.step('Merging corpus') as _:
            # source
            source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)
            # FIX: merge the file *contents*. The original wrote the file paths, one per
            # line, into the merged corpus, feeding paths (not sentences) to MERT —
            # now symmetric with the target side below.
            fileutils.merge([corpus.get_file(source_lang) for corpus in source_corpora],
                            source_merged_corpus)

            # target
            target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
            fileutils.merge([corpus.get_file(target_lang) for corpus in reference_corpora],
                            target_merged_corpus)

        # Run MERT algorithm
        with listener.step('Tuning') as _:
            # Flags forwarded to the decoder wrapper launched by the MERT script
            decoder_flags = ['--port', str(self.api.port), '--source', source_lang, '--target', target_lang]

            if self.api.root is not None:
                decoder_flags += ['--root', self.api.root]

            if not context_enabled:
                decoder_flags.append('--skip-context-analysis')
                decoder_flags.append('1')

            fileutils.makedirs(mert_wd, exist_ok=True)

            # The temporary moses.ini only needs to exist for the duration of the run
            with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                           self._mert_i_script, runtime_moses_ini.name,
                           '--threads', str(multiprocessing.cpu_count()),
                           '--mertdir', cli.BIN_DIR,
                           '--mertargs', '\'--binary --sctype BLEU\'',
                           '--working-dir', mert_wd,
                           '--nbest', '100',
                           '--decoder-flags', '"' + ' '.join(decoder_flags) + '"',
                           '--nonorm', '--closest', '--no-filter-phrase-table']

                if early_stopping_value is not None:
                    command += ['--bleuscorer', self._scorer_script,
                                '--bleuscorer-flags "-nt" --early-stopping-value %d' % early_stopping_value]

                if not random_seeds:
                    command.append('--predictable-seeds')
                if max_iterations > 0:
                    command.append('--maximum-iterations={num}'.format(num=max_iterations))

                with open(self.engine.get_logfile('mert'), 'wb') as log:
                    # Command is joined and run through the shell because of the
                    # quoted --mertargs / --decoder-flags arguments above
                    shell.execute(' '.join(command), stdout=log, stderr=log)

        # Read optimized configuration
        with listener.step('Applying changes') as _:
            bleu_score = 0
            weights = {}
            found_weights = False

            # Parse the moses.ini produced by MERT: BLEU comes from the '# BLEU'
            # comment, weights from the lines following the '[weight]' section header
            with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                for line in moses_ini:
                    line = line.strip()
                    if len(line) == 0:
                        continue
                    elif found_weights:
                        tokens = line.split()
                        weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                    elif line.startswith('# BLEU'):
                        bleu_score = float(line.split()[2])
                    elif line == '[weight]':
                        found_weights = True

            _ = self.api.update_features(weights)

        listener.on_tuning_end(self, bleu_score)
    finally:
        if not debug:
            self.engine.clear_tempdir("tuning")