def process(self, source, target, input_paths, output_path, data_path=None):
    args = ['-s', source, '-t', target, '--output', output_path, '--input']

    for root in input_paths:
        args.append(root)

    if data_path is not None:
        args.append('--dev')
        args.append(os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
        args.append('--test')
        args.append(os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return ParallelCorpus.splitlist(source, target, roots=output_path)

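# Usage sketch (hypothetical paths; 'preprocessor' is assumed to be a
# configured instance of this class):
#
#     corpora = preprocessor.process('en', 'it', ['/data/training'],
#                                    '/data/engine/preprocessed',
#                                    data_path='/data/engine')
#
# When data_path is given, the Java tool is also pointed at the engine's
# DEV_FOLDER_NAME and TEST_FOLDER_NAME subfolders for the dev/test splits.
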
def _clean_file(self, source, dest, langs):
    if not os.path.isdir(dest.root):
        fileutils.makedirs(dest.root, exist_ok=True)

    # The cleaner script follows the Moses clean-corpus-n.perl convention:
    # it takes corpus basenames plus the two language extensions separately,
    # so the file extensions are stripped here.
    source = os.path.splitext(source.get_file(langs[0]))[0]
    output = os.path.splitext(dest.get_file(langs[0]))[0]

    command = ['perl', self._cleaner_script, '-ratio', str(self._ratio), source,
               langs[0], langs[1], output, str(self._min), str(self._max)]
    shell.execute(command, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

def align(self, corpus, langs, model_dir, working_dir='.', log_file=None):
    WordAligner.align(self, corpus, langs, working_dir, log_file)

    l1 = langs[0]
    l2 = langs[1]
    corpus_name = 'corpus'
    langs_suffix = l1 + '-' + l2

    fwd_file = os.path.join(working_dir, corpus_name + '.' + langs_suffix + '.fwd')
    bwd_file = os.path.join(working_dir, corpus_name + '.' + langs_suffix + '.bwd')
    bal_file = os.path.join(working_dir, corpus_name + '.' + langs_suffix + '.bal')
    aligned_file_path = os.path.join(working_dir, corpus_name + '.' + langs_suffix + '.aligned')

    corpus_l1 = corpus.get_file(l1)
    corpus_l2 = corpus.get_file(l2)

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'a')

        # Merge the two monolingual sides into fast_align's input format,
        # one sentence pair per line.
        with open(corpus_l1) as source_corpus, \
                open(corpus_l2) as target_corpus, \
                open(aligned_file_path, 'w') as aligned_file:
            for x, y in zip(source_corpus, target_corpus):
                aligned_file.write(x.strip() + ' ||| ' + y.strip() + '\n')

        cpus = multiprocessing.cpu_count()
        env = os.environ.copy()
        env['LD_LIBRARY_PATH'] = scripts.LIB_DIR

        # Forward alignments
        fwd_model = os.path.join(model_dir, 'model.align.fwd')
        command = [self._align_bin, '-d', '-v', '-o', '-n', str(cpus), '-B',
                   '-p', fwd_model, '-i', aligned_file_path]
        with open(fwd_file, 'w') as stdout:
            shell.execute(command, stdout=stdout, stderr=log, env=env)

        # Backward alignments ('-r' reverses the alignment direction)
        bwd_model = os.path.join(model_dir, 'model.align.bwd')
        command = [self._align_bin, '-d', '-v', '-o', '-n', str(cpus), '-B',
                   '-p', bwd_model, '-r', '-i', aligned_file_path]
        with open(bwd_file, 'w') as stdout:
            shell.execute(command, stdout=stdout, stderr=log, env=env)
    finally:
        if log_file is not None:
            log.close()

    encoder = _FastAlignBALEncoder(corpus, langs, fwd_file, bwd_file)
    encoder.encode(bal_file)

    return bal_file

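# fast_align expects one sentence pair per line, with the source and target
# separated by ' ||| ', e.g.:
#
#     das Haus ist klein ||| the house is small
#
# The loop above produces exactly this file from the two sides of the corpus;
# the forward and backward runs then typically emit Pharaoh-style 'i-j'
# alignment points, which _FastAlignBALEncoder combines into the BAL file.
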
def clean(self, source, target, input_paths, output_path):
    args = ['-s', source, '-t', target, '--output', output_path, '--input']

    for root in input_paths:
        args.append(root)

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return ParallelCorpus.splitlist(source, target, roots=output_path)[0]

def calculate(self, document, reference):
    script = os.path.abspath(os.path.join(__file__, os.pardir, 'opt', 'mmt-bleu.perl'))
    command = ['perl', script, reference]

    with open(document) as input_stream:
        stdout, _ = shell.execute(command, stdin=input_stream)

    return float(stdout)

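# Usage sketch (hypothetical paths; 'scorer' is assumed to be an instance of
# this class): the translated document is piped to mmt-bleu.perl on stdin and
# compared against the reference file, whose path is passed as an argument.
#
#     score = scorer.calculate('/data/eval/output.it', '/data/eval/reference.it')
#     print('BLEU: %f' % score)
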
def __process_file(self, source, dest, lang,
                   print_tags=True, print_placeholders=False, original_spacing=False):
    command = self.__get_command(lang, print_tags, print_placeholders, original_spacing)

    parent_dir = os.path.abspath(os.path.join(dest, os.pardir))
    if not os.path.isdir(parent_dir):
        fileutils.makedirs(parent_dir, exist_ok=True)

    with open(source) as input_stream:
        with open(dest, 'w') as output_stream:
            shell.execute(command, stdin=input_stream, stdout=output_stream, stderr=shell.DEVNULL)

def train(self, corpora, lang, working_dir='.', log_file=None):
    LanguageModel.train(self, corpora, lang, working_dir, log_file)

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'w') if isinstance(log_file, str) else log_file

        # Collapse all corpora into a single text file
        merged_corpus = os.path.join(working_dir, 'merge')
        fileutils.merge([corpus.get_file(lang) for corpus in corpora], merged_corpus)

        # Create language model in ARPA format
        arpa_file = os.path.join(working_dir, 'lm.arpa')
        arpa_command = [self._lmplz_bin, '--discount_fallback', '-o', str(self._order),
                        '-S', str(self.get_mem_percent()) + '%', '-T', working_dir]
        if self._order > 2 and self.prune:
            arpa_command += ['--prune', '0', '0', '1']

        with open(merged_corpus) as stdin:
            with open(arpa_file, 'w') as stdout:
                shell.execute(arpa_command, stdin=stdin, stdout=stdout, stderr=log)

        # Binarize ARPA file
        binarize_command = [self._bbinary_bin, arpa_file, self._model]
        shell.execute(binarize_command, stdout=log, stderr=log)
    finally:
        if log_file is not None and isinstance(log_file, str):
            log.close()

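# With order 5 and pruning enabled, the assembled pipeline is equivalent to
# (assuming _lmplz_bin and _bbinary_bin point at KenLM's lmplz and
# build_binary):
#
#     lmplz --discount_fallback -o 5 -S <mem>% -T <working_dir> \
#         --prune 0 0 1 < merge > lm.arpa
#     build_binary lm.arpa <model>
#
# '--prune 0 0 1' keeps all unigrams and bigrams and prunes singleton
# n-grams of order 3 and above.
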
def create_index(self, corpora, lang, log_file=None):
    source_paths = set()

    for corpus in corpora:
        source_paths.add(corpus.root)

    fileutils.makedirs(self._index, exist_ok=True)

    args = ['-l', lang, '-i', self._index, '-c']
    for source_path in source_paths:
        args.append(source_path)

    command = mmt_javamain(self._java_mainclass, args)

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'w')

        shell.execute(command, stdout=log, stderr=log)
    finally:
        if log_file is not None:
            log.close()

def train(self, corpora, lang, working_dir='.', log_file=None):
    LanguageModel.train(self, corpora, lang, working_dir, log_file)

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'w')

        # Collapse all corpora into a single text file
        merged_corpus = os.path.join(working_dir, 'merge')
        fileutils.merge([corpus.get_file(lang) for corpus in corpora], merged_corpus)

        input_se = os.path.join(working_dir, 'static_input.se')
        temp = os.path.join(working_dir, 'temp')
        arpa_file = os.path.join(working_dir, 'static_lm.arpa')

        # Add start and end symbols
        with open(merged_corpus) as stdin:
            with open(input_se, 'w') as stdout:
                shell.execute([self._addbound_bin], stdin=stdin, stdout=stdout, stderr=log)

        # Create LM in ARPA format
        command = [self._buildlm_bin, '-i', input_se, '-k', str(cpu_count()),
                   '-o', arpa_file, '-n', str(self._order), '-s', 'witten-bell',
                   '-t', temp, '-l', '/dev/stdout', '-irstlm', self._irstlm_dir,
                   '--PruneSingletons']
        shell.execute(command, stderr=log)

        # Create binary LM
        command = [self._compilelm_bin, arpa_file + '.gz', self._model]
        shell.execute(command, stderr=log)
    finally:
        if log_file is not None:
            log.close()

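# For an order-5 model the invocation assembled above is equivalent to
# (assuming _buildlm_bin is IRSTLM's build-lm.sh):
#
#     build-lm.sh -i static_input.se -k <ncpus> -o static_lm.arpa -n 5 \
#         -s witten-bell -t <working_dir>/temp -l /dev/stdout \
#         -irstlm <irstlm_dir> --PruneSingletons
#
# build-lm.sh writes its ARPA output gzip-compressed, which is why '.gz' is
# appended to the file name when compiling the binary LM.
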
def align(self, corpus, langs, model_dir, working_dir='.', log_file=None):
    WordAligner.align(self, corpus, langs, working_dir, log_file)

    l1 = langs[0]
    l2 = langs[1]
    corpus_name = 'corpus'

    vcb1_file = os.path.join(working_dir, corpus_name + '.' + l1 + '.vcb')
    vcb2_file = os.path.join(working_dir, corpus_name + '.' + l2 + '.vcb')
    snt12_file = os.path.join(working_dir, corpus_name + '.' + l1 + '_' + l2 + '.snt')
    snt21_file = os.path.join(working_dir, corpus_name + '.' + l2 + '_' + l1 + '.snt')
    cooc12_file = os.path.join(working_dir, corpus_name + '.' + l1 + '_' + l2 + '.cooc')
    cooc21_file = os.path.join(working_dir, corpus_name + '.' + l2 + '_' + l1 + '.cooc')
    fwdc_file = os.path.join(working_dir, 'fwd.config')
    bwdc_file = os.path.join(working_dir, 'bwd.config')
    fwddict_file = os.path.join(working_dir, corpus_name + '.fwd.dict')
    bwddict_file = os.path.join(working_dir, corpus_name + '.bwd.dict')
    fwd_file = os.path.join(working_dir, corpus_name + '.fwd')
    bwd_file = os.path.join(working_dir, corpus_name + '.bwd')
    bal_file = os.path.join(working_dir, corpus_name + '.bal')

    corpus_l1 = corpus.get_file(l1)
    corpus_l2 = corpus.get_file(l2)

    log = shell.DEVNULL

    try:
        ncpus = max(2, multiprocessing.cpu_count())

        if log_file is not None:
            log = open(log_file, 'a')

        # Translate the corpora into GIZA format
        command = [self._plain2snt_bin, corpus_l1, corpus_l2, '-vcb1', vcb1_file,
                   '-vcb2', vcb2_file, '-snt1', snt12_file, '-snt2', snt21_file]
        shell.execute(command, stdout=log, stderr=log)

        # Create the co-occurrence files
        command = [self._snt2cooc_bin, cooc12_file, vcb1_file, vcb2_file, snt12_file]
        shell.execute(command, stdout=log, stderr=log)

        command = [self._snt2cooc_bin, cooc21_file, vcb2_file, vcb1_file, snt21_file]
        shell.execute(command, stdout=log, stderr=log)

        # Forward alignments
        with open(fwdc_file, 'w') as config:
            config.write(self.__mgiza_config_template.format(
                coocurrencefile=cooc12_file, corpusfile=snt12_file,
                outputfileprefix=fwddict_file, sourcevocabularyfile=vcb1_file,
                targetvocabularyfile=vcb2_file, ncpus=ncpus
            ))

        command = [self._mgiza_bin, fwdc_file]
        shell.execute(command, stdout=log, stderr=log)

        # MGIZA writes one A3 file per thread; merge them back into one file
        parts = [fwddict_file + '.A3.final.part{part:03d}'.format(part=part)
                 for part in range(0, ncpus)]
        command = ['python', self._merge_bin] + parts
        with open(fwd_file, 'w') as stdout:
            shell.execute(command, stdout=stdout, stderr=log)

        # Backward alignments
        with open(bwdc_file, 'w') as config:
            config.write(self.__mgiza_config_template.format(
                coocurrencefile=cooc21_file, corpusfile=snt21_file,
                outputfileprefix=bwddict_file, sourcevocabularyfile=vcb2_file,
                targetvocabularyfile=vcb1_file, ncpus=ncpus
            ))

        command = [self._mgiza_bin, bwdc_file]
        shell.execute(command, stdout=log, stderr=log)

        parts = [bwddict_file + '.A3.final.part{part:03d}'.format(part=part)
                 for part in range(0, ncpus)]
        command = ['python', self._merge_bin] + parts
        with open(bwd_file, 'w') as stdout:
            shell.execute(command, stdout=stdout, stderr=log)

        # Create BAL file
        command = [self._giza2bal_bin, '-i', bwd_file, '-d', fwd_file]
        with open(bal_file, 'w') as stdout:
            shell.execute(command, stdout=stdout, stderr=log)
    finally:
        if log_file is not None:
            log.close()

    return bal_file

def train(self, corpora, aligner, working_dir='.', log_file=None):
    if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
        raise Exception('Model already exists at ' + self._model)

    if not os.path.isdir(self._model):
        fileutils.makedirs(self._model, exist_ok=True)

    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)

    l1 = self._source_lang
    l2 = self._target_lang
    langs = (l1, l2)
    langs_suffix = l1 + '-' + l2

    mct_base = self._get_model_basename()
    dmp_file = mct_base + '.dmp'
    mam_file = mct_base + '.' + langs_suffix + '.mam'
    lex_file = mct_base + '.' + langs_suffix + '.lex'

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'a')

        # Clean corpus for training
        clean_output = os.path.join(working_dir, 'clean_corpora')
        fileutils.makedirs(clean_output, exist_ok=True)
        corpora = self._cleaner.clean(corpora, clean_output, (self._source_lang, self._target_lang))

        # Create merged corpus and domains list file (dmp)
        merged_corpus = ParallelCorpus(os.path.basename(mct_base), working_dir, langs)
        fileutils.merge([corpus.get_file(l1) for corpus in corpora], merged_corpus.get_file(l1))
        fileutils.merge([corpus.get_file(l2) for corpus in corpora], merged_corpus.get_file(l2))
        with open(dmp_file, 'w') as dmp:
            for corpus in corpora:
                dmp.write(str(corpus.name) + ' ' + str(corpus.count_lines()) + '\n')

        # Create alignments in 'bal' file and symmetrize
        # (the symal flags correspond to grow-diag-final-and)
        bal_file = aligner.align(merged_corpus, langs, self._model, working_dir, log_file)

        symal_file = os.path.join(working_dir, 'alignments.' + langs_suffix + '.symal')
        symal_command = [self._symal_bin, '-a=g', '-d=yes', '-f=yes', '-b=yes']
        with open(bal_file) as stdin:
            with open(symal_file, 'w') as stdout:
                shell.execute(symal_command, stdin=stdin, stdout=stdout, stderr=log)

        # Execute mtt-build
        mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, l1)
        with open(merged_corpus.get_file(l1)) as stdin:
            shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

        mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, l2)
        with open(merged_corpus.get_file(l2)) as stdin:
            shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

        # Create 'mam' file
        mam_command = [self._symal2mam_bin, mam_file]
        with open(symal_file) as stdin:
            shell.execute(mam_command, stdin=stdin, stdout=log, stderr=log)

        # Create 'lex' file
        lex_command = [self._mmlexbuild_bin, mct_base + '.', l1, l2, '-o', lex_file]
        shell.execute(lex_command, stdout=log, stderr=log)
    finally:
        if log_file is not None:
            log.close()

def tune(self, corpora=None, tokenize=True, debug=False, context_enabled=True):
    if corpora is None:
        corpora = ParallelCorpus.list(
            os.path.join(self.engine.data_path, TrainingPreprocessor.DEV_FOLDER_NAME))

    if len(corpora) == 0:
        raise IllegalArgumentException('empty corpora')

    if not self.is_running():
        raise IllegalStateException('No MMT Server running, start the engine first')

    target_lang = self.engine.target_lang
    source_lang = self.engine.source_lang

    cmdlogger = _tuning_logger(4 if tokenize else 3)
    cmdlogger.start(self, corpora)

    working_dir = self.engine.get_tempdir('tuning')
    mert_wd = os.path.join(working_dir, 'mert')

    try:
        original_corpora = corpora

        # Tokenization
        tokenized_corpora = original_corpora

        if tokenize:
            tokenizer_output = os.path.join(working_dir, 'tokenized_corpora')
            fileutils.makedirs(tokenizer_output, exist_ok=True)

            with cmdlogger.step('Corpus tokenization') as _:
                tokenized_corpora = self.engine.preprocessor.process(
                    corpora, tokenizer_output, print_tags=False,
                    print_placeholders=True, original_spacing=False)

        # Create merged corpus
        with cmdlogger.step('Merging corpus') as _:
            source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)

            with open(source_merged_corpus, 'wb') as out:
                original_root = original_corpora[0].root

                for corpus in tokenized_corpora:
                    tokenized = corpus.get_file(source_lang)
                    original = os.path.join(original_root, corpus.name + '.' + source_lang)
                    out.write(tokenized + ':' + original + '\n')

            target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
            fileutils.merge([corpus.get_file(target_lang) for corpus in tokenized_corpora],
                            target_merged_corpus)

        # Run MERT algorithm
        with cmdlogger.step('Tuning') as _:
            # Start MERT
            decoder_flags = ['--port', str(self.api.port)]

            if not context_enabled:
                decoder_flags.append('--skip-context-analysis')
                decoder_flags.append('1')

            fileutils.makedirs(mert_wd, exist_ok=True)

            with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                           self._mert_i_script, runtime_moses_ini.name, '--threads',
                           str(multiprocessing.cpu_count()), '--mertdir',
                           os.path.join(Moses.bin_path, 'bin'), '--mertargs',
                           '\'--binary --sctype BLEU\'', '--working-dir', mert_wd,
                           '--nbest', '100', '--decoder-flags',
                           '"' + ' '.join(decoder_flags) + '"', '--nonorm', '--closest',
                           '--no-filter-phrase-table']

                with open(self.engine.get_logfile('mert'), 'wb') as log:
                    shell.execute(' '.join(command), stdout=log, stderr=log)

        # Read optimized configuration
        with cmdlogger.step('Applying changes') as _:
            bleu_score = 0
            weights = {}
            found_weights = False

            with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                for line in moses_ini:
                    line = line.strip()

                    if len(line) == 0:
                        continue
                    elif found_weights:
                        tokens = line.split()
                        weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                    elif line.startswith('# BLEU'):
                        bleu_score = float(line.split()[2])
                    elif line == '[weight]':
                        found_weights = True

            _ = self.api.update_features(weights)

        cmdlogger.completed(bleu_score)
    finally:
        if not debug:
            self.engine.clear_tempdir()

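# The parser in the 'Applying changes' step expects the MERT-optimized
# moses.ini to contain a BLEU comment followed by the [weight] section, e.g.
# (feature names and values are illustrative):
#
#     # BLEU 0.4523 on dev set
#     [weight]
#     Distortion0= 0.3
#     LM0= 0.5
#
# Each weight line is split on whitespace: the feature name (with the
# trailing '=' stripped) maps to its list of float values, which are then
# pushed to the running server via api.update_features().
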