def clean(self, corpora, output_path, log=None):
    if log is None:
        log = shell.DEVNULL

    # read memory size
    mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')  # e.g. 4015976448
    mem_mb = mem_bytes / (1024. ** 2)  # e.g. 3829.93
    extended_heap_mb = int(mem_mb * 90 / 100)

    args = ['-s', self._source_lang, '-t', self._target_lang,
            '--output', output_path, '--input']

    input_paths = set([corpus.get_folder() for corpus in corpora])

    for root in input_paths:
        args.append(root)

    command = mmt_javamain(self._java_mainclass, args=args, max_heap_mb=extended_heap_mb)
    shell.execute(command, stdout=log, stderr=log)

    return BilingualCorpus.list(output_path)
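# Worked example (illustrative figures) for the heap sizing in clean() above:
# with a 4 KiB page size and 980463 physical pages,
#   mem_bytes        = 4096 * 980463 = 4015976448
#   mem_mb           = 4015976448 / 1024.**2 ~= 3829.93
#   extended_heap_mb = int(3829.93 * 90 / 100) = 3446
# so the Java process is capped at roughly 90% of physical RAM.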
def build(self, corpora, working_dir='.', log_file=None):
    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)
    if not os.path.isdir(self._model):
        fileutils.makedirs(self._model, exist_ok=True)

    merged_corpus = BilingualCorpus.make_parallel('merge', working_dir,
                                                  (self._source_lang, self._target_lang))
    fileutils.merge([corpus.get_file(self._source_lang) for corpus in corpora],
                    merged_corpus.get_file(self._source_lang))
    fileutils.merge([corpus.get_file(self._target_lang) for corpus in corpora],
                    merged_corpus.get_file(self._target_lang))

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'a')

        # Train model
        command = [self._build_bin,
                   '-s', merged_corpus.get_file(self._source_lang),
                   '-t', merged_corpus.get_file(self._target_lang),
                   '-m', self._model, '-I', '4']
        shell.execute(command, stderr=log)
    finally:
        if log_file is not None:
            log.close()
def build(self, corpora, working_dir='.', log=None):
    if log is None:
        log = shell.DEVNULL

    shutil.rmtree(self._model, ignore_errors=True)
    fileutils.makedirs(self._model, exist_ok=True)

    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)

    merged_corpus = BilingualCorpus.make_parallel('merge', working_dir,
                                                  (self._source_lang, self._target_lang))
    fileutils.merge([corpus.get_file(self._source_lang) for corpus in corpora],
                    merged_corpus.get_file(self._source_lang))
    fileutils.merge([corpus.get_file(self._target_lang) for corpus in corpora],
                    merged_corpus.get_file(self._target_lang))

    command = [self._build_bin,
               '-s', merged_corpus.get_file(self._source_lang),
               '-t', merged_corpus.get_file(self._target_lang),
               '-m', self._model, '-I', '4']
    shell.execute(command, stdout=log, stderr=log)
def train(self, corpora, aligner, working_dir='.', log=None):
    if log is None:
        log = shell.DEVNULL

    shutil.rmtree(self._model, ignore_errors=True)
    fileutils.makedirs(self._model, exist_ok=True)

    train_corpora_path = os.path.join(working_dir, 'corpora')
    lex_model_path = os.path.join(working_dir, 'model.tlex')

    if not os.path.isdir(train_corpora_path):
        fileutils.makedirs(train_corpora_path, exist_ok=True)

    train_corpora = []

    # Prepare training folder
    for corpus in corpora:
        dest_corpus = BilingualCorpus.make_parallel(corpus.name, train_corpora_path,
                                                    (self._source_lang, self._target_lang))
        source_file = corpus.get_file(self._source_lang)
        target_file = corpus.get_file(self._target_lang)

        os.symlink(source_file, dest_corpus.get_file(self._source_lang))
        os.symlink(target_file, dest_corpus.get_file(self._target_lang))

        train_corpora.append(dest_corpus)

    # Align corpora
    aligner.align(train_corpora, train_corpora_path, log=log)
    aligner.export(lex_model_path)

    # Build models
    command = [self._build_bin, '--lex', lex_model_path, '--input', train_corpora_path,
               '--model', self._model, '-s', self._source_lang, '-t', self._target_lang,
               '-v', self._vb.model]
    shell.execute(command, stdout=log, stderr=log)
def process(self, corpora, output_path, data_path=None):
    args = ['-s', self._source_lang, '-t', self._target_lang,
            '-v', self._vocabulary_path, '--output', output_path, '--input']

    input_paths = set([corpus.get_folder() for corpus in corpora])

    for root in input_paths:
        args.append(root)

    if data_path is not None:
        args.append('--dev')
        args.append(os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
        args.append('--test')
        args.append(os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=output_path)
def train(self, corpora, aligner, working_dir='.', log_file=None):
    if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
        raise Exception('Model already exists at ' + self._model)

    if not os.path.isdir(self._model):
        fileutils.makedirs(self._model, exist_ok=True)

    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'a')

        # Prepare training folder
        for corpus in corpora:
            dest_corpus = BilingualCorpus.make_parallel(corpus.name, working_dir,
                                                        (self._source_lang, self._target_lang))
            source_file = corpus.get_file(self._source_lang)
            target_file = corpus.get_file(self._target_lang)

            os.symlink(source_file, dest_corpus.get_file(self._source_lang))
            os.symlink(target_file, dest_corpus.get_file(self._target_lang))

            aligner.align(corpus, os.path.join(working_dir, corpus.name + '.align'))

        # Build models
        command = [self._build_bin, '--input', working_dir, '--model', self._model,
                   '-s', self._source_lang, '-t', self._target_lang]
        shell.execute(command, stdout=log, stderr=log)
    finally:
        if log_file is not None:
            log.close()
def train(self, corpora, lang, working_dir='.', log_file=None):
    LanguageModel.train(self, corpora, lang, working_dir, log_file)

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'w') if isinstance(log_file, str) else log_file

        # Collapse all corpora into a single text file
        merged_corpus = os.path.join(working_dir, 'merge')
        fileutils.merge([corpus.get_file(lang) for corpus in corpora], merged_corpus)

        # Create language model in ARPA format
        arpa_file = os.path.join(working_dir, 'lm.arpa')
        arpa_command = [self._lmplz_bin, '--discount_fallback', '-o', str(self._order),
                        '-S', str(self.get_mem_percent()) + '%', '-T', working_dir]
        if self._order > 2 and self.prune:
            arpa_command += ['--prune', '0', '0', '1']

        with open(merged_corpus) as stdin:
            with open(arpa_file, 'w') as stdout:
                shell.execute(arpa_command, stdin=stdin, stdout=stdout, stderr=log)

        # Binarize ARPA file
        binarize_command = [self._bbinary_bin, arpa_file, self._model]
        shell.execute(binarize_command, stdout=log, stderr=log)
    finally:
        if log_file is not None and isinstance(log_file, str):
            log.close()
def train(self, corpora, lang, working_dir='.', log=None):
    if log is None:
        log = shell.DEVNULL

    LanguageModel.train(self, corpora, lang, working_dir, log)

    # Collapse all corpora into a single text file
    merged_corpus = os.path.join(working_dir, 'merge')
    fileutils.merge([corpus.get_file(lang) for corpus in corpora], merged_corpus)

    # Create language model in ARPA format
    arpa_file = os.path.join(working_dir, 'lm.arpa')
    arpa_command = [self._lmplz_bin, '--discount_fallback', '-o', str(self._order),
                    '-S', str(self.get_mem_percent()) + '%', '-T', working_dir]
    if self._order > 2 and self.prune:
        arpa_command += ['--prune', '0', '0', '1']

    with open(merged_corpus) as stdin:
        with open(arpa_file, 'w') as stdout:
            shell.execute(arpa_command, stdin=stdin, stdout=stdout, stderr=log)

    # Binarize ARPA file
    binarize_command = [self._bbinary_bin, arpa_file, self._model]
    shell.execute(binarize_command, stdout=log, stderr=log)
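# For reference, the two shell.execute() calls above mirror KenLM's standard
# command-line workflow (a sketch; order, memory budget and paths are
# placeholders):
#
#   lmplz --discount_fallback -o 5 -S 80% -T /path/to/workdir < merge > lm.arpa
#   build_binary lm.arpa <model>
#
# lmplz estimates the n-gram model from stdin and emits ARPA text on stdout;
# build_binary (here self._bbinary_bin) then converts the ARPA file into
# KenLM's binary format for fast loading.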
def train(self, corpora, aligner, working_dir='.', log=None):
    if log is None:
        log = shell.DEVNULL

    if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
        raise Exception('Model already exists at ' + self._model)

    if not os.path.isdir(self._model):
        fileutils.makedirs(self._model, exist_ok=True)

    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)

    train_corpora = []

    # Prepare training folder
    for corpus in corpora:
        dest_corpus = BilingualCorpus.make_parallel(corpus.name, working_dir,
                                                    (self._source_lang, self._target_lang))
        source_file = corpus.get_file(self._source_lang)
        target_file = corpus.get_file(self._target_lang)

        os.symlink(source_file, dest_corpus.get_file(self._source_lang))
        os.symlink(target_file, dest_corpus.get_file(self._target_lang))

        train_corpora.append(dest_corpus)

    # Align corpora
    aligner.align(train_corpora, working_dir, log=log)

    # Build models
    command = [self._build_bin, '--input', working_dir, '--model', self._model,
               '-s', self._source_lang, '-t', self._target_lang]
    shell.execute(command, stdout=log, stderr=log)
def align(self, corpus, output):
    command = [self._align_bin,
               '-s', corpus.get_file(self._source_lang),
               '-t', corpus.get_file(self._target_lang),
               '-m', self._model, '-a', '1']
    with open(output, 'w') as stdout:
        shell.execute(command, stdout=stdout)
def train(self, corpora, lang, working_dir='.', log_file=None):
    LanguageModel.train(self, corpora, lang, working_dir, log_file)

    bicorpora = []
    for corpus in corpora:
        if len(corpus.langs) > 1:
            bicorpora.append(corpus)

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'w')

        fileutils.makedirs(self._model, exist_ok=True)

        # Train static LM
        static_lm_model = os.path.join(self._model, 'background.slm')
        static_lm_wdir = os.path.join(working_dir, 'slm.temp')

        fileutils.makedirs(static_lm_wdir, exist_ok=True)

        merged_corpus = os.path.join(working_dir, 'merged_corpus')
        fileutils.merge([corpus.get_file(lang) for corpus in corpora], merged_corpus)

        command = [self._create_slm_bin, '--discount_fallback', '-o', str(self._order),
                   '--model', static_lm_model,
                   '-S', str(KenLM.get_mem_percent()) + '%', '-T', static_lm_wdir]
        if self._order > 2 and self.prune:
            command += ['--prune', '0', '0', '1']

        with open(merged_corpus) as stdin:
            shell.execute(command, stdin=stdin, stdout=log, stderr=log)

        # Create AdaptiveLM training folder
        alm_train_folder = os.path.join(working_dir, 'alm_train')
        fileutils.makedirs(alm_train_folder, exist_ok=True)

        for corpus in bicorpora:
            os.symlink(corpus.get_file(lang),
                       os.path.join(alm_train_folder, corpus.name + '.' + lang))

        # Train adaptive LM
        adaptive_lm_model = os.path.join(self._model, 'foreground.alm')
        fileutils.makedirs(adaptive_lm_model, exist_ok=True)

        command = [self._create_alm_bin, '-m', adaptive_lm_model,
                   '-i', alm_train_folder, '-b', '100000000']
        shell.execute(command, stdout=log, stderr=log)
    finally:
        if log_file is not None:
            log.close()
def __process_file(self, source, dest, lang,
                   print_tags=True, print_placeholders=False, original_spacing=False):
    command = self.__get_command(lang, print_tags, print_placeholders, original_spacing)

    if not os.path.isdir(dest.get_folder()):
        fileutils.makedirs(dest.get_folder(), exist_ok=True)

    with open(source) as input_stream:
        with open(dest.get_file(lang), 'w') as output_stream:
            shell.execute(command, stdin=input_stream, stdout=output_stream,
                          stderr=shell.DEVNULL)
def train(self, corpora, lang, working_dir='.', log=None):
    if log is None:
        log = shell.DEVNULL

    bicorpora = []
    for corpus in corpora:
        if len(corpus.langs) > 1:
            bicorpora.append(corpus)

    shutil.rmtree(self._model, ignore_errors=True)
    fileutils.makedirs(self._model, exist_ok=True)

    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)

    # Train static LM
    static_lm_model = os.path.join(self._model, 'background.slm')
    static_lm_wdir = os.path.join(working_dir, 'slm.temp')

    fileutils.makedirs(static_lm_wdir, exist_ok=True)

    merged_corpus = os.path.join(working_dir, 'merged_corpus')
    fileutils.merge([corpus.get_file(lang) for corpus in corpora], merged_corpus)

    command = [self._create_slm_bin, '--discount-fallback', '-o', str(self._order),
               '-a', str(self._compression), '-q', str(self._quantization),
               '--type', 'trie', '--model', static_lm_model, '-T', static_lm_wdir]
    if self._order > 2 and self._prune:
        command += ['--prune', '0', '1', '2']

    with open(merged_corpus) as stdin:
        shell.execute(command, stdin=stdin, stdout=log, stderr=log)

    # Create AdaptiveLM training folder
    alm_train_folder = os.path.join(working_dir, 'alm_train')
    fileutils.makedirs(alm_train_folder, exist_ok=True)

    for corpus in bicorpora:
        os.symlink(corpus.get_file(lang),
                   os.path.join(alm_train_folder, corpus.name + '.' + lang))

    # Train adaptive LM
    adaptive_lm_model = os.path.join(self._model, 'foreground.alm')
    fileutils.makedirs(adaptive_lm_model, exist_ok=True)

    command = [self._create_alm_bin, '-m', adaptive_lm_model,
               '-i', alm_train_folder, '-b', '50000000']
    shell.execute(command, stdout=log, stderr=log)
def clean(self, source, target, input_paths, output_path):
    args = ['-s', source, '-t', target, '--output', output_path, '--input']

    for root in input_paths:
        args.append(root)

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return BilingualCorpus.splitlist(source, target, roots=output_path)[0]
def _clean_file(self, source, dest_folder, langs):
    if not os.path.isdir(dest_folder):
        fileutils.makedirs(dest_folder, exist_ok=True)

    input_folder = os.path.join(source.get_folder(), source.name)
    output_folder = os.path.join(dest_folder, source.name)

    command = ['perl', self._cleaner_script, '-ratio', str(self._ratio),
               input_folder, langs[0], langs[1], output_folder,
               str(self._min), str(self._max)]
    shell.execute(command, stdout=shell.DEVNULL, stderr=shell.DEVNULL)
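# Assuming self._cleaner_script is Moses' clean-corpus-n.perl (or a port of
# it), the positional arguments follow that script's convention: input corpus
# prefix, the two language extensions, output corpus prefix, and the minimum
# and maximum sentence lengths; -ratio caps the length ratio allowed between
# the two sides of each sentence pair.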
def process_file(self, source, dest, lang):
    args = ['--lang', self._lang]
    if not self._print_tags:
        args.append('--no-tags')
    if self._print_placeholders:
        args.append('--print-placeholders')

    command = mmt_javamain(self._java_mainclass, args=args)

    with open(source) as input_stream:
        with open(dest.get_file(lang), 'w') as output_stream:
            shell.execute(command, stdin=input_stream, stdout=output_stream)
def clean(self, corpora, output_path):
    args = ['-s', self._source_lang, '-t', self._target_lang,
            '--output', output_path, '--input']

    input_paths = set([corpus.get_folder() for corpus in corpora])

    for root in input_paths:
        args.append(root)

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return BilingualCorpus.list(output_path)
def clean(self, corpora, output_path, log=None):
    if log is None:
        log = shell.DEVNULL

    args = ['-s', self._source_lang, '-t', self._target_lang,
            '--output', output_path, '--input']

    input_paths = set([corpus.get_folder() for corpus in corpora])

    for root in input_paths:
        args.append(root)

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdout=log, stderr=log)

    return BilingualCorpus.list(output_path)
def align(self, corpora, output_folder, log=None):
    if log is None:
        log = shell.DEVNULL

    root = set([corpus.get_folder() for corpus in corpora])

    if len(root) != 1:
        raise Exception('Aligner corpora must share the same folder: found ' + str(root))

    root = root.pop()

    command = [self._align_bin, '--model', self._model,
               '--input', root, '--output', output_folder,
               '--source', self._source_lang, '--target', self._target_lang,
               '--strategy', '1']
    shell.execute(command, stderr=log, stdout=log)
def generate(self, bilingual_corpora, monolingual_corpora, output, log_file=None):
    fileutils.makedirs(self._model, exist_ok=True)

    args = ['--db', os.path.join(self._model, 'domains.db'),
            '-l', self._source_lang, '-c']
    source_paths = set([corpus.get_folder() for corpus in bilingual_corpora])
    for source_path in source_paths:
        args.append(source_path)

    command = cli.mmt_javamain(self._java_mainclass, args)

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'w')

        stdout, _ = shell.execute(command, stderr=log)

        domains = {}
        for domain, name in [line.rstrip('\n').split('\t', 2) for line in stdout.splitlines()]:
            domains[name] = domain

        return self._make_training_folder(bilingual_corpora, monolingual_corpora,
                                          domains, output)
    finally:
        if log_file is not None:
            log.close()
def process(self, source, target, input_paths, output_path, data_path=None):
    args = ['-s', source, '-t', target, '--output', output_path, '--input']

    for root in input_paths:
        args.append(root)

    if data_path is not None:
        args.append('--dev')
        args.append(os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
        args.append('--test')
        args.append(os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return BilingualCorpus.splitlist(source, target, roots=output_path)
def reduce(self, corpora, output_path, word_limit, log=None):
    if log is None:
        log = shell.DEVNULL

    args = ['-s', self._source_lang, '-t', self._target_lang,
            '--words', str(word_limit), '--output', output_path, '--input']

    for root in set([corpus.get_folder() for corpus in corpora]):
        args.append(root)

    command = mmt_javamain(self._reduce_mainclass, args=args)
    shell.execute(command, stdout=log, stderr=log)

    return BilingualCorpus.list(output_path)
def calculate(self, document, reference):
    script = os.path.join(cli.PYOPT_DIR, 'mmt-bleu.perl')
    command = ['perl', script, reference]

    with open(document) as input_stream:
        stdout, _ = shell.execute(command, stdin=input_stream)

    return float(stdout)
def create_index(self, corpora, log=None):
    if log is None:
        log = shell.DEVNULL

    source_paths = set()

    for corpus in corpora:
        source_paths.add(corpus.get_folder())

    shutil.rmtree(self._index, ignore_errors=True)
    fileutils.makedirs(self._index, exist_ok=True)

    args = ['-s', self._source_lang, '-t', self._target_lang, '-i', self._index, '-c']
    for source_path in source_paths:
        args.append(source_path)

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdout=log, stderr=log)
def _get_gpus_ram(self, gpu_ids):
    result = []
    command = ["nvidia-smi", "--query-gpu=memory.total",
               "--format=csv,noheader,nounits",
               "--id=%s" % ",".join(str(i) for i in gpu_ids)]
    stdout, _ = shell.execute(command)

    for line in stdout.split("\n"):
        line = line.strip()
        if line:
            result.append(int(line) * self._MB)

    return result
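# Example (hypothetical values): for gpu_ids = [0, 1] the command becomes
#
#   nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits --id=0,1
#
# and, thanks to 'noheader,nounits', prints one bare integer per GPU in MiB:
#
#   11178
#   11178
#
# which the loop above converts to bytes via self._MB.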
def process_file(self, input_path, output_path, lang):
    if lang == self._source_lang:
        args = ['-s', self._source_lang, '-t', self._target_lang]
    elif lang == self._target_lang:
        args = ['-s', self._target_lang, '-t', self._source_lang]
    else:
        raise ValueError('Unsupported language "%s"' % lang)

    if not self._print_tags:
        args.append('--no-tags')
    if self._print_placeholders:
        args.append('--print-placeholders')

    command = mmt_javamain(self._java_mainclass, args=args)

    with open(input_path) as input_stream:
        with open(output_path, 'w') as output_stream:
            shell.execute(command, stdin=input_stream, stdout=output_stream)
def create_index(self, corpora, lang, log_file=None):
    source_paths = set()

    for corpus in corpora:
        source_paths.add(corpus.get_folder())

    fileutils.makedirs(self._index, exist_ok=True)

    args = ['-l', lang, '-i', self._index, '-c']
    for source_path in source_paths:
        args.append(source_path)

    command = mmt_javamain(self._java_mainclass, args)

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'w')

        shell.execute(command, stdout=log, stderr=log)
    finally:
        if log_file is not None:
            log.close()
def _tune_run(self, decoder, corpora, lr, epochs, output_file, reference_file):
    with open(output_file, 'wb') as output:
        for source, target in corpora:
            if lr == 0.:
                suggestions = None
            else:
                suggestions = [Suggestion(source, target, 1.)]

            translation = decoder.translate(self.source_lang, self.target_lang, source,
                                            suggestions=suggestions,
                                            tuning_epochs=epochs,
                                            tuning_learning_rate=lr)

            output.write(translation.text.encode('utf-8'))
            output.write('\n')

    command = ['perl', self._bleu_script, reference_file]
    with open(output_file) as input_stream:
        stdout, _ = shell.execute(command, stdin=input_stream)

    return float(stdout) * 100
def generate(self, bilingual_corpora, monolingual_corpora, output, log=None):
    if log is None:
        log = shell.DEVNULL

    fileutils.makedirs(self._model, exist_ok=True)

    args = ['--db', os.path.join(self._model, 'domains.db'),
            '-s', self._source_lang, '-t', self._target_lang, '-c']
    source_paths = set([corpus.get_folder() for corpus in bilingual_corpora])
    for source_path in source_paths:
        args.append(source_path)

    command = cli.mmt_javamain(self._java_mainclass, args)
    stdout, _ = shell.execute(command, stderr=log)

    domains = {}
    for domain, name in [line.rstrip('\n').split('\t', 2) for line in stdout.splitlines()]:
        domains[name] = domain

    bilingual_corpora = [corpus.symlink(output, name=domains[corpus.name])
                         for corpus in bilingual_corpora]
    monolingual_corpora = [corpus.symlink(output) for corpus in monolingual_corpora]

    return bilingual_corpora, monolingual_corpora
def tune(self, corpora=None, debug=False, context_enabled=True, random_seeds=False,
         max_iterations=25):
    if corpora is None:
        corpora = BilingualCorpus.list(os.path.join(self.engine.data_path,
                                                    TrainingPreprocessor.DEV_FOLDER_NAME))

    if len(corpora) == 0:
        raise IllegalArgumentException('empty corpora')

    if not self.is_running():
        raise IllegalStateException('No MMT Server running, start the engine first')

    tokenizer = Tokenizer()

    target_lang = self.engine.target_lang
    source_lang = self.engine.source_lang

    source_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(),
                                                    [source_lang])
                      for corpus in corpora]
    reference_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(),
                                                       [target_lang])
                         for corpus in corpora]

    cmdlogger = _tuning_logger(4)
    cmdlogger.start(self, corpora)

    working_dir = self.engine.get_tempdir('tuning')
    mert_wd = os.path.join(working_dir, 'mert')

    try:
        # Tokenization
        tokenized_output = os.path.join(working_dir, 'reference_corpora')
        fileutils.makedirs(tokenized_output, exist_ok=True)

        with cmdlogger.step('Corpora tokenization') as _:
            reference_corpora = tokenizer.process_corpora(reference_corpora, tokenized_output)

        # Create merged corpus
        with cmdlogger.step('Merging corpus') as _:
            # source
            source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)

            with open(source_merged_corpus, 'wb') as out:
                for corpus in source_corpora:
                    out.write(corpus.get_file(source_lang) + '\n')

            # target
            target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
            fileutils.merge([corpus.get_file(target_lang) for corpus in reference_corpora],
                            target_merged_corpus)

        # Run MERT algorithm
        with cmdlogger.step('Tuning') as _:
            # Start MERT
            decoder_flags = ['--port', str(self.api.port)]

            if not context_enabled:
                decoder_flags.append('--skip-context-analysis')
                decoder_flags.append('1')

            fileutils.makedirs(mert_wd, exist_ok=True)

            with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                           self._mert_i_script, runtime_moses_ini.name,
                           '--threads', str(multiprocessing.cpu_count()),
                           '--mertdir', cli.BIN_DIR,
                           '--mertargs', '\'--binary --sctype BLEU\'',
                           '--working-dir', mert_wd, '--nbest', '100',
                           '--decoder-flags', '"' + ' '.join(decoder_flags) + '"',
                           '--nonorm', '--closest', '--no-filter-phrase-table']

                if not random_seeds:
                    command.append('--predictable-seeds')
                if max_iterations > 0:
                    command.append('--maximum-iterations={num}'.format(num=max_iterations))

                with open(self.engine.get_logfile('mert'), 'wb') as log:
                    shell.execute(' '.join(command), stdout=log, stderr=log)

        # Read optimized configuration
        with cmdlogger.step('Applying changes') as _:
            bleu_score = 0
            weights = {}
            found_weights = False

            with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                for line in moses_ini:
                    line = line.strip()

                    if len(line) == 0:
                        continue
                    elif found_weights:
                        tokens = line.split()
                        weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                    elif line.startswith('# BLEU'):
                        bleu_score = float(line.split()[2])
                    elif line == '[weight]':
                        found_weights = True

            _ = self.api.update_features(weights)

        cmdlogger.completed(bleu_score)
    finally:
        if not debug:
            self.engine.clear_tempdir()
def align(self, corpus, langs, model_dir, working_dir='.', log_file=None):
    WordAligner.align(self, corpus, langs, working_dir, log_file)

    l1 = langs[0]
    l2 = langs[1]
    corpus_name = 'corpus'
    langs_suffix = l1 + '-' + l2

    fwd_file = os.path.join(working_dir, corpus_name + '.' + langs_suffix + '.fwd')
    bwd_file = os.path.join(working_dir, corpus_name + '.' + langs_suffix + '.bwd')
    bal_file = os.path.join(working_dir, corpus_name + '.' + langs_suffix + '.bal')
    aligned_file_path = os.path.join(working_dir, corpus_name + '.' + langs_suffix + '.aligned')

    corpus_l1 = corpus.get_file(l1)
    corpus_l2 = corpus.get_file(l2)

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'a')

        with open(corpus_l1) as source_corpus, \
                open(corpus_l2) as target_corpus, \
                open(aligned_file_path, 'w') as aligned_file:
            for x, y in zip(source_corpus, target_corpus):
                aligned_file.write(x.strip() + ' ||| ' + y.strip() + '\n')

        cpus = multiprocessing.cpu_count()

        # Create forward model
        fwd_model = os.path.join(model_dir, 'model.align.fwd')
        command = [self._align_bin, '-d', '-v', '-o', '-n', str(cpus), '-B',
                   '-p', fwd_model, '-i', aligned_file_path]
        shell.execute(command, stderr=log)

        # Compute forward alignments
        command = [self._align_bin, '-d', '-v', '-o', '-n', str(cpus), '-B',
                   '-f', fwd_model, '-i', aligned_file_path]
        with open(fwd_file, 'w') as stdout:
            shell.execute(command, stdout=stdout, stderr=log)

        # Create backward model
        bwd_model = os.path.join(model_dir, 'model.align.bwd')
        command = [self._align_bin, '-d', '-v', '-o', '-n', str(cpus), '-B',
                   '-p', bwd_model, '-r', '-i', aligned_file_path]
        shell.execute(command, stderr=log)

        # Compute backward alignments
        command = [self._align_bin, '-d', '-v', '-o', '-n', str(cpus), '-B',
                   '-f', bwd_model, '-r', '-i', aligned_file_path]
        with open(bwd_file, 'w') as stdout:
            shell.execute(command, stdout=stdout, stderr=log)
    finally:
        if log_file is not None:
            log.close()

    encoder = _FastAlignBALEncoder(corpus, langs, fwd_file, bwd_file)
    encoder.encode(bal_file)

    return bal_file
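# Note: the 'source ||| target' lines written to aligned_file_path above are
# fast_align's standard parallel-corpus input format, e.g.
#
#   das Haus ist klein ||| the house is small
#
# As the inline comments indicate, each direction is a two-pass run of this
# (modified) fast_align binary: one pass estimates the model (-p), a second
# applies it (-f), and -r selects the reverse (target-to-source) direction;
# the two alignment sets are symmetrized downstream (see the symal step in
# the train() method below).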
def train(self, corpora, aligner, working_dir='.', log_file=None):
    if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
        raise Exception('Model already exists at ' + self._model)

    if not os.path.isdir(self._model):
        fileutils.makedirs(self._model, exist_ok=True)

    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)

    l1 = self._source_lang
    l2 = self._target_lang
    langs = (l1, l2)
    langs_suffix = l1 + '-' + l2

    mct_base = self._get_model_basename()
    dmp_file = mct_base + '.dmp'
    mam_file = mct_base + '.' + langs_suffix + '.mam'
    lex_file = mct_base + '.' + langs_suffix + '.lex'

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'a')

        # Clean corpus for training
        clean_output = os.path.join(working_dir, 'clean_corpora')
        fileutils.makedirs(clean_output, exist_ok=True)
        corpora = self._cleaner.clean(corpora, clean_output,
                                      (self._source_lang, self._target_lang))

        # Create merged corpus and domains list file (dmp)
        merged_corpus = BilingualCorpus.make_parallel(os.path.basename(mct_base),
                                                      working_dir, langs)
        fileutils.merge([corpus.get_file(l1) for corpus in corpora],
                        merged_corpus.get_file(l1))
        fileutils.merge([corpus.get_file(l2) for corpus in corpora],
                        merged_corpus.get_file(l2))
        with open(dmp_file, 'w') as dmp:
            for corpus in corpora:
                dmp.write(str(corpus.name) + ' ' + str(corpus.count_lines()) + '\n')

        # Create alignments in 'bal' file and symmetrize
        bal_file = aligner.align(merged_corpus, langs, self._model, working_dir, log_file)

        symal_file = os.path.join(working_dir, 'alignments.' + langs_suffix + '.symal')
        symal_command = [self._symal_bin, '-a=g', '-d=yes', '-f=yes', '-b=yes']
        with open(bal_file) as stdin:
            with open(symal_file, 'w') as stdout:
                shell.execute(symal_command, stdin=stdin, stdout=stdout, stderr=log)

        # Execute mtt-build
        mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, l1)
        with open(merged_corpus.get_file(l1)) as stdin:
            shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

        mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, l2)
        with open(merged_corpus.get_file(l2)) as stdin:
            shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

        # Create 'mam' file
        mam_command = [self._symal2mam_bin, mam_file]
        with open(symal_file) as stdin:
            shell.execute(mam_command, stdin=stdin, stdout=log, stderr=log)

        # Create 'lex' file
        lex_command = [self._mmlexbuild_bin, mct_base + '.', l1, l2, '-o', lex_file]
        shell.execute(lex_command, stdout=log, stderr=log)
    finally:
        if log_file is not None:
            log.close()
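# Note: the symal invocation above ('-a=g -d=yes -f=yes -b=yes') selects the
# grow heuristic with the diagonal, final, and both-directions extensions,
# i.e. the classic grow-diag-final-and symmetrization of the forward and
# backward word alignments produced by the aligner.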
def tune(self, corpora=None, debug=False, context_enabled=True, random_seeds=False,
         max_iterations=25, early_stopping_value=None):
    if corpora is None:
        corpora = BilingualCorpus.list(os.path.join(self.engine.data_path,
                                                    TrainingPreprocessor.DEV_FOLDER_NAME))

    target_lang = self.engine.target_lang
    source_lang = self.engine.source_lang

    corpora = [corpus for corpus in corpora
               if source_lang in corpus.langs and target_lang in corpus.langs]
    if len(corpora) == 0:
        raise IllegalArgumentException('No %s > %s corpora found into specified path' %
                                       (source_lang, target_lang))

    source_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(),
                                                    [source_lang])
                      for corpus in corpora]
    reference_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(),
                                                       [target_lang])
                         for corpus in corpora]

    cmdlogger = _tuning_logger(4)
    cmdlogger.start(self, corpora)

    working_dir = self.engine.get_tempdir('tuning')
    mert_wd = os.path.join(working_dir, 'mert')

    try:
        # Tokenization
        tokenizer = Tokenizer(target_lang)
        tokenized_output = os.path.join(working_dir, 'reference_corpora')
        fileutils.makedirs(tokenized_output, exist_ok=True)

        with cmdlogger.step('Corpora tokenization') as _:
            reference_corpora = tokenizer.process_corpora(reference_corpora, tokenized_output)

        # Create merged corpus
        with cmdlogger.step('Merging corpus') as _:
            # source
            source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)

            with open(source_merged_corpus, 'wb') as out:
                for corpus in source_corpora:
                    out.write(corpus.get_file(source_lang) + '\n')

            # target
            target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
            fileutils.merge([corpus.get_file(target_lang) for corpus in reference_corpora],
                            target_merged_corpus)

        # Run MERT algorithm
        with cmdlogger.step('Tuning') as _:
            # Start MERT
            decoder_flags = ['--port', str(self.api.port)]
            if self.api.root is not None:
                decoder_flags += ['--root', self.api.root]

            if not context_enabled:
                decoder_flags.append('--skip-context-analysis')
                decoder_flags.append('1')

            fileutils.makedirs(mert_wd, exist_ok=True)

            with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                           self._mert_i_script, runtime_moses_ini.name,
                           '--threads', str(multiprocessing.cpu_count()),
                           '--mertdir', cli.BIN_DIR,
                           '--mertargs', '\'--binary --sctype BLEU\'',
                           '--working-dir', mert_wd, '--nbest', '100',
                           '--decoder-flags', '"' + ' '.join(decoder_flags) + '"',
                           '--nonorm', '--closest', '--no-filter-phrase-table']

                if early_stopping_value is not None:
                    command += ['--bleuscorer', self._scorer_script,
                                '--bleuscorer-flags "-nt" --early-stopping-value %d'
                                % early_stopping_value]

                if not random_seeds:
                    command.append('--predictable-seeds')
                if max_iterations > 0:
                    command.append('--maximum-iterations={num}'.format(num=max_iterations))

                with open(self.engine.get_logfile('mert'), 'wb') as log:
                    shell.execute(' '.join(command), stdout=log, stderr=log)

        # Read optimized configuration
        with cmdlogger.step('Applying changes') as _:
            bleu_score = 0
            weights = {}
            found_weights = False

            with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                for line in moses_ini:
                    line = line.strip()

                    if len(line) == 0:
                        continue
                    elif found_weights:
                        tokens = line.split()
                        weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                    elif line.startswith('# BLEU'):
                        bleu_score = float(line.split()[2])
                    elif line == '[weight]':
                        found_weights = True

            _ = self.api.update_features(weights)

        cmdlogger.completed(bleu_score)
    finally:
        if not debug:
            self.engine.clear_tempdir('tuning')
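# For reference, the moses.ini fragment the parser above expects looks like
# this (illustrative values, matching the '# BLEU', '[weight]' and
# 'Name= v1 v2 ...' patterns it checks for):
#
#   # BLEU 0.2763 on dev corpus
#   [weight]
#   LM0= 0.0892
#   Distortion0= 0.0614 0.1201
#
# Every line after [weight] is split into a feature name (the trailing '=' is
# stripped) and its list of float weights, which are then pushed to the
# running engine via self.api.update_features(weights).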
def export(self, path, log=None):
    if log is None:
        log = shell.DEVNULL

    command = [self._export_bin, '--model', self._model, '--output', path]
    shell.execute(command, stderr=log, stdout=log)