def tune(self, corpora=None, debug=False, context_enabled=True, random_seeds=False,
         max_iterations=25, early_stopping_value=None):
    if corpora is None:
        corpora = BilingualCorpus.list(
            os.path.join(self.engine.data_path, TrainingPreprocessor.DEV_FOLDER_NAME))

    target_lang = self.engine.target_lang
    source_lang = self.engine.source_lang

    corpora = [corpus for corpus in corpora
               if source_lang in corpus.langs and target_lang in corpus.langs]
    if len(corpora) == 0:
        raise IllegalArgumentException('No %s > %s corpora found in the specified path' %
                                       (source_lang, target_lang))

    source_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [source_lang])
                      for corpus in corpora]
    reference_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [target_lang])
                         for corpus in corpora]

    cmdlogger = _tuning_logger(4)
    cmdlogger.start(self, corpora)

    working_dir = self.engine.get_tempdir('tuning')
    mert_wd = os.path.join(working_dir, 'mert')

    try:
        # Tokenization
        tokenizer = Tokenizer(target_lang)
        tokenized_output = os.path.join(working_dir, 'reference_corpora')
        fileutils.makedirs(tokenized_output, exist_ok=True)

        with cmdlogger.step('Corpora tokenization') as _:
            reference_corpora = tokenizer.process_corpora(reference_corpora, tokenized_output)

        # Create merged corpus
        with cmdlogger.step('Merging corpus') as _:
            # source
            source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)
            with open(source_merged_corpus, 'wb') as out:
                for corpus in source_corpora:
                    out.write(corpus.get_file(source_lang) + '\n')

            # target
            target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
            fileutils.merge([corpus.get_file(target_lang) for corpus in reference_corpora],
                            target_merged_corpus)

        # Run MERT algorithm
        with cmdlogger.step('Tuning') as _:
            # Start MERT
            decoder_flags = ['--port', str(self.api.port)]
            if self.api.root is not None:
                decoder_flags += ['--root', self.api.root]
            if not context_enabled:
                decoder_flags.append('--skip-context-analysis')
                decoder_flags.append('1')

            fileutils.makedirs(mert_wd, exist_ok=True)

            with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                           self._mert_i_script, runtime_moses_ini.name,
                           '--threads', str(multiprocessing.cpu_count()),
                           '--mertdir', cli.BIN_DIR,
                           '--mertargs', '\'--binary --sctype BLEU\'',
                           '--working-dir', mert_wd,
                           '--nbest', '100',
                           '--decoder-flags', '"' + ' '.join(decoder_flags) + '"',
                           '--nonorm', '--closest', '--no-filter-phrase-table']

                if early_stopping_value is not None:
                    command += ['--bleuscorer', self._scorer_script,
                                '--bleuscorer-flags "-nt" --early-stopping-value %d' % early_stopping_value]

                if not random_seeds:
                    command.append('--predictable-seeds')
                if max_iterations > 0:
                    command.append('--maximum-iterations={num}'.format(num=max_iterations))

                with open(self.engine.get_logfile('mert'), 'wb') as log:
                    shell.execute(' '.join(command), stdout=log, stderr=log)

        # Read optimized configuration
        with cmdlogger.step('Applying changes') as _:
            bleu_score = 0
            weights = {}
            found_weights = False

            with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                for line in moses_ini:
                    line = line.strip()

                    if len(line) == 0:
                        continue
                    elif found_weights:
                        tokens = line.split()
                        weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                    elif line.startswith('# BLEU'):
                        bleu_score = float(line.split()[2])
                    elif line == '[weight]':
                        found_weights = True

            _ = self.api.update_features(weights)

        cmdlogger.completed(bleu_score)
    finally:
        if not debug:
            self.engine.clear_tempdir("tuning")
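# A minimal, self-contained sketch of the moses.ini parsing performed in the
# 'Applying changes' step above, shown against a hypothetical sample string;
# the real file is the one MERT writes into <mert_wd>/moses.ini.
if __name__ == '__main__':
    sample_ini = """# BLEU 27.43 on dev set
[weight]
Distortion0= 0.05
LM0= 0.5 0.12
TranslationModel0= 0.2 0.2 0.1 0.1
"""

    bleu_score = 0
    weights = {}
    found_weights = False

    for line in sample_ini.splitlines():
        line = line.strip()
        if len(line) == 0:
            continue
        elif found_weights:
            # each weight line becomes: feature name -> list of float values
            tokens = line.split()
            weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
        elif line.startswith('# BLEU'):
            bleu_score = float(line.split()[2])
        elif line == '[weight]':
            found_weights = True

    print bleu_score  # 27.43
    print weights     # {'Distortion0': [0.05], 'LM0': [0.5, 0.12], ...}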
def tune(self, corpora=None, debug=False, context_enabled=True, random_seeds=False, max_iterations=25):
    if corpora is None:
        corpora = BilingualCorpus.list(os.path.join(self.engine.data_path, TrainingPreprocessor.DEV_FOLDER_NAME))

    if len(corpora) == 0:
        raise IllegalArgumentException('empty corpora')

    if not self.is_running():
        raise IllegalStateException('No MMT Server running, start the engine first')

    tokenizer = Tokenizer()

    target_lang = self.engine.target_lang
    source_lang = self.engine.source_lang

    source_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [source_lang])
                      for corpus in corpora]
    reference_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [target_lang])
                         for corpus in corpora]

    cmdlogger = _tuning_logger(4)
    cmdlogger.start(self, corpora)

    working_dir = self.engine.get_tempdir('tuning')
    mert_wd = os.path.join(working_dir, 'mert')

    try:
        # Tokenization
        tokenized_output = os.path.join(working_dir, 'reference_corpora')
        fileutils.makedirs(tokenized_output, exist_ok=True)

        with cmdlogger.step('Corpora tokenization') as _:
            reference_corpora = tokenizer.process_corpora(reference_corpora, tokenized_output)

        # Create merged corpus
        with cmdlogger.step('Merging corpus') as _:
            # source
            source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)
            with open(source_merged_corpus, 'wb') as out:
                for corpus in source_corpora:
                    out.write(corpus.get_file(source_lang) + '\n')

            # target
            target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
            fileutils.merge([corpus.get_file(target_lang) for corpus in reference_corpora], target_merged_corpus)

        # Run MERT algorithm
        with cmdlogger.step('Tuning') as _:
            # Start MERT
            decoder_flags = ['--port', str(self.api.port)]
            if not context_enabled:
                decoder_flags.append('--skip-context-analysis')
                decoder_flags.append('1')

            fileutils.makedirs(mert_wd, exist_ok=True)

            with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                           self._mert_i_script, runtime_moses_ini.name,
                           '--threads', str(multiprocessing.cpu_count()),
                           '--mertdir', cli.BIN_DIR,
                           '--mertargs', '\'--binary --sctype BLEU\'',
                           '--working-dir', mert_wd,
                           '--nbest', '100',
                           '--decoder-flags', '"' + ' '.join(decoder_flags) + '"',
                           '--nonorm', '--closest', '--no-filter-phrase-table']

                if not random_seeds:
                    command.append('--predictable-seeds')
                if max_iterations > 0:
                    command.append('--maximum-iterations={num}'.format(num=max_iterations))

                with open(self.engine.get_logfile('mert'), 'wb') as log:
                    shell.execute(' '.join(command), stdout=log, stderr=log)

        # Read optimized configuration
        with cmdlogger.step('Applying changes') as _:
            bleu_score = 0
            weights = {}
            found_weights = False

            with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                for line in moses_ini:
                    line = line.strip()

                    if len(line) == 0:
                        continue
                    elif found_weights:
                        tokens = line.split()
                        weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                    elif line.startswith('# BLEU'):
                        bleu_score = float(line.split()[2])
                    elif line == '[weight]':
                        found_weights = True

            _ = self.api.update_features(weights)

        cmdlogger.completed(bleu_score)
    finally:
        if not debug:
            self.engine.clear_tempdir()
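# A minimal sketch of how the decoder flags assembled above end up quoted
# before the whole MERT command line is joined and run through the shell.
# The port value is hypothetical.
if __name__ == '__main__':
    decoder_flags = ['--port', '8045', '--skip-context-analysis', '1']
    print '--decoder-flags ' + '"' + ' '.join(decoder_flags) + '"'
    # --decoder-flags "--port 8045 --skip-context-analysis 1"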
class _DocumentTranslator:
    def __init__(self, corpus, nbest, nbest_file, workers=100):
        self.corpus = corpus
        self.nbest = nbest
        self.nbest_file = nbest_file
        self.weights = None
        self.skip_context = False

        self._line_id = 0
        self._pool = multithread.Pool(workers)
        self._features = None
        self._tokenizer = Tokenizer()

    def set_skipcontext(self, skip_context):
        self.skip_context = skip_context

    def set_weights(self, raw):
        if raw is None:
            self.weights = None
            return

        self.weights = {}
        array = []

        for token in raw.split():
            if token[-1:] == '=':
                self.weights[token.rstrip('=')] = array = []
            else:
                array.append(float(token))

    def _get_translation(self, line, nbest, session):
        translation = Api.translate(line, session=session, nbest=nbest)

        # tokenize
        translation['translation'] = self._tokenizer.process(translation['translation'])
        for hyp in translation['nbest']:
            hyp['translation'] = self._tokenizer.process(hyp['translation'])

        return translation

    def _print(self, translation, nbest_out):
        print translation['translation'].encode('utf-8')
        sys.stdout.flush()

        for hyp in translation['nbest']:
            scores = []

            for feature in self._features:
                if feature in hyp['scores']:
                    scores.append(feature + '=')
                    for s in hyp['scores'][feature]:
                        scores.append(str(s))

            nbest_out.write(str(self._line_id))
            nbest_out.write(' ||| ')
            nbest_out.write(hyp['translation'].encode('utf-8'))
            nbest_out.write(' ||| ')
            nbest_out.write(' '.join(scores))
            nbest_out.write(' ||| ')
            nbest_out.write(str(hyp['totalScore']))
            nbest_out.write('\n')

    def run(self):
        self._line_id = 0

        try:
            if self.weights is not None:
                Api.update_features(self.weights)
                time.sleep(1)

            self._features = _sorted_features_list()

            translations = []
            sessions = []

            # Enqueue translation requests
            with open(self.corpus) as source:
                for line in source:
                    corpus_path = line.strip()

                    session = None
                    if not self.skip_context:
                        context = Api.get_context_f(corpus_path)
                        session = Api.create_session(context)['id']
                        sessions.append(session)

                    with open(corpus_path) as doc:
                        for docline in doc:
                            translation = self._pool.apply_async(self._get_translation,
                                                                 (docline, self.nbest, session))
                            translations.append(translation)

            # Collect and output results
            with open(self.nbest_file, 'ab') as nbest_out:
                for translation_job in translations:
                    translation = translation_job.get()

                    self._print(translation, nbest_out)
                    self._line_id += 1

            # Close sessions
            for session in sessions:
                try:
                    Api.close_session(session)
                except:
                    # ignore it
                    pass
        finally:
            self._pool.terminate()
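# A minimal, standalone sketch of the weight-string parsing done by
# _DocumentTranslator.set_weights() above. The helper name _parse_weights and
# the feature names/values are hypothetical, used only to illustrate the
# expected "Feature= v1 v2 ..." input format.
def _parse_weights(raw):
    weights = {}
    array = []
    for token in raw.split():
        if token[-1:] == '=':
            # a token ending in '=' starts a new feature; its values follow
            weights[token.rstrip('=')] = array = []
        else:
            array.append(float(token))
    return weights

if __name__ == '__main__':
    print _parse_weights('Distortion0= 0.05 LM0= 0.5 0.12')
    # {'Distortion0': [0.05], 'LM0': [0.5, 0.12]}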