def main_sweep(argv):
    """Sweep the suffix-array sample size and measure BLEU at each setting.

    Prints one '<sample> <bleu>' line per configuration to stdout.

    :param argv: command-line arguments; supports '-e/--engine' (engine name,
                 'default' if absent) and '--path' (test corpora location,
                 defaulting to the engine's automatically split test sample).
    :raises RuntimeError: if the evaluator reports an error string for 'MMT'.
    """
    parser = argparse.ArgumentParser(
        description='Sweep SA sample size and measure BLEU scores at various settings.')
    parser.add_argument('-e', '--engine', dest='engine', default=None,
                        help='the engine name, \'default\' will be used if absent')
    parser.add_argument('--path', dest='corpora_path', metavar='CORPORA', default=None,
                        help='the path to the test corpora (default is the automatically splitted sample)')
    args = parser.parse_args(argv)

    samples = [int(e) for e in
               '10 20 50 70 80 90 100 110 120 150 200 350 500 800 1000 2000 5000'.split()]

    node = ConfiguredClusterNode(args.engine)

    evaluator = Evaluator(node.engine, node)
    corpora = ParallelCorpus.list(args.corpora_path) if args.corpora_path is not None \
        else ParallelCorpus.list(os.path.join(node.engine.data_path,
                                              TrainingPreprocessor.TEST_FOLDER_NAME))
    # NOTE: the original copy-pasted line-counting loop ('lines') was dead code
    # (its result was never read) and has been removed.

    print('sample bleu')
    for sample in samples:
        # Re-point the engine's suffix-array sampling at this size and push
        # the configuration before each evaluation run.
        node.set('suffixarrays', 'sample', sample)
        node.apply_configs()

        scores = evaluator.evaluate(corpora=corpora, google_key='1234',
                                    heval_output=None, use_sessions=True, debug=False)

        engine_scores = scores['MMT']
        # The evaluator signals failure by returning an error string in place
        # of the per-metric dict.
        if isinstance(engine_scores, str):
            raise RuntimeError(engine_scores)

        bleu = engine_scores['bleu']
        print(sample, '%.2f' % (bleu * 100))
def clean(self, corpora, dest_folder, langs=None):
    """Clean all corpora in parallel, writing the results to *dest_folder*.

    When *langs* is not supplied it is derived from the language pair of the
    first corpus. Returns the list of cleaned corpora found in *dest_folder*.
    """
    if langs is None and len(corpora) > 0:
        first = corpora[0]
        langs = (first.langs[0], first.langs[1])

    # Build one (source, destination, langs) job per corpus for the pool.
    jobs = []
    for corpus in corpora:
        destination = ParallelCorpus(corpus.name, dest_folder, corpus.langs)
        jobs.append((corpus, destination, langs))

    self._pool_exec(self._clean_file, jobs)
    return ParallelCorpus.list(dest_folder)
def translate(self, corpora, output):
    """
    Translate the given corpora in parallel processing fashion.
    :param corpora: list of ParallelCorpus
    :param output: path to output directory
    :return: ([ParallelCorpus, ...], time_per_sentence, parallelism)
    """
    pool = multithread.Pool(self._threads)
    try:
        translations = []
        start_time = datetime.now()
        # Phase 1: submit every source line to the worker pool, remembering
        # which output file each pending translation belongs to.
        for corpus in corpora:
            self._before_translate(corpus)
            with open(corpus.get_file(self.source_lang)) as source:
                output_path = os.path.join(output, corpus.name + '.' + self.target_lang)
                for line in source:
                    translation = pool.apply_async(self._get_translation, (line, corpus))
                    translations.append((translation, output_path))
            # NOTE(review): placement reconstructed from collapsed source —
            # presumably the per-corpus teardown hook runs once per corpus,
            # after its lines are enqueued; confirm against the original file.
            self._after_translate(corpus)
        # Phase 2: collect results in submission order. Because jobs were
        # appended corpus-by-corpus, results for the same output file are
        # contiguous, so we only reopen a stream when output_path changes.
        elapsed_time = 0
        translation_count = 0
        path = None
        stream = None
        for translation_job, output_path in translations:
            # .get() blocks until the worker finishes; returns (text, seconds).
            translation, elapsed = translation_job.get()
            if output_path != path:
                if stream is not None:
                    stream.close()
                stream = open(output_path, 'wb')
                path = output_path
            stream.write(translation.encode('utf-8'))
            # NOTE(review): writing a str newline to a 'wb' stream — Python 2
            # semantics; would raise under Python 3.
            stream.write('\n')
            elapsed_time += elapsed
            translation_count += 1
        if stream is not None:
            stream.close()
        end_time = datetime.now()
        total_time = end_time - start_time
        # time_per_sentence = mean per-line decode time;
        # parallelism = total decode time / wall-clock time.
        # NOTE(review): divides by translation_count — empty corpora would
        # raise ZeroDivisionError.
        return ParallelCorpus.list(output), (elapsed_time / translation_count), (
            elapsed_time / total_time.total_seconds())
    finally:
        # Always tear the pool down, even if a worker raised.
        pool.terminate()
def main_sweep(argv):
    """Evaluate BLEU across a sweep of suffix-array sample sizes.

    Emits a '<sample> <bleu>' line on stdout for every configuration tried.

    :param argv: CLI arguments: '-e/--engine' selects the engine ('default'
                 when omitted); '--path' points at the test corpora (falls
                 back to the engine's auto-split test folder).
    :raises RuntimeError: when the 'MMT' entry of the evaluation result is an
                          error string rather than a score dict.
    """
    parser = argparse.ArgumentParser(
        description='Sweep SA sample size and measure BLEU scores at various settings.')
    parser.add_argument('-e', '--engine', dest='engine', default=None,
                        help='the engine name, \'default\' will be used if absent')
    parser.add_argument('--path', dest='corpora_path', metavar='CORPORA', default=None,
                        help='the path to the test corpora (default is the automatically splitted sample)')
    args = parser.parse_args(argv)

    sample_sizes = '10 20 50 70 80 90 100 110 120 150 200 350 500 800 1000 2000 5000'
    samples = [int(token) for token in sample_sizes.split()]

    node = ConfiguredClusterNode(args.engine)
    evaluator = Evaluator(node.engine, node)

    if args.corpora_path is not None:
        corpora = ParallelCorpus.list(args.corpora_path)
    else:
        default_path = os.path.join(node.engine.data_path,
                                    TrainingPreprocessor.TEST_FOLDER_NAME)
        corpora = ParallelCorpus.list(default_path)
    # The original's unused line-counting loop ('lines') was copy-paste
    # residue and has been dropped.

    print('sample bleu')
    for sample in samples:
        # Apply this sample size to the engine configuration before scoring.
        node.set('suffixarrays', 'sample', sample)
        node.apply_configs()

        scores = evaluator.evaluate(corpora=corpora, google_key='1234',
                                    heval_output=None, use_sessions=True, debug=False)

        engine_scores = scores['MMT']
        # Failure is reported as an error string instead of a metrics dict.
        if isinstance(engine_scores, str):
            raise RuntimeError(engine_scores)

        print(sample, '%.2f' % (engine_scores['bleu'] * 100))
def encode(self, corpora, dest_folder):
    """Encode each per-language file of *corpora* into *dest_folder*.

    Runs self.encode_file for every (corpus, language) file and returns the
    resulting corpora listed from *dest_folder*.
    """
    # Create the destination directory on first use.
    if not os.path.isdir(dest_folder):
        fileutils.makedirs(dest_folder, exist_ok=True)

    for corpus in corpora:
        name = corpus.name
        for lang in corpus.langs:
            target = ParallelCorpus(name, dest_folder, [lang]).get_file(lang)
            self.encode_file(corpus.get_file(lang), target)

    return ParallelCorpus.list(dest_folder)
def process(self, corpora, dest_folder, print_tags=True, print_placeholders=False, original_spacing=False):
    """Preprocess every (corpus, language) file into *dest_folder*.

    Each file is run through self.__process_file with the given formatting
    flags; returns the processed corpora listed from *dest_folder*.
    """
    for corpus in corpora:
        for lang in corpus.langs:
            out_file = ParallelCorpus(corpus.name, dest_folder, [lang]).get_file(lang)
            in_file = corpus.get_file(lang)
            self.__process_file(in_file, out_file, lang,
                                print_tags, print_placeholders, original_spacing)
    return ParallelCorpus.list(dest_folder)
def tune(self, corpora=None, tokenize=True, debug=False, context_enabled=True):
    """Tune the engine's feature weights with MERT against dev corpora.

    :param corpora: dev ParallelCorpus list; defaults to the engine's DEV folder.
    :param tokenize: when True, tokenize the corpora first (adds a logged step).
    :param debug: when True, keep the temporary working directory afterwards.
    :param context_enabled: when False, pass '--skip-context-analysis 1' to the decoder.
    :raises IllegalArgumentException: if the corpora list is empty.
    :raises IllegalStateException: if the MMT server is not running.
    """
    if corpora is None:
        corpora = ParallelCorpus.list(os.path.join(self.engine.data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
    if len(corpora) == 0:
        raise IllegalArgumentException('empty corpora')
    if not self.is_running():
        raise IllegalStateException('No MMT Server running, start the engine first')

    target_lang = self.engine.target_lang
    source_lang = self.engine.source_lang

    # 4 logged steps when tokenizing, otherwise 3.
    cmdlogger = _tuning_logger(4 if tokenize else 3)
    cmdlogger.start(self, corpora)

    working_dir = self.engine.get_tempdir('tuning')
    mert_wd = os.path.join(working_dir, 'mert')

    try:
        original_corpora = corpora

        # Tokenization
        tokenized_corpora = original_corpora
        if tokenize:
            tokenizer_output = os.path.join(working_dir, 'tokenized_corpora')
            fileutils.makedirs(tokenizer_output, exist_ok=True)
            with cmdlogger.step('Corpus tokenization') as _:
                tokenized_corpora = self.engine.preprocessor.process(
                    corpora, tokenizer_output, print_tags=False,
                    print_placeholders=True, original_spacing=False)

        # Create merged corpus
        with cmdlogger.step('Merging corpus') as _:
            # The source "merged corpus" is an index file: one
            # 'tokenized_path:original_path' line per corpus.
            source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)
            with open(source_merged_corpus, 'wb') as out:
                original_root = original_corpora[0].root
                for corpus in tokenized_corpora:
                    tokenized = corpus.get_file(source_lang)
                    original = os.path.join(original_root, corpus.name + '.' + source_lang)
                    out.write(tokenized + ':' + original + '\n')
            # The target side is the actual concatenation of target files.
            target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
            fileutils.merge([corpus.get_file(target_lang) for corpus in tokenized_corpora],
                            target_merged_corpus)

        # Run MERT algorithm
        with cmdlogger.step('Tuning') as _:
            # Start MERT
            decoder_flags = ['--port', str(self.api.port)]
            if not context_enabled:
                decoder_flags.append('--skip-context-analysis')
                decoder_flags.append('1')

            fileutils.makedirs(mert_wd, exist_ok=True)

            # The temp file receives the tuned moses.ini produced by the script.
            with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                           self._mert_i_script, runtime_moses_ini.name,
                           '--threads', str(multiprocessing.cpu_count()),
                           '--mertdir', os.path.join(Moses.bin_path, 'bin'),
                           '--mertargs', '\'--binary --sctype BLEU\'',
                           '--working-dir', mert_wd, '--nbest', '100',
                           '--decoder-flags', '"' + ' '.join(decoder_flags) + '"',
                           '--nonorm', '--closest', '--no-filter-phrase-table']
                with open(self.engine.get_logfile('mert'), 'wb') as log:
                    # Joined into a single shell string; both streams go to the log.
                    shell.execute(' '.join(command), stdout=log, stderr=log)

        # Read optimized configuration
        with cmdlogger.step('Applying changes') as _:
            bleu_score = 0
            weights = {}
            found_weights = False

            # Parse MERT's moses.ini: '# BLEU <score>' carries the final score;
            # every non-empty line after '[weight]' is 'name= v1 v2 ...'.
            with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                for line in moses_ini:
                    line = line.strip()
                    if len(line) == 0:
                        continue
                    elif found_weights:
                        tokens = line.split()
                        weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                    elif line.startswith('# BLEU'):
                        bleu_score = float(line.split()[2])
                    elif line == '[weight]':
                        found_weights = True

            # Push the tuned weights to the running server.
            _ = self.api.update_features(weights)

        cmdlogger.completed(bleu_score)
    finally:
        if not debug:
            self.engine.clear_tempdir()
def tune(self, corpora=None, tokenize=True, debug=False, context_enabled=True):
    """Run MERT weight tuning for this engine against dev corpora.

    :param corpora: list of dev ParallelCorpus; defaults to the engine DEV folder.
    :param tokenize: tokenize the corpora as a first step when True.
    :param debug: keep the temporary tuning directory when True.
    :param context_enabled: when False, the decoder gets '--skip-context-analysis 1'.
    :raises IllegalArgumentException: on an empty corpora list.
    :raises IllegalStateException: when no MMT server is running.
    """
    if corpora is None:
        corpora = ParallelCorpus.list(
            os.path.join(self.engine.data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
    if len(corpora) == 0:
        raise IllegalArgumentException('empty corpora')
    if not self.is_running():
        raise IllegalStateException('No MMT Server running, start the engine first')

    target_lang = self.engine.target_lang
    source_lang = self.engine.source_lang

    # One extra logged step when tokenization is requested.
    cmdlogger = _tuning_logger(4 if tokenize else 3)
    cmdlogger.start(self, corpora)

    working_dir = self.engine.get_tempdir('tuning')
    mert_wd = os.path.join(working_dir, 'mert')

    try:
        original_corpora = corpora

        # Tokenization
        tokenized_corpora = original_corpora
        if tokenize:
            tokenizer_output = os.path.join(working_dir, 'tokenized_corpora')
            fileutils.makedirs(tokenizer_output, exist_ok=True)
            with cmdlogger.step('Corpus tokenization') as _:
                tokenized_corpora = self.engine.preprocessor.process(
                    corpora, tokenizer_output, print_tags=False,
                    print_placeholders=True, original_spacing=False)

        # Create merged corpus
        with cmdlogger.step('Merging corpus') as _:
            # Source side: an index file mapping each tokenized file to its
            # original ('tokenized:original' per line).
            source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)
            with open(source_merged_corpus, 'wb') as out:
                original_root = original_corpora[0].root
                for corpus in tokenized_corpora:
                    tokenized = corpus.get_file(source_lang)
                    original = os.path.join(original_root, corpus.name + '.' + source_lang)
                    out.write(tokenized + ':' + original + '\n')
            # Target side: a plain concatenation of the target-language files.
            target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
            fileutils.merge([corpus.get_file(target_lang) for corpus in tokenized_corpora],
                            target_merged_corpus)

        # Run MERT algorithm
        with cmdlogger.step('Tuning') as _:
            # Start MERT
            decoder_flags = ['--port', str(self.api.port)]
            if not context_enabled:
                decoder_flags.append('--skip-context-analysis')
                decoder_flags.append('1')

            fileutils.makedirs(mert_wd, exist_ok=True)

            # The tuned moses.ini is written into this temp file by the script.
            with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                           self._mert_i_script, runtime_moses_ini.name,
                           '--threads', str(multiprocessing.cpu_count()),
                           '--mertdir', os.path.join(Moses.bin_path, 'bin'),
                           '--mertargs', '\'--binary --sctype BLEU\'',
                           '--working-dir', mert_wd, '--nbest', '100',
                           '--decoder-flags', '"' + ' '.join(decoder_flags) + '"',
                           '--nonorm', '--closest', '--no-filter-phrase-table']
                with open(self.engine.get_logfile('mert'), 'wb') as log:
                    # Executed as one shell string; stdout/stderr captured in the log.
                    shell.execute(' '.join(command), stdout=log, stderr=log)

        # Read optimized configuration
        with cmdlogger.step('Applying changes') as _:
            bleu_score = 0
            weights = {}
            found_weights = False

            # Parse the MERT output: '# BLEU <score>' gives the score, and all
            # non-empty lines following '[weight]' are 'name= v1 v2 ...' rows.
            with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                for line in moses_ini:
                    line = line.strip()
                    if len(line) == 0:
                        continue
                    elif found_weights:
                        tokens = line.split()
                        weights[tokens[0].rstrip('=')] = [
                            float(val) for val in tokens[1:]]
                    elif line.startswith('# BLEU'):
                        bleu_score = float(line.split()[2])
                    elif line == '[weight]':
                        found_weights = True

            # Apply the tuned feature weights on the live server.
            _ = self.api.update_features(weights)

        cmdlogger.completed(bleu_score)
    finally:
        if not debug:
            self.engine.clear_tempdir()