def _step_preprocess(self, bilingual_corpora, monolingual_corpora, _, skip=False, logger=None, delete_on_exit=False):
    preprocessed_folder = self._get_tempdir('preprocessed')
    cleaned_folder = self._get_tempdir('clean_corpora')

    # if skip is true, then we are in resume mode, so return the already existing results
    if skip:
        processed_bicorpora, processed_monocorpora = BilingualCorpus.splitlist(
            self._engine.source_lang, self._engine.target_lang, roots=preprocessed_folder)
        cleaned_bicorpora = BilingualCorpus.list(cleaned_folder)
    else:
        processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
            bilingual_corpora + monolingual_corpora, preprocessed_folder,
            (self._engine.data_path if self._split_trainingset else None), log=logger.stream)
        cleaned_bicorpora = self._engine.training_preprocessor.clean(processed_bicorpora, cleaned_folder)

    return processed_bicorpora, processed_monocorpora, cleaned_bicorpora
def process(self, corpora, dest_folder, print_tags=True, print_placeholders=False, original_spacing=False):
    for corpus in corpora:
        for lang in corpus.langs:
            source = corpus.get_file(lang)
            dest = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])

            self.__process_file(source, dest, lang, print_tags, print_placeholders, original_spacing)

    return BilingualCorpus.list(dest_folder)
def process_corpora(self, corpora, dest_folder):
    fileutils.makedirs(dest_folder, exist_ok=True)

    for corpus in corpora:
        for lang in corpus.langs:
            source = corpus.get_file(lang)
            dest = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])

            self.process_file(source, dest, lang)

    return BilingualCorpus.list(dest_folder)
def encode(self, corpora, dest_folder):
    if not os.path.isdir(dest_folder):
        fileutils.makedirs(dest_folder, exist_ok=True)

    for corpus in corpora:
        for lang in corpus.langs:
            source = corpus.get_file(lang)
            dest_file = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang]).get_file(lang)

            self.encode_file(source, dest_file, delete_nl=True)

    return BilingualCorpus.list(dest_folder)
def _make_training_folder(self, bilingual_corpora, monolingual_corpora, domains, folder):
    for corpus in bilingual_corpora:
        dest_corpus = BilingualCorpus.make_parallel(domains[corpus.name], folder, corpus.langs)

        for lang in corpus.langs:
            os.symlink(corpus.get_file(lang), dest_corpus.get_file(lang))

    for corpus in monolingual_corpora:
        dest_corpus = BilingualCorpus.make_parallel(corpus.name, folder, corpus.langs)

        for lang in corpus.langs:
            os.symlink(corpus.get_file(lang), dest_corpus.get_file(lang))

    return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=folder)
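# Illustration only (not part of the original codebase): the staging pattern used above can be
# reproduced with plain os calls — existing corpus files are symlinked into a working folder so
# large corpora are never copied. All paths in the commented usage line are hypothetical.
import os
import tempfile


def stage_corpus_files(files, folder):
    """Symlink a list of existing corpus files into a staging folder and return the new paths."""
    staged = []
    for path in files:
        dest = os.path.join(folder, os.path.basename(path))
        os.symlink(os.path.abspath(path), dest)  # link instead of copy
        staged.append(dest)
    return staged


if __name__ == '__main__':
    staging_dir = tempfile.mkdtemp()
    # stage_corpus_files(['corpora/europarl.en', 'corpora/europarl.it'], staging_dir)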
def main_sweep(argv):
    parser = argparse.ArgumentParser(description='Sweep SA sample size and measure BLEU scores at various settings.')
    parser.add_argument('-e', '--engine', dest='engine', help='the engine name, \'default\' will be used if absent',
                        default=None)
    parser.add_argument('--path', dest='corpora_path', metavar='CORPORA', default=None,
                        help='the path to the test corpora (default is the automatically split sample)')

    args = parser.parse_args(argv)

    samples = [int(e) for e in '10 20 50 70 80 90 100 110 120 150 200 350 500 800 1000 2000 5000'.split()]

    injector = dependency.Injector()
    # injector.read_args(args)
    engine = MMTEngine(args.engine)
    injector.inject(engine)
    node = ClusterNode(engine, api_port=DEFAULT_MMT_API_PORT)

    # more or less copy-pasted from mmt evaluate:
    evaluator = Evaluator(node, google_key='1234', use_sessions=True)

    corpora = BilingualCorpus.list(args.corpora_path) if args.corpora_path is not None \
        else BilingualCorpus.list(os.path.join(node.engine.data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    lines = 0
    for corpus in corpora:
        lines += corpus.count_lines()
    # end copy-paste

    print('sample bleu')

    for sample in samples:
        node.engine.set_config_option('suffixarrays', 'sample', sample)
        injector.read_config(node.engine.config)  # to get engine.set() to affect MosesFeatures -> moses.ini
        injector.inject(node.engine)
        node.engine.write_configs()
        node.restart()

        scores = evaluator.evaluate(corpora=corpora, heval_output=None, debug=False)

        engine_scores = [r for r in scores if r.id == 'MMT'][0]

        if engine_scores.error:
            raise RuntimeError(engine_scores.error)

        bleu = engine_scores.bleu
        print(sample, '%.2f' % (bleu * 100))
def translate(self, corpora, output):
    """
    Translate the given corpora in parallel processing fashion.

    :param corpora: list of ParallelCorpus
    :param output: path to output directory
    :return: ([ParallelCorpus, ...], time_per_sentence, parallelism)
    """
    pool = multithread.Pool(self._threads)

    try:
        translations = []
        start_time = datetime.now()

        for corpus in corpora:
            self._before_translate(corpus)

            with open(corpus.get_file(self.source_lang)) as source:
                output_path = os.path.join(output, corpus.name + '.' + self.target_lang)

                for line in source:
                    translation = pool.apply_async(self._get_translation, (line, corpus))
                    translations.append((translation, output_path))

            self._after_translate(corpus)

        elapsed_time = 0
        translation_count = 0

        path = None
        stream = None

        for translation_job, output_path in translations:
            translation, elapsed = translation_job.get()

            if output_path != path:
                if stream is not None:
                    stream.close()

                stream = open(output_path, 'wb')
                path = output_path

            stream.write(translation.encode('utf-8'))
            stream.write('\n')

            elapsed_time += elapsed
            translation_count += 1

        if stream is not None:
            stream.close()

        end_time = datetime.now()
        total_time = end_time - start_time

        return BilingualCorpus.list(output), (elapsed_time / translation_count), \
            (elapsed_time / total_time.total_seconds())
    finally:
        pool.terminate()
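# Minimal sketch of the submission-order pattern used in translate() above (the real code uses its
# own multithread.Pool; here the standard library thread pool stands in, and fake_translate is a
# hypothetical placeholder for the per-line translation call). Keeping the AsyncResult objects in a
# list, in submission order, lets the caller write results back in the original line order even
# though workers may finish out of order.
from multiprocessing.dummy import Pool  # thread-based pool from the standard library


def fake_translate(line):
    return line.upper()  # placeholder for the real per-line translation request


if __name__ == '__main__':
    pool = Pool(4)
    try:
        jobs = [pool.apply_async(fake_translate, (line,)) for line in ['hello world', 'ciao mondo']]
        for job in jobs:  # iterate in submission order, not completion order
            print(job.get())
    finally:
        pool.terminate()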
def process(self, corpora, output_path, data_path=None):
    args = ['-s', self._source_lang, '-t', self._target_lang, '-v', self._vocabulary_path,
            '--output', output_path, '--input']

    input_paths = set([corpus.get_folder() for corpus in corpora])

    for root in input_paths:
        args.append(root)

    if data_path is not None:
        args.append('--dev')
        args.append(os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
        args.append('--test')
        args.append(os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=output_path)
def build(self, corpora, working_dir='.', log=None):
    if log is None:
        log = shell.DEVNULL

    shutil.rmtree(self._model, ignore_errors=True)
    fileutils.makedirs(self._model, exist_ok=True)

    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)

    merged_corpus = BilingualCorpus.make_parallel('merge', working_dir, (self._source_lang, self._target_lang))

    fileutils.merge([corpus.get_file(self._source_lang) for corpus in corpora],
                    merged_corpus.get_file(self._source_lang))
    fileutils.merge([corpus.get_file(self._target_lang) for corpus in corpora],
                    merged_corpus.get_file(self._target_lang))

    command = [self._build_bin,
               '-s', merged_corpus.get_file(self._source_lang),
               '-t', merged_corpus.get_file(self._target_lang),
               '-m', self._model,
               '-I', '4']
    shell.execute(command, stdout=log, stderr=log)
def train(self, corpora, aligner, working_dir='.', log=None):
    if log is None:
        log = shell.DEVNULL

    if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
        raise Exception('Model already exists at ' + self._model)

    if not os.path.isdir(self._model):
        fileutils.makedirs(self._model, exist_ok=True)

    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)

    train_corpora = []

    # Prepare training folder
    for corpus in corpora:
        dest_corpus = BilingualCorpus.make_parallel(corpus.name, working_dir,
                                                    (self._source_lang, self._target_lang))
        source_file = corpus.get_file(self._source_lang)
        target_file = corpus.get_file(self._target_lang)

        os.symlink(source_file, dest_corpus.get_file(self._source_lang))
        os.symlink(target_file, dest_corpus.get_file(self._target_lang))

        train_corpora.append(dest_corpus)

    # Align corpora
    aligner.align(train_corpora, working_dir, log=log)

    # Build models
    command = [self._build_bin, '--input', working_dir, '--model', self._model,
               '-s', self._source_lang, '-t', self._target_lang]
    shell.execute(command, stdout=log, stderr=log)
def train(self, corpora, aligner, working_dir='.', log_file=None):
    if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
        raise Exception('Model already exists at ' + self._model)

    if not os.path.isdir(self._model):
        fileutils.makedirs(self._model, exist_ok=True)

    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'a')

        # Prepare training folder
        for corpus in corpora:
            dest_corpus = BilingualCorpus.make_parallel(corpus.name, working_dir,
                                                        (self._source_lang, self._target_lang))
            source_file = corpus.get_file(self._source_lang)
            target_file = corpus.get_file(self._target_lang)

            os.symlink(source_file, dest_corpus.get_file(self._source_lang))
            os.symlink(target_file, dest_corpus.get_file(self._target_lang))

            aligner.align(corpus, os.path.join(working_dir, corpus.name + '.align'))

        # Build models
        command = [self._build_bin, '--input', working_dir, '--model', self._model,
                   '-s', self._source_lang, '-t', self._target_lang]
        shell.execute(command, stdout=log, stderr=log)
    finally:
        if log_file is not None:
            log.close()
def build(self, corpora, working_dir='.', log_file=None):
    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)
    if not os.path.isdir(self._model):
        fileutils.makedirs(self._model, exist_ok=True)

    merged_corpus = BilingualCorpus.make_parallel('merge', working_dir, (self._source_lang, self._target_lang))

    fileutils.merge([corpus.get_file(self._source_lang) for corpus in corpora],
                    merged_corpus.get_file(self._source_lang))
    fileutils.merge([corpus.get_file(self._target_lang) for corpus in corpora],
                    merged_corpus.get_file(self._target_lang))

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'a')

        # Train model
        command = [self._build_bin,
                   '-s', merged_corpus.get_file(self._source_lang),
                   '-t', merged_corpus.get_file(self._target_lang),
                   '-m', self._model,
                   '-I', '4']
        shell.execute(command, stderr=log)
    finally:
        if log_file is not None:
            log.close()
def _step_init(self, bilingual_corpora, monolingual_corpora, skip=False, logger=None, delete_on_exit=False):
    training_folder = self._get_tempdir('training_corpora')

    # if skip is true, then we are in resume mode, so return the already existing results
    if skip:
        bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(
            self._engine.source_lang, self._engine.target_lang, roots=training_folder)
    # else perform the baseline domain extraction and domain mapping, and return its result
    else:
        domains = self._engine.db.insert(bilingual_corpora)

        bilingual_corpora = [domain.corpus.symlink(training_folder, name=str(domain.id)) for domain in domains]
        monolingual_corpora = [corpus.symlink(training_folder) for corpus in monolingual_corpora]

    return bilingual_corpora, monolingual_corpora
def clean(self, source, target, input_paths, output_path):
    args = ['-s', source, '-t', target, '--output', output_path, '--input']

    for root in input_paths:
        args.append(root)

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return BilingualCorpus.splitlist(source, target, roots=output_path)[0]
def main_sweep(argv):
    parser = argparse.ArgumentParser(description='Sweep SA sample size and measure BLEU scores at various settings.')
    parser.add_argument('-e', '--engine', dest='engine', help='the engine name, \'default\' will be used if absent',
                        default=None)
    parser.add_argument('--path', dest='corpora_path', metavar='CORPORA', default=None,
                        help='the path to the test corpora (default is the automatically split sample)')

    args = parser.parse_args(argv)

    samples = [int(e) for e in '10 20 50 70 80 90 100 110 120 150 200 350 500 800 1000 2000 5000'.split()]

    node = ConfiguredClusterNode(args.engine)

    # more or less copy-pasted from mmt evaluate:
    evaluator = Evaluator(node.engine, node)

    corpora = BilingualCorpus.list(args.corpora_path) if args.corpora_path is not None \
        else BilingualCorpus.list(os.path.join(node.engine.data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    lines = 0
    for corpus in corpora:
        lines += corpus.count_lines()
    # end copy-paste

    print('sample bleu')

    for sample in samples:
        node.set('suffixarrays', 'sample', sample)
        node.apply_configs()

        scores = evaluator.evaluate(corpora=corpora, google_key='1234', heval_output=None,
                                    use_sessions=True, debug=False)

        engine_scores = scores['MMT']

        if isinstance(engine_scores, str):
            raise RuntimeError(engine_scores)

        bleu = engine_scores['bleu']
        print(sample, '%.2f' % (bleu * 100))
def clean(self, corpora, output_path):
    args = ['-s', self._source_lang, '-t', self._target_lang, '--output', output_path, '--input']

    input_paths = set([corpus.get_folder() for corpus in corpora])

    for root in input_paths:
        args.append(root)

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return BilingualCorpus.list(output_path)
def clean(self, corpora, output_path, log=None):
    if log is None:
        log = shell.DEVNULL

    args = ['-s', self._source_lang, '-t', self._target_lang, '--output', output_path, '--input']

    input_paths = set([corpus.get_folder() for corpus in corpora])

    for root in input_paths:
        args.append(root)

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdout=log, stderr=log)

    return BilingualCorpus.list(output_path)
def process(self, source, target, input_paths, output_path, data_path=None):
    args = ['-s', source, '-t', target, '--output', output_path, '--input']

    for root in input_paths:
        args.append(root)

    if data_path is not None:
        args.append('--dev')
        args.append(os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
        args.append('--test')
        args.append(os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return BilingualCorpus.splitlist(source, target, roots=output_path)
def _step_tm_cleanup(self, corpora, skip=False, logger=None, delete_on_exit=True):
    # the folder where tm_cleanup results are to be stored
    folder = self._get_tempdir('clean_tms')

    # if skip is true, then we are in resume mode, so use the already existing results
    if skip:
        clean_tms = BilingualCorpus.list(folder)
    # else perform the cleaning on the corpora and use the clean corpora
    else:
        clean_tms = self._engine.cleaner.clean(corpora, folder, log=logger.stream)

    return clean_tms
def clean(self, corpora, output_path, log=None):
    if log is None:
        log = shell.DEVNULL

    # read physical memory size and allow the Java process up to 90% of it
    mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')  # e.g. 4015976448
    mem_mb = mem_bytes / (1024. ** 2)  # e.g. 3830.1 (~3.74 GB)
    extended_heap_mb = int(mem_mb * 90 / 100)

    args = ['-s', self._source_lang, '-t', self._target_lang, '--output', output_path, '--input']

    input_paths = set([corpus.get_folder() for corpus in corpora])

    for root in input_paths:
        args.append(root)

    command = mmt_javamain(self._java_mainclass, args=args, max_heap_mb=extended_heap_mb)
    shell.execute(command, stdout=log, stderr=log)

    return BilingualCorpus.list(output_path)
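# Standalone version of the heap-size heuristic used in clean() above, for illustration: the Java
# cleaner is allowed to use up to 90% of physical RAM. os.sysconf with these keys is POSIX-only,
# which matches the environments this code targets.
import os


def max_java_heap_mb(ratio=0.9):
    mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')  # total physical memory in bytes
    mem_mb = mem_bytes / (1024. ** 2)
    return int(mem_mb * ratio)


if __name__ == '__main__':
    print(max_java_heap_mb())  # e.g. ~3447 on a machine with 4 GB of RAM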
def clean(self, corpora, dest_folder):
    langs = (self._source_lang, self._target_lang)

    _pool_exec(self._clean_file, [(corpus, dest_folder, langs) for corpus in corpora])

    return BilingualCorpus.list(dest_folder)
def clean(self, corpora, dest_folder, langs=None):
    if langs is None and len(corpora) > 0:
        langs = (corpora[0].langs[0], corpora[0].langs[1])

    self._pool_exec(self._clean_file, [(corpus, dest_folder, langs) for corpus in corpora])

    return BilingualCorpus.list(dest_folder)
def _build(self, resume=False):
    self._temp_dir = self._engine.get_tempdir('training', ensure=(not resume))

    self._checkpoint_path = os.path.join(self._temp_dir, 'checkpoint.json')
    self._passed_steps = []

    # initialize the checkpoint manager
    if resume:
        self.load_checkpoint()
    else:
        self.save_checkpoint()

    source_lang = self._engine.source_lang
    target_lang = self._engine.target_lang

    # separate bilingual and monolingual corpora into two lists, reading them from the roots
    bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(source_lang, target_lang, roots=self._roots)

    # if no bilingual corpora are found, it is not possible to train the translation system
    if len(bilingual_corpora) == 0:
        raise IllegalArgumentException(
            'your project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper()))

    # if no old engine folder can be found, or if we are not resuming an old one,
    # create the engine folder from scratch
    if not os.path.isdir(self._engine.path) or not resume:
        shutil.rmtree(self._engine.path, ignore_errors=True)
        os.makedirs(self._engine.path)

    # check if all requirements are fulfilled before launching engine training
    self._check_constraints()

    # create a new logger for the building activities, passing it the number of steps to perform
    # (plus one step the user cannot skip) and the name of the log file to create
    logger = _builder_logger(len(self._scheduled_steps) + 1, self._engine.get_logfile('training'))

    delete_on_exit = not self._debug

    # start the engine building (training) phases
    try:
        # tell the logger that the engine training has started
        logger.start(self._engine, bilingual_corpora, monolingual_corpora)

        # ~~~~~~~~~~~~~~~~~~~~~ RUN ALL STEPS ~~~~~~~~~~~~~~~~~~~~~
        # Note: if resume is true, a step is only run if it did not complete in the previous attempt

        # run the tm_cleanup step on the bilingual_corpora if required; obtain cleaned bicorpora
        cleaned_bicorpora = self._run_step('tm_cleanup', self._step_tm_cleanup, logger=logger,
                                           values=[bilingual_corpora], delete_on_exit=delete_on_exit)

        # run the __db_map step (always: the user can't skip it) on the cleaned bicorpora and the
        # original monocorpora; obtain base bicorpora and base monocorpora
        base_bicorpora, base_monocorpora = self._run_step('__db_map', self._step_init, forced=True,
                                                          values=[cleaned_bicorpora, monolingual_corpora],
                                                          delete_on_exit=delete_on_exit)

        # run the preprocess step if required;
        # obtain processed bi- and mono-corpora and cleaned bicorpora
        processed_bicorpora, processed_monocorpora, cleaned_bicorpora = \
            self._run_step('preprocess', self._step_preprocess, logger=logger,
                           values=[base_bicorpora, base_monocorpora, base_bicorpora],
                           delete_on_exit=delete_on_exit)

        # run the context_analyzer step on base_bicorpora if required
        _ = self._run_step('context_analyzer', self._step_context_analyzer, logger=logger,
                           values=[base_bicorpora], delete_on_exit=delete_on_exit)

        # run the aligner step on cleaned_bicorpora if required
        _ = self._run_step('aligner', self._step_aligner, logger=logger,
                           values=[cleaned_bicorpora], delete_on_exit=delete_on_exit)

        # run the tm step on cleaned_bicorpora if required
        _ = self._run_step('tm', self._step_tm, logger=logger,
                           values=[cleaned_bicorpora], delete_on_exit=delete_on_exit)

        # run the lm step on the joint list of processed_bicorpora and processed_monocorpora
        _ = self._run_step('lm', self._step_lm, logger=logger,
                           values=[processed_bicorpora + processed_monocorpora], delete_on_exit=delete_on_exit)

        # Writing config file
        with logger.step('Writing config files') as _:
            self._engine.write_configs()

        # tell the logger that the engine training has completed
        logger.completed()

        # if this is not debug mode, then the training temporary folder must be deleted
        if not self._debug:
            self._engine.clear_tempdir('training')
    except:
        logger.error()
        raise
    finally:
        logger.close()
def train(self, corpora, aligner, working_dir='.', log_file=None):
    if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
        raise Exception('Model already exists at ' + self._model)

    if not os.path.isdir(self._model):
        fileutils.makedirs(self._model, exist_ok=True)

    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)

    l1 = self._source_lang
    l2 = self._target_lang
    langs = (l1, l2)
    langs_suffix = l1 + '-' + l2

    mct_base = self._get_model_basename()
    dmp_file = mct_base + '.dmp'
    mam_file = mct_base + '.' + langs_suffix + '.mam'
    lex_file = mct_base + '.' + langs_suffix + '.lex'

    log = shell.DEVNULL

    try:
        if log_file is not None:
            log = open(log_file, 'a')

        # Clean corpus for training
        clean_output = os.path.join(working_dir, 'clean_corpora')
        fileutils.makedirs(clean_output, exist_ok=True)
        corpora = self._cleaner.clean(corpora, clean_output, (self._source_lang, self._target_lang))

        # Create merged corpus and domains list file (dmp)
        merged_corpus = BilingualCorpus.make_parallel(os.path.basename(mct_base), working_dir, langs)

        fileutils.merge([corpus.get_file(l1) for corpus in corpora], merged_corpus.get_file(l1))
        fileutils.merge([corpus.get_file(l2) for corpus in corpora], merged_corpus.get_file(l2))
        with open(dmp_file, 'w') as dmp:
            for corpus in corpora:
                dmp.write(str(corpus.name) + ' ' + str(corpus.count_lines()) + '\n')

        # Create alignments in 'bal' file and symmetrize
        bal_file = aligner.align(merged_corpus, langs, self._model, working_dir, log_file)

        symal_file = os.path.join(working_dir, 'alignments.' + langs_suffix + '.symal')
        symal_command = [self._symal_bin, '-a=g', '-d=yes', '-f=yes', '-b=yes']
        with open(bal_file) as stdin:
            with open(symal_file, 'w') as stdout:
                shell.execute(symal_command, stdin=stdin, stdout=stdout, stderr=log)

        # Execute mtt-build
        mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, l1)
        with open(merged_corpus.get_file(l1)) as stdin:
            shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

        mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, l2)
        with open(merged_corpus.get_file(l2)) as stdin:
            shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

        # Create 'mam' file
        mam_command = [self._symal2mam_bin, mam_file]
        with open(symal_file) as stdin:
            shell.execute(mam_command, stdin=stdin, stdout=log, stderr=log)

        # Create 'lex' file
        lex_command = [self._mmlexbuild_bin, mct_base + '.', l1, l2, '-o', lex_file]
        shell.execute(lex_command, stdout=log, stderr=log)
    finally:
        if log_file is not None:
            log.close()
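# The shell.execute calls above redirect stdin/stdout/stderr to files; a minimal equivalent using
# the standard subprocess module is sketched here for illustration. The 'symal' binary name and all
# file paths in the commented usage line are hypothetical.
import subprocess


def run_filter(command, input_path, output_path, log_path):
    """Run a command reading stdin from input_path, writing stdout to output_path, appending stderr to log_path."""
    with open(input_path) as stdin, open(output_path, 'w') as stdout, open(log_path, 'a') as log:
        subprocess.check_call(command, stdin=stdin, stdout=stdout, stderr=log)


# run_filter(['symal', '-a=g', '-d=yes', '-f=yes', '-b=yes'],
#            'alignments.bal', 'alignments.en-it.symal', 'training.log')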
def tune(self, corpora=None, debug=False, context_enabled=True, random_seeds=False,
         max_iterations=25, early_stopping_value=None):
    if corpora is None:
        corpora = BilingualCorpus.list(os.path.join(self.engine.data_path, TrainingPreprocessor.DEV_FOLDER_NAME))

    target_lang = self.engine.target_lang
    source_lang = self.engine.source_lang

    corpora = [corpus for corpus in corpora if source_lang in corpus.langs and target_lang in corpus.langs]
    if len(corpora) == 0:
        raise IllegalArgumentException('No %s > %s corpora found in the specified path' % (source_lang, target_lang))

    source_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [source_lang])
                      for corpus in corpora]
    reference_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [target_lang])
                         for corpus in corpora]

    cmdlogger = _tuning_logger(4)
    cmdlogger.start(self, corpora)

    working_dir = self.engine.get_tempdir('tuning')
    mert_wd = os.path.join(working_dir, 'mert')

    try:
        # Tokenization
        tokenizer = Tokenizer(target_lang)
        tokenized_output = os.path.join(working_dir, 'reference_corpora')
        fileutils.makedirs(tokenized_output, exist_ok=True)

        with cmdlogger.step('Corpora tokenization') as _:
            reference_corpora = tokenizer.process_corpora(reference_corpora, tokenized_output)

        # Create merged corpus
        with cmdlogger.step('Merging corpus') as _:
            # source
            source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)

            with open(source_merged_corpus, 'wb') as out:
                for corpus in source_corpora:
                    out.write(corpus.get_file(source_lang) + '\n')

            # target
            target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
            fileutils.merge([corpus.get_file(target_lang) for corpus in reference_corpora], target_merged_corpus)

        # Run MERT algorithm
        with cmdlogger.step('Tuning') as _:
            # Start MERT
            decoder_flags = ['--port', str(self.api.port)]

            if self.api.root is not None:
                decoder_flags += ['--root', self.api.root]

            if not context_enabled:
                decoder_flags.append('--skip-context-analysis')
                decoder_flags.append('1')

            fileutils.makedirs(mert_wd, exist_ok=True)

            with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                           self._mert_i_script, runtime_moses_ini.name, '--threads',
                           str(multiprocessing.cpu_count()), '--mertdir', cli.BIN_DIR,
                           '--mertargs', '\'--binary --sctype BLEU\'', '--working-dir', mert_wd,
                           '--nbest', '100', '--decoder-flags', '"' + ' '.join(decoder_flags) + '"',
                           '--nonorm', '--closest', '--no-filter-phrase-table']

                if early_stopping_value is not None:
                    command += ['--bleuscorer', self._scorer_script,
                                '--bleuscorer-flags "-nt" --early-stopping-value %d' % early_stopping_value]

                if not random_seeds:
                    command.append('--predictable-seeds')
                if max_iterations > 0:
                    command.append('--maximum-iterations={num}'.format(num=max_iterations))

                with open(self.engine.get_logfile('mert'), 'wb') as log:
                    shell.execute(' '.join(command), stdout=log, stderr=log)

        # Read optimized configuration
        with cmdlogger.step('Applying changes') as _:
            bleu_score = 0
            weights = {}
            found_weights = False

            with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                for line in moses_ini:
                    line = line.strip()

                    if len(line) == 0:
                        continue
                    elif found_weights:
                        tokens = line.split()
                        weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                    elif line.startswith('# BLEU'):
                        bleu_score = float(line.split()[2])
                    elif line == '[weight]':
                        found_weights = True

            _ = self.api.update_features(weights)

        cmdlogger.completed(bleu_score)
    finally:
        if not debug:
            self.engine.clear_tempdir('tuning')
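# Self-contained sketch of the moses.ini parsing performed in the 'Applying changes' step above.
# The sample content below is made up; the real file is produced by MERT inside mert_wd.
SAMPLE_MOSES_INI = """\
# BLEU 0.2543 on dev set
[weight]
LM0= 0.5 0.25
Distortion0= 0.3
"""


def parse_tuned_weights(text):
    bleu_score, weights, found_weights = 0.0, {}, False
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        elif found_weights:
            tokens = line.split()
            weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
        elif line.startswith('# BLEU'):
            bleu_score = float(line.split()[2])
        elif line == '[weight]':
            found_weights = True
    return bleu_score, weights


if __name__ == '__main__':
    print(parse_tuned_weights(SAMPLE_MOSES_INI))  # (0.2543, {'LM0': [0.5, 0.25], 'Distortion0': [0.3]})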
def build(self, roots, debug=False, steps=None, split_trainingset=True):
    self._temp_dir = self._engine.get_tempdir('training', ensure=True)

    source_lang = self._engine.source_lang
    target_lang = self._engine.target_lang

    bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(source_lang, target_lang, roots=roots)

    if len(bilingual_corpora) == 0:
        raise IllegalArgumentException(
            'your project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper()))

    if steps is None:
        steps = self._engine.training_steps
    else:
        unknown_steps = [step for step in steps if step not in self._engine.training_steps]
        if len(unknown_steps) > 0:
            raise IllegalArgumentException('Unknown training steps: ' + str(unknown_steps))

    cmdlogger = _builder_logger(len(steps) + 1)
    cmdlogger.start(self._engine, bilingual_corpora, monolingual_corpora)

    shutil.rmtree(self._engine.path, ignore_errors=True)
    os.makedirs(self._engine.path)

    # Check disk space constraints
    free_space_on_disk = fileutils.df(self._engine.path)[2]
    corpus_size_on_disk = 0
    for root in roots:
        corpus_size_on_disk += fileutils.du(root)
    free_memory = fileutils.free()

    recommended_mem = self.__GB * corpus_size_on_disk / (350 * self.__MB)  # 1G RAM every 350M on disk
    recommended_disk = 10 * corpus_size_on_disk

    if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
        if free_memory < recommended_mem:
            print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                  (recommended_mem / self.__GB, free_memory / self.__GB)
        if free_space_on_disk < recommended_disk:
            print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                  (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
        print

    try:
        unprocessed_bicorpora = bilingual_corpora
        unprocessed_monocorpora = monolingual_corpora

        # TM draft-translations cleanup
        if 'tm_cleanup' in steps:
            with cmdlogger.step('TMs clean-up') as _:
                unprocessed_bicorpora = self._engine.cleaner.clean(
                    unprocessed_bicorpora, self._get_tempdir('clean_tms'))

        cleaned_bicorpora = unprocessed_bicorpora
        processed_bicorpora = unprocessed_bicorpora
        processed_monocorpora = unprocessed_monocorpora

        # Preprocessing
        if 'preprocess' in steps:
            with cmdlogger.step('Corpora preprocessing') as _:
                unprocessed_bicorpora, unprocessed_monocorpora = self._engine.db.generate(
                    unprocessed_bicorpora, unprocessed_monocorpora, self._get_tempdir('training_corpora'))

                processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                    unprocessed_bicorpora + unprocessed_monocorpora, self._get_tempdir('preprocessed'),
                    (self._engine.data_path if split_trainingset else None))

                cleaned_bicorpora = self._engine.training_preprocessor.clean(
                    processed_bicorpora, self._get_tempdir('clean_corpora'))

        # Training Context Analyzer
        if 'context_analyzer' in steps:
            with cmdlogger.step('Context Analyzer training') as _:
                log_file = self._engine.get_logfile('training.context')
                self._engine.analyzer.create_index(unprocessed_bicorpora, source_lang, log_file=log_file)

        # Aligner
        if 'aligner' in steps:
            with cmdlogger.step('Aligner training') as _:
                log_file = self._engine.get_logfile('training.aligner')
                working_dir = self._get_tempdir('aligner')
                self._engine.aligner.build(cleaned_bicorpora, working_dir, log_file)

        # Training Translation Model
        if 'tm' in steps:
            with cmdlogger.step('Translation Model training') as _:
                working_dir = self._get_tempdir('tm')
                log_file = self._engine.get_logfile('training.tm')
                self._engine.pt.train(cleaned_bicorpora, self._engine.aligner, working_dir, log_file)

        # Training Adaptive Language Model
        if 'lm' in steps:
            with cmdlogger.step('Language Model training') as _:
                working_dir = self._get_tempdir('lm')
                log_file = self._engine.get_logfile('training.lm')
                self._engine.lm.train(processed_bicorpora + processed_monocorpora, target_lang,
                                      working_dir, log_file)

        # Writing config file
        with cmdlogger.step('Writing config files') as _:
            self._engine.write_configs()

        cmdlogger.completed()
    finally:
        if not debug:
            self._engine.clear_tempdir('training')
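# Worked example of the resource heuristic used in build() above (constants assumed to mirror
# self.__GB / self.__MB; the 700 MB corpus size is made up for illustration).
GB = 1024 ** 3
MB = 1024 ** 2

corpus_size_on_disk = 700 * MB
recommended_mem = GB * corpus_size_on_disk / (350 * MB)  # 1 GB of RAM per 350 MB of corpus -> 2 GB
recommended_disk = 10 * corpus_size_on_disk              # 10x the corpus size on disk -> 7000 MB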
def main_sweep(argv):
    parser = argparse.ArgumentParser(description='Sweep SA sample size and measure BLEU scores at various settings.')
    parser.add_argument('-e', '--engine', dest='engine', help='the engine name, \'default\' will be used if absent',
                        default=None)
    parser.add_argument('--path', dest='corpora_path', metavar='CORPORA', default=None,
                        help='the path to the test corpora (default is the automatically split sample)')

    args = parser.parse_args(argv)

    samples = [int(e) for e in '10 20 50 70 80 90 100 110 120 150 200 350 500 800 1000 2000 5000'.split()]

    injector = dependency.DependencyInjector()
    # injector.read_args(args)
    engine = MMTEngine(args.engine)
    injector.inject(engine)
    node = ClusterNode(engine, api_port=DEFAULT_MMT_API_PORT)

    # more or less copy-pasted from mmt evaluate:
    evaluator = Evaluator(node, google_key='1234', use_sessions=True)

    corpora = BilingualCorpus.list(args.corpora_path) if args.corpora_path is not None \
        else BilingualCorpus.list(os.path.join(node.engine.data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    lines = 0
    for corpus in corpora:
        lines += corpus.count_lines()
    # end copy-paste

    print('sample bleu')

    for sample in samples:
        node.engine.set_config_option('suffixarrays', 'sample', sample)
        injector.read_config(node.engine.config)  # to get engine.set() to affect MosesFeatures -> moses.ini
        injector.inject(node.engine)
        node.engine.write_configs()
        node.restart()

        scores = evaluator.evaluate(corpora=corpora, heval_output=None, debug=False)

        engine_scores = [r for r in scores if r.id == 'MMT'][0]

        if engine_scores.error:
            raise RuntimeError(engine_scores.error)

        bleu = engine_scores.bleu
        print(sample, '%.2f' % (bleu * 100))
def tune(self, corpora=None, debug=False, context_enabled=True, random_seeds=False, max_iterations=25):
    if corpora is None:
        corpora = BilingualCorpus.list(os.path.join(self.engine.data_path, TrainingPreprocessor.DEV_FOLDER_NAME))

    if len(corpora) == 0:
        raise IllegalArgumentException('empty corpora')

    if not self.is_running():
        raise IllegalStateException('No MMT Server running, start the engine first')

    tokenizer = Tokenizer()

    target_lang = self.engine.target_lang
    source_lang = self.engine.source_lang

    source_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [source_lang])
                      for corpus in corpora]
    reference_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [target_lang])
                         for corpus in corpora]

    cmdlogger = _tuning_logger(4)
    cmdlogger.start(self, corpora)

    working_dir = self.engine.get_tempdir('tuning')
    mert_wd = os.path.join(working_dir, 'mert')

    try:
        # Tokenization
        tokenized_output = os.path.join(working_dir, 'reference_corpora')
        fileutils.makedirs(tokenized_output, exist_ok=True)

        with cmdlogger.step('Corpora tokenization') as _:
            reference_corpora = tokenizer.process_corpora(reference_corpora, tokenized_output)

        # Create merged corpus
        with cmdlogger.step('Merging corpus') as _:
            # source
            source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)

            with open(source_merged_corpus, 'wb') as out:
                for corpus in source_corpora:
                    out.write(corpus.get_file(source_lang) + '\n')

            # target
            target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
            fileutils.merge([corpus.get_file(target_lang) for corpus in reference_corpora], target_merged_corpus)

        # Run MERT algorithm
        with cmdlogger.step('Tuning') as _:
            # Start MERT
            decoder_flags = ['--port', str(self.api.port)]

            if not context_enabled:
                decoder_flags.append('--skip-context-analysis')
                decoder_flags.append('1')

            fileutils.makedirs(mert_wd, exist_ok=True)

            with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                           self._mert_i_script, runtime_moses_ini.name, '--threads',
                           str(multiprocessing.cpu_count()), '--mertdir', cli.BIN_DIR,
                           '--mertargs', '\'--binary --sctype BLEU\'', '--working-dir', mert_wd,
                           '--nbest', '100', '--decoder-flags', '"' + ' '.join(decoder_flags) + '"',
                           '--nonorm', '--closest', '--no-filter-phrase-table']

                if not random_seeds:
                    command.append('--predictable-seeds')
                if max_iterations > 0:
                    command.append('--maximum-iterations={num}'.format(num=max_iterations))

                with open(self.engine.get_logfile('mert'), 'wb') as log:
                    shell.execute(' '.join(command), stdout=log, stderr=log)

        # Read optimized configuration
        with cmdlogger.step('Applying changes') as _:
            bleu_score = 0
            weights = {}
            found_weights = False

            with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                for line in moses_ini:
                    line = line.strip()

                    if len(line) == 0:
                        continue
                    elif found_weights:
                        tokens = line.split()
                        weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                    elif line.startswith('# BLEU'):
                        bleu_score = float(line.split()[2])
                    elif line == '[weight]':
                        found_weights = True

            _ = self.api.update_features(weights)

        cmdlogger.completed(bleu_score)
    finally:
        if not debug:
            self.engine.clear_tempdir()
def build(self, roots, debug=False, steps=None, split_trainingset=True):
    self._temp_dir = self._engine.get_tempdir('training', ensure=True)

    source_lang = self._engine.source_lang
    target_lang = self._engine.target_lang

    bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(source_lang, target_lang, roots=roots)

    if len(bilingual_corpora) == 0:
        raise IllegalArgumentException(
            'your project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper()))

    if steps is None:
        steps = self._engine.training_steps
    else:
        unknown_steps = [step for step in steps if step not in self._engine.training_steps]
        if len(unknown_steps) > 0:
            raise IllegalArgumentException('Unknown training steps: ' + str(unknown_steps))

    shutil.rmtree(self._engine.path, ignore_errors=True)
    os.makedirs(self._engine.path)

    # Check disk space constraints
    free_space_on_disk = fileutils.df(self._engine.path)[2]
    corpus_size_on_disk = 0
    for root in roots:
        corpus_size_on_disk += fileutils.du(root)
    free_memory = fileutils.free()

    recommended_mem = self.__GB * corpus_size_on_disk / (350 * self.__MB)  # 1G RAM every 350M on disk
    recommended_disk = 10 * corpus_size_on_disk

    if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
        if free_memory < recommended_mem:
            print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                  (recommended_mem / self.__GB, free_memory / self.__GB)
        if free_space_on_disk < recommended_disk:
            print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                  (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
        print

    logger = _builder_logger(len(steps) + 1, self._engine.get_logfile('training'))

    try:
        logger.start(self._engine, bilingual_corpora, monolingual_corpora)

        unprocessed_bicorpora = bilingual_corpora
        unprocessed_monocorpora = monolingual_corpora

        # TM draft-translations cleanup
        if 'tm_cleanup' in steps:
            with logger.step('TMs clean-up') as _:
                unprocessed_bicorpora = self._engine.cleaner.clean(
                    unprocessed_bicorpora, self._get_tempdir('clean_tms'), log=logger.stream)

        cleaned_bicorpora = unprocessed_bicorpora
        processed_bicorpora = unprocessed_bicorpora
        processed_monocorpora = unprocessed_monocorpora

        # Preprocessing
        if 'preprocess' in steps:
            with logger.step('Corpora preprocessing') as _:
                unprocessed_bicorpora, unprocessed_monocorpora = self._engine.db.generate(
                    unprocessed_bicorpora, unprocessed_monocorpora, self._get_tempdir('training_corpora'),
                    log=logger.stream)

                processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                    unprocessed_bicorpora + unprocessed_monocorpora, self._get_tempdir('preprocessed'),
                    (self._engine.data_path if split_trainingset else None), log=logger.stream)

                cleaned_bicorpora = self._engine.training_preprocessor.clean(
                    processed_bicorpora, self._get_tempdir('clean_corpora'))

        # Training Context Analyzer
        if 'context_analyzer' in steps:
            with logger.step('Context Analyzer training') as _:
                self._engine.analyzer.create_index(unprocessed_bicorpora, log=logger.stream)

        # Aligner
        if 'aligner' in steps:
            with logger.step('Aligner training') as _:
                working_dir = self._get_tempdir('aligner')
                self._engine.aligner.build(cleaned_bicorpora, working_dir, log=logger.stream)

        # Training Translation Model
        if 'tm' in steps:
            with logger.step('Translation Model training') as _:
                working_dir = self._get_tempdir('tm')
                self._engine.pt.train(cleaned_bicorpora, self._engine.aligner, working_dir, log=logger.stream)

        # Training Adaptive Language Model
        if 'lm' in steps:
            with logger.step('Language Model training') as _:
                working_dir = self._get_tempdir('lm')
                self._engine.lm.train(processed_bicorpora + processed_monocorpora, target_lang, working_dir,
                                      log=logger.stream)

        # Writing config file
        with logger.step('Writing config files') as _:
            self._engine.write_configs()

        logger.completed()
    except:
        logger.error()
        raise
    finally:
        logger.close()

        if not debug:
            self._engine.clear_tempdir('training')