def process(self, source, target, input_paths, output_path, data_path=None):
    """Run the Java training preprocessor over the given corpora roots.

    :param source: source language code (e.g. 'en')
    :param target: target language code
    :param input_paths: list of root directories containing the input corpora
    :param output_path: directory where the processed corpora are written
    :param data_path: if not None, also request dev/test splits, written under
                      this path in the preprocessor's DEV/TEST folders
    :return: the (bilingual, monolingual) corpora lists found in output_path
    """
    args = ['-s', source, '-t', target, '--output', output_path, '--input']
    # Idiomatic bulk-append instead of a manual append loop.
    args.extend(input_paths)

    if data_path is not None:
        args.append('--dev')
        args.append(os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
        args.append('--test')
        args.append(os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    command = mmt_javamain(self._java_mainclass, args)
    # All std streams are discarded; failures surface via shell.execute itself.
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return ParallelCorpus.splitlist(source, target, roots=output_path)
def clean(self, source, target, input_paths, output_path):
    """Run the Java corpus cleaner over the given corpora roots.

    :param source: source language code
    :param target: target language code
    :param input_paths: list of root directories containing the input corpora
    :param output_path: directory where the cleaned corpora are written
    :return: the list of bilingual corpora found in output_path
             (element [0] of ParallelCorpus.splitlist's result)
    """
    args = ['-s', source, '-t', target, '--output', output_path, '--input']
    # Idiomatic bulk-append instead of a manual append loop.
    args.extend(input_paths)

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    # Only the bilingual corpora are returned by the cleaner.
    return ParallelCorpus.splitlist(source, target, roots=output_path)[0]
def build(self, roots, debug=False, steps=None, split_trainingset=True): self._temp_dir = self._engine.get_tempdir('training', ensure=True) source_lang = self._engine.source_lang target_lang = self._engine.target_lang bilingual_corpora, monolingual_corpora = ParallelCorpus.splitlist(source_lang, target_lang, roots=roots) if len(bilingual_corpora) == 0: raise IllegalArgumentException( 'you project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper())) if steps is None: steps = self._engine.training_steps else: unknown_steps = [step for step in steps if step not in self._engine.training_steps] if len(unknown_steps) > 0: raise IllegalArgumentException('Unknown training steps: ' + str(unknown_steps)) cmdlogger = _builder_logger(len(steps) + 1) cmdlogger.start(self._engine, bilingual_corpora, monolingual_corpora) shutil.rmtree(self._engine.path, ignore_errors=True) os.makedirs(self._engine.path) # Check disk space constraints free_space_on_disk = fileutils.df(self._engine.path)[2] corpus_size_on_disk = 0 for root in roots: corpus_size_on_disk += fileutils.du(root) free_memory = fileutils.free() recommended_mem = self.__GB * corpus_size_on_disk / (350 * self.__MB) # 1G RAM every 350M on disk recommended_disk = 10 * corpus_size_on_disk if free_memory < recommended_mem or free_space_on_disk < recommended_disk: if free_memory < recommended_mem: print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \ (recommended_mem / self.__GB, free_memory / self.__GB) if free_space_on_disk < recommended_disk: print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \ (recommended_disk / self.__GB, free_space_on_disk / self.__GB) print try: corpora_roots = roots unprocessed_bicorpora = bilingual_corpora unprocessed_mocorpora = monolingual_corpora # TM cleanup if 'tm_cleanup' in steps: with cmdlogger.step('TMs clean-up') as _: cleaned_output = self._get_tempdir('clean_tms') self._engine.cleaner.clean(source_lang, target_lang, 
roots, cleaned_output) for corpus in monolingual_corpora: cfile = corpus.get_file(target_lang) link = os.path.join(cleaned_output, os.path.basename(cfile)) os.symlink(cfile, link) corpora_roots = [cleaned_output] unprocessed_bicorpora, unprocessed_mocorpora = ParallelCorpus.splitlist(source_lang, target_lang, roots=corpora_roots) # Preprocessing processed_bicorpora = unprocessed_bicorpora processed_mocorpora = unprocessed_mocorpora if 'preprocess' in steps: with cmdlogger.step('Corpora preprocessing') as _: preprocessor_output = self._get_tempdir('preprocessed') processed_bicorpora, processed_mocorpora = self._engine.training_preprocessor.process( source_lang, target_lang, corpora_roots, preprocessor_output, (self._engine.data_path if split_trainingset else None) ) # Training Context Analyzer if 'context_analyzer' in steps: with cmdlogger.step('Context Analyzer training') as _: log_file = self._engine.get_logfile('training.context') self._engine.analyzer.create_index(unprocessed_bicorpora, source_lang, log_file=log_file) # Training Adaptive Language Model (on the target side of all bilingual corpora) if 'lm' in steps: with cmdlogger.step('Language Model training') as _: working_dir = self._get_tempdir('lm') log_file = self._engine.get_logfile('training.lm') self._engine.lm.train(processed_bicorpora + processed_mocorpora, target_lang, working_dir, log_file) # Training Translation Model if 'tm' in steps: with cmdlogger.step('Translation Model training') as _: working_dir = self._get_tempdir('tm') log_file = self._engine.get_logfile('training.tm') self._engine.pt.train(processed_bicorpora, self._engine.aligner, working_dir, log_file) # Writing config file with cmdlogger.step('Writing config files') as _: self._engine.write_configs() cmdlogger.completed() finally: if not debug: self._engine.clear_tempdir('training')
def build(self, roots, debug=False, steps=None, split_trainingset=True): self._temp_dir = self._engine.get_tempdir('training', ensure=True) source_lang = self._engine.source_lang target_lang = self._engine.target_lang bilingual_corpora, monolingual_corpora = ParallelCorpus.splitlist( source_lang, target_lang, roots=roots) if len(bilingual_corpora) == 0: raise IllegalArgumentException( 'you project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper())) if steps is None: steps = self._engine.training_steps else: unknown_steps = [ step for step in steps if step not in self._engine.training_steps ] if len(unknown_steps) > 0: raise IllegalArgumentException('Unknown training steps: ' + str(unknown_steps)) cmdlogger = _builder_logger(len(steps) + 1) cmdlogger.start(self._engine, bilingual_corpora, monolingual_corpora) shutil.rmtree(self._engine.path, ignore_errors=True) os.makedirs(self._engine.path) # Check disk space constraints free_space_on_disk = fileutils.df(self._engine.path)[2] corpus_size_on_disk = 0 for root in roots: corpus_size_on_disk += fileutils.du(root) free_memory = fileutils.free() recommended_mem = self.__GB * corpus_size_on_disk / ( 350 * self.__MB) # 1G RAM every 350M on disk recommended_disk = 10 * corpus_size_on_disk if free_memory < recommended_mem or free_space_on_disk < recommended_disk: if free_memory < recommended_mem: print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \ (recommended_mem / self.__GB, free_memory / self.__GB) if free_space_on_disk < recommended_disk: print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \ (recommended_disk / self.__GB, free_space_on_disk / self.__GB) print try: corpora_roots = roots unprocessed_bicorpora = bilingual_corpora unprocessed_mocorpora = monolingual_corpora # TM cleanup if 'tm_cleanup' in steps: with cmdlogger.step('TMs clean-up') as _: cleaned_output = self._get_tempdir('clean_tms') self._engine.cleaner.clean(source_lang, 
target_lang, roots, cleaned_output) for corpus in monolingual_corpora: cfile = corpus.get_file(target_lang) link = os.path.join(cleaned_output, os.path.basename(cfile)) os.symlink(cfile, link) corpora_roots = [cleaned_output] unprocessed_bicorpora, unprocessed_mocorpora = ParallelCorpus.splitlist( source_lang, target_lang, roots=corpora_roots) # Preprocessing processed_bicorpora = unprocessed_bicorpora processed_mocorpora = unprocessed_mocorpora if 'preprocess' in steps: with cmdlogger.step('Corpora preprocessing') as _: preprocessor_output = self._get_tempdir('preprocessed') processed_bicorpora, processed_mocorpora = self._engine.training_preprocessor.process( source_lang, target_lang, corpora_roots, preprocessor_output, (self._engine.data_path if split_trainingset else None)) # Training Context Analyzer if 'context_analyzer' in steps: with cmdlogger.step('Context Analyzer training') as _: log_file = self._engine.get_logfile('training.context') self._engine.analyzer.create_index(unprocessed_bicorpora, source_lang, log_file=log_file) # Training Adaptive Language Model (on the target side of all bilingual corpora) if 'lm' in steps: with cmdlogger.step('Language Model training') as _: working_dir = self._get_tempdir('lm') log_file = self._engine.get_logfile('training.lm') self._engine.lm.train( processed_bicorpora + processed_mocorpora, target_lang, working_dir, log_file) # Training Translation Model if 'tm' in steps: with cmdlogger.step('Translation Model training') as _: working_dir = self._get_tempdir('tm') log_file = self._engine.get_logfile('training.tm') self._engine.pt.train(processed_bicorpora, self._engine.aligner, working_dir, log_file) # Writing config file with cmdlogger.step('Writing config files') as _: self._engine.write_configs() cmdlogger.completed() finally: if not debug: self._engine.clear_tempdir('training')