def _step_init(self, bilingual_corpora, monolingual_corpora, skip=False, logger=None, delete_on_exit=False):
    training_folder = self._get_tempdir('training_corpora')

    # if skip is true, then we are in resume mode, so return the already existing results
    if skip:
        bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(
            self._engine.source_lang, self._engine.target_lang, roots=training_folder)
    # else perform the baseline domains extraction and domain mapping, and return its result
    else:
        domains = self._engine.db.insert(bilingual_corpora)

        bilingual_corpora = [domain.corpus.symlink(training_folder, name=str(domain.id)) for domain in domains]
        monolingual_corpora = [corpus.symlink(training_folder) for corpus in monolingual_corpora]

    return bilingual_corpora, monolingual_corpora
def _step_preprocess(self, bilingual_corpora, monolingual_corpora, _, skip=False, logger=None, delete_on_exit=False):
    preprocessed_folder = self._get_tempdir('preprocessed')
    cleaned_folder = self._get_tempdir('clean_corpora')

    # if skip is true, then we are in resume mode, so return the already existing results
    if skip:
        processed_bicorpora, processed_monocorpora = BilingualCorpus.splitlist(
            self._engine.source_lang, self._engine.target_lang, roots=preprocessed_folder)
        cleaned_bicorpora = BilingualCorpus.list(cleaned_folder)
    else:
        processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
            bilingual_corpora + monolingual_corpora, preprocessed_folder,
            (self._engine.data_path if self._split_trainingset else None), log=logger.stream)
        cleaned_bicorpora = self._engine.training_preprocessor.clean(processed_bicorpora, cleaned_folder)

    return processed_bicorpora, processed_monocorpora, cleaned_bicorpora
def process(self, corpora, output_path, data_path=None):
    args = ['-s', self._source_lang, '-t', self._target_lang, '-v', self._vocabulary_path,
            '--output', output_path, '--input']

    input_paths = set([corpus.get_folder() for corpus in corpora])

    for root in input_paths:
        args.append(root)

    if data_path is not None:
        args.append('--dev')
        args.append(os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
        args.append('--test')
        args.append(os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=output_path)
def clean(self, source, target, input_paths, output_path):
    args = ['-s', source, '-t', target, '--output', output_path, '--input']

    for root in input_paths:
        args.append(root)

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return BilingualCorpus.splitlist(source, target, roots=output_path)[0]
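# Note: `process` and `clean` above share one pattern: build a flat CLI argument
# list, let mmt_javamain compose the Java command line, and run it via
# shell.execute with every stream silenced. A minimal, self-contained sketch of
# that pattern using only the standard library follows; the jar name, main
# class, and example flags are hypothetical placeholders, not MMT's actual values.
import os
import subprocess


def run_java_tool(main_class, args, classpath='mmt.jar'):  # jar name is a placeholder
    """Compose a Java command line and run it, discarding all output."""
    command = ['java', '-cp', classpath, main_class] + args
    with open(os.devnull, 'wb') as devnull:
        subprocess.check_call(command, stdin=devnull, stdout=devnull, stderr=devnull)


# Illustrative usage, mirroring the wrappers above (placeholder main class):
# run_java_tool('eu.modernmt.example.Main',
#               ['-s', 'en', '-t', 'it', '--output', '/tmp/out', '--input', '/tmp/corpora'])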
def _make_training_folder(self, bilingual_corpora, monolingual_corpora, domains, folder):
    for corpus in bilingual_corpora:
        dest_corpus = BilingualCorpus.make_parallel(domains[corpus.name], folder, corpus.langs)
        for lang in corpus.langs:
            os.symlink(corpus.get_file(lang), dest_corpus.get_file(lang))

    for corpus in monolingual_corpora:
        dest_corpus = BilingualCorpus.make_parallel(corpus.name, folder, corpus.langs)
        for lang in corpus.langs:
            os.symlink(corpus.get_file(lang), dest_corpus.get_file(lang))

    return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=folder)
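# _make_training_folder materializes the domain mapping on disk without copying
# data: each language file is symlinked into the training folder under its
# (possibly renamed) destination corpus. A minimal sketch of that
# symlink-and-rename idea with plain os calls; the file names and the `domains`
# mapping below are illustrative assumptions, not MMT's actual data.
import os


def link_corpus_files(files_by_lang, domains, name, folder):
    """Symlink each language file of a corpus into folder, renaming the
    corpus to its mapped domain id when one exists."""
    dest_name = domains.get(name, name)
    for lang, path in files_by_lang.items():
        dest = os.path.join(folder, '%s.%s' % (dest_name, lang))
        os.symlink(os.path.abspath(path), dest)


# Illustrative usage: corpus "europarl" is mapped to domain id "1"
# link_corpus_files({'en': 'europarl.en', 'it': 'europarl.it'}, {'europarl': '1'}, 'europarl', '/tmp/train')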
def process(self, source, target, input_paths, output_path, data_path=None):
    args = ['-s', source, '-t', target, '--output', output_path, '--input']

    for root in input_paths:
        args.append(root)

    if data_path is not None:
        args.append('--dev')
        args.append(os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
        args.append('--test')
        args.append(os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    command = mmt_javamain(self._java_mainclass, args)
    shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

    return BilingualCorpus.splitlist(source, target, roots=output_path)
def _build(self, resume=False):
    self._temp_dir = self._engine.get_tempdir('training', ensure=(not resume))

    self._checkpoint_path = os.path.join(self._temp_dir, 'checkpoint.json')
    self._passed_steps = []

    # initialize the checkpoint manager
    if resume:
        self.load_checkpoint()
    else:
        self.save_checkpoint()

    source_lang = self._engine.source_lang
    target_lang = self._engine.target_lang

    # separate bilingual and monolingual corpora into separate lists, reading them from roots
    bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(source_lang, target_lang, roots=self._roots)

    # if no bilingual corpora are found, it is not possible to train the translation system
    if len(bilingual_corpora) == 0:
        raise IllegalArgumentException(
            'your project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper()))

    # if no old engine folder can be found, create a new one from scratch;
    # if we are not trying to resume an old one, create from scratch anyway
    if not os.path.isdir(self._engine.path) or not resume:
        shutil.rmtree(self._engine.path, ignore_errors=True)
        os.makedirs(self._engine.path)

    # Check if all requirements are fulfilled before launching engine training
    self._check_constraints()

    # Create a new logger for the building activities,
    # passing it the amount of steps to perform (plus a non user-decidable step)
    # and the name of the log file to create
    logger = _builder_logger(len(self._scheduled_steps) + 1, self._engine.get_logfile('training'))

    delete_on_exit = not self._debug

    # Start the engine building (training) phases
    try:
        # tell the logger that the engine training has started
        logger.start(self._engine, bilingual_corpora, monolingual_corpora)

        # ~~~~~~~~~~~~~~~~~~~~~ RUN ALL STEPS ~~~~~~~~~~~~~~~~~~~~~
        # Note: if resume is true, a step is only run if it was not completed in the previous attempt

        # run the tm_cleanup step on bilingual_corpora, if required; obtain cleaned bicorpora
        cleaned_bicorpora = self._run_step('tm_cleanup', self._step_tm_cleanup, logger=logger,
                                           values=[bilingual_corpora], delete_on_exit=delete_on_exit)

        # run the __db_map step (always: the user can't skip it)
        # on the cleaned bicorpora and the original monocorpora;
        # obtain base bicorpora and base monocorpora
        base_bicorpora, base_monocorpora = self._run_step('__db_map', self._step_init, forced=True,
                                                          values=[cleaned_bicorpora, monolingual_corpora],
                                                          delete_on_exit=delete_on_exit)

        # run the preprocess step, if required;
        # obtain processed bi- and monocorpora and cleaned bicorpora
        processed_bicorpora, processed_monocorpora, cleaned_bicorpora = \
            self._run_step('preprocess', self._step_preprocess, logger=logger,
                           values=[base_bicorpora, base_monocorpora, base_bicorpora],
                           delete_on_exit=delete_on_exit)

        # run the context_analyzer step on base_bicorpora, if required
        _ = self._run_step('context_analyzer', self._step_context_analyzer, logger=logger,
                           values=[base_bicorpora], delete_on_exit=delete_on_exit)

        # run the aligner step on cleaned_bicorpora, if required
        _ = self._run_step('aligner', self._step_aligner, logger=logger,
                           values=[cleaned_bicorpora], delete_on_exit=delete_on_exit)

        # run the tm step on cleaned_bicorpora, if required
        _ = self._run_step('tm', self._step_tm, logger=logger,
                           values=[cleaned_bicorpora], delete_on_exit=delete_on_exit)

        # run the lm step on the joint list of processed_bicorpora and processed_monocorpora
        _ = self._run_step('lm', self._step_lm, logger=logger,
                           values=[processed_bicorpora + processed_monocorpora],
                           delete_on_exit=delete_on_exit)

        # Writing config file
        with logger.step('Writing config files') as _:
            self._engine.write_configs()

        # tell the logger that the engine training has completed
        logger.completed()

        # if this is not debug mode, the training temporary folder must be deleted
        if not self._debug:
            self._engine.clear_tempdir('training')
    except:
        logger.error()
        raise
    finally:
        logger.close()
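# _build differs from the older build variants below mainly in its resume
# support: completed steps are recorded in checkpoint.json, and on resume
# _run_step re-issues a step only if it did not complete in a previous attempt
# (forced steps like __db_map always run). A minimal sketch of such a
# checkpoint manager, assuming a simple JSON file with a "passed_steps" list;
# the file layout and method names are assumptions, not MMT's actual format.
import json


class CheckpointManager(object):
    def __init__(self, path):
        self._path = path
        self.passed_steps = []

    def load(self):
        with open(self._path) as stream:
            self.passed_steps = json.load(stream)['passed_steps']

    def save(self):
        with open(self._path, 'w') as stream:
            json.dump({'passed_steps': self.passed_steps}, stream)

    def run_step(self, name, step, values, forced=False):
        # skip a step already completed in a previous attempt, unless forced;
        # the step itself decides how to reload its prior results when skipped
        skip = (not forced) and name in self.passed_steps
        result = step(*values, skip=skip)
        if name not in self.passed_steps:
            self.passed_steps.append(name)
            self.save()
        return result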
def build(self, roots, debug=False, steps=None, split_trainingset=True):
    self._temp_dir = self._engine.get_tempdir('training', ensure=True)

    source_lang = self._engine.source_lang
    target_lang = self._engine.target_lang

    bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(source_lang, target_lang, roots=roots)

    if len(bilingual_corpora) == 0:
        raise IllegalArgumentException(
            'your project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper()))

    if steps is None:
        steps = self._engine.training_steps
    else:
        unknown_steps = [step for step in steps if step not in self._engine.training_steps]
        if len(unknown_steps) > 0:
            raise IllegalArgumentException('Unknown training steps: ' + str(unknown_steps))

    cmdlogger = _builder_logger(len(steps) + 1)
    cmdlogger.start(self._engine, bilingual_corpora, monolingual_corpora)

    shutil.rmtree(self._engine.path, ignore_errors=True)
    os.makedirs(self._engine.path)

    # Check disk space constraints
    free_space_on_disk = fileutils.df(self._engine.path)[2]
    corpus_size_on_disk = 0
    for root in roots:
        corpus_size_on_disk += fileutils.du(root)
    free_memory = fileutils.free()

    recommended_mem = self.__GB * corpus_size_on_disk / (350 * self.__MB)  # 1G RAM every 350M on disk
    recommended_disk = 10 * corpus_size_on_disk

    if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
        if free_memory < recommended_mem:
            print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                  (recommended_mem / self.__GB, free_memory / self.__GB)
        if free_space_on_disk < recommended_disk:
            print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                  (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
        print

    try:
        unprocessed_bicorpora = bilingual_corpora
        unprocessed_monocorpora = monolingual_corpora

        # TM draft-translations cleanup
        if 'tm_cleanup' in steps:
            with cmdlogger.step('TMs clean-up') as _:
                unprocessed_bicorpora = self._engine.cleaner.clean(
                    unprocessed_bicorpora, self._get_tempdir('clean_tms'))

        cleaned_bicorpora = unprocessed_bicorpora
        processed_bicorpora = unprocessed_bicorpora
        processed_monocorpora = unprocessed_monocorpora

        # Preprocessing
        if 'preprocess' in steps:
            with cmdlogger.step('Corpora preprocessing') as _:
                unprocessed_bicorpora, unprocessed_monocorpora = self._engine.db.generate(
                    unprocessed_bicorpora, unprocessed_monocorpora, self._get_tempdir('training_corpora'))
                processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                    unprocessed_bicorpora + unprocessed_monocorpora, self._get_tempdir('preprocessed'),
                    (self._engine.data_path if split_trainingset else None))
                cleaned_bicorpora = self._engine.training_preprocessor.clean(
                    processed_bicorpora, self._get_tempdir('clean_corpora'))

        # Training Context Analyzer
        if 'context_analyzer' in steps:
            with cmdlogger.step('Context Analyzer training') as _:
                log_file = self._engine.get_logfile('training.context')
                self._engine.analyzer.create_index(unprocessed_bicorpora, source_lang, log_file=log_file)

        # Aligner
        if 'aligner' in steps:
            with cmdlogger.step('Aligner training') as _:
                log_file = self._engine.get_logfile('training.aligner')
                working_dir = self._get_tempdir('aligner')
                self._engine.aligner.build(cleaned_bicorpora, working_dir, log_file)

        # Training Translation Model
        if 'tm' in steps:
            with cmdlogger.step('Translation Model training') as _:
                working_dir = self._get_tempdir('tm')
                log_file = self._engine.get_logfile('training.tm')
                self._engine.pt.train(cleaned_bicorpora, self._engine.aligner, working_dir, log_file)

        # Training Adaptive Language Model
        if 'lm' in steps:
            with cmdlogger.step('Language Model training') as _:
                working_dir = self._get_tempdir('lm')
                log_file = self._engine.get_logfile('training.lm')
                self._engine.lm.train(processed_bicorpora + processed_monocorpora, target_lang,
                                      working_dir, log_file)

        # Writing config file
        with cmdlogger.step('Writing config files') as _:
            self._engine.write_configs()

        cmdlogger.completed()
    finally:
        if not debug:
            self._engine.clear_tempdir('training')
def build(self, roots, debug=False, steps=None, split_trainingset=True):
    self._temp_dir = self._engine.get_tempdir('training', ensure=True)

    source_lang = self._engine.source_lang
    target_lang = self._engine.target_lang

    bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(source_lang, target_lang, roots=roots)

    if len(bilingual_corpora) == 0:
        raise IllegalArgumentException(
            'your project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper()))

    if steps is None:
        steps = self._engine.training_steps
    else:
        unknown_steps = [step for step in steps if step not in self._engine.training_steps]
        if len(unknown_steps) > 0:
            raise IllegalArgumentException('Unknown training steps: ' + str(unknown_steps))

    shutil.rmtree(self._engine.path, ignore_errors=True)
    os.makedirs(self._engine.path)

    # Check disk space constraints
    free_space_on_disk = fileutils.df(self._engine.path)[2]
    corpus_size_on_disk = 0
    for root in roots:
        corpus_size_on_disk += fileutils.du(root)
    free_memory = fileutils.free()

    recommended_mem = self.__GB * corpus_size_on_disk / (350 * self.__MB)  # 1G RAM every 350M on disk
    recommended_disk = 10 * corpus_size_on_disk

    if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
        if free_memory < recommended_mem:
            print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                  (recommended_mem / self.__GB, free_memory / self.__GB)
        if free_space_on_disk < recommended_disk:
            print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                  (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
        print

    logger = _builder_logger(len(steps) + 1, self._engine.get_logfile('training'))

    try:
        logger.start(self._engine, bilingual_corpora, monolingual_corpora)

        unprocessed_bicorpora = bilingual_corpora
        unprocessed_monocorpora = monolingual_corpora

        # TM draft-translations cleanup
        if 'tm_cleanup' in steps:
            with logger.step('TMs clean-up') as _:
                unprocessed_bicorpora = self._engine.cleaner.clean(
                    unprocessed_bicorpora, self._get_tempdir('clean_tms'), log=logger.stream)

        cleaned_bicorpora = unprocessed_bicorpora
        processed_bicorpora = unprocessed_bicorpora
        processed_monocorpora = unprocessed_monocorpora

        # Preprocessing
        if 'preprocess' in steps:
            with logger.step('Corpora preprocessing') as _:
                unprocessed_bicorpora, unprocessed_monocorpora = self._engine.db.generate(
                    unprocessed_bicorpora, unprocessed_monocorpora, self._get_tempdir('training_corpora'),
                    log=logger.stream)
                processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                    unprocessed_bicorpora + unprocessed_monocorpora, self._get_tempdir('preprocessed'),
                    (self._engine.data_path if split_trainingset else None), log=logger.stream)
                cleaned_bicorpora = self._engine.training_preprocessor.clean(
                    processed_bicorpora, self._get_tempdir('clean_corpora'))

        # Training Context Analyzer
        if 'context_analyzer' in steps:
            with logger.step('Context Analyzer training') as _:
                self._engine.analyzer.create_index(unprocessed_bicorpora, log=logger.stream)

        # Aligner
        if 'aligner' in steps:
            with logger.step('Aligner training') as _:
                working_dir = self._get_tempdir('aligner')
                self._engine.aligner.build(cleaned_bicorpora, working_dir, log=logger.stream)

        # Training Translation Model
        if 'tm' in steps:
            with logger.step('Translation Model training') as _:
                working_dir = self._get_tempdir('tm')
                self._engine.pt.train(cleaned_bicorpora, self._engine.aligner, working_dir, log=logger.stream)

        # Training Adaptive Language Model
        if 'lm' in steps:
            with logger.step('Language Model training') as _:
                working_dir = self._get_tempdir('lm')
                self._engine.lm.train(processed_bicorpora + processed_monocorpora, target_lang,
                                      working_dir, log=logger.stream)

        # Writing config file
        with logger.step('Writing config files') as _:
            self._engine.write_configs()

        logger.completed()
    except:
        logger.error()
        raise
    finally:
        logger.close()

        if not debug:
            self._engine.clear_tempdir('training')
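# Both build variants gate training on the same sizing heuristic: roughly 1 GB
# of RAM for every 350 MB of corpus on disk, and ten times the corpus size in
# free storage. A self-contained sketch of that arithmetic using os.statvfs for
# free disk space; the corpus size and free memory are supplied by the caller
# here, whereas MMT computes them with its own fileutils helpers.
import os

MB = 1024 * 1024
GB = 1024 * MB


def check_resources(engine_path, corpus_size_on_disk, free_memory):
    """Warn when available RAM or disk falls below the recommended thresholds."""
    stats = os.statvfs(engine_path)
    free_space_on_disk = stats.f_bavail * stats.f_frsize

    recommended_mem = GB * corpus_size_on_disk / (350 * MB)  # 1G RAM every 350M on disk
    recommended_disk = 10 * corpus_size_on_disk

    if free_memory < recommended_mem:
        print('> WARNING: more than %.fG of RAM recommended, only %.fG available'
              % (recommended_mem / GB, free_memory / GB))
    if free_space_on_disk < recommended_disk:
        print('> WARNING: more than %.fG of storage recommended, only %.fG available'
              % (recommended_disk / GB, free_space_on_disk / GB))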