def _preprocess(self, args, skip=False, log=None):
    """Fill ``args`` with the pre-processed bilingual and monolingual corpora.

    When ``skip`` is true, the corpora are listed back from the
    'preprocessed_corpora' temp folder through ``BilingualCorpus.splitlist``.
    Otherwise the engine's training preprocessor is run over the
    concatenation of ``args.bilingual_corpora`` and
    ``args.monolingual_corpora``.

    :param args: object carrying ``bilingual_corpora`` and
        ``monolingual_corpora``; receives ``processed_bilingual_corpora``
        and ``processed_monolingual_corpora``
    :param skip: reuse what is already in the temp folder instead of
        running the preprocessor
    :param log: forwarded untouched to the preprocessor as ``log``
    :raises CorpusNotFoundInFolderException: when not skipping and both
        input corpus lists are empty
    """
    output_dir = self._get_tempdir('preprocessed_corpora')
    engine = self._engine

    if skip:
        bilingual, monolingual = BilingualCorpus.splitlist(
            engine.source_lang, engine.target_lang, roots=output_dir)
    else:
        all_corpora = args.bilingual_corpora + args.monolingual_corpora
        if not all_corpora:
            lang_pair = (engine.source_lang, engine.target_lang)
            raise CorpusNotFoundInFolderException(
                "Could not find any valid %s -> %s segments in your input." % lang_pair)

        # The data path is only handed over when the training set must be split.
        split_path = engine.data_path if self._split_trainingset else None
        bilingual, monolingual = engine.training_preprocessor.process(
            all_corpora, output_dir,
            data_path=split_path,
            vb_path=engine.vocabulary_path,
            log=log)

    args.processed_bilingual_corpora = bilingual
    args.processed_monolingual_corpora = monolingual
def _preprocess(self, args, skip=False, log=None):
    """Fill ``args`` with the pre-processed training and validation corpora.

    Three sub-folders of the 'preprocessed_corpora' temp folder are used:
    'train' and 'validation' receive the preprocessor output, while
    'extracted_validation' is passed to the preprocessor as
    ``dev_data_path`` when ``self._split_train`` is set.

    :param args: object carrying ``corpora``; receives
        ``processed_train_corpora`` and ``processed_valid_corpora``
    :param skip: only list what is already in the 'train' and
        'validation' folders instead of running the preprocessor
    :param log: forwarded untouched to the preprocessor as ``log``
    :raises CorpusNotFoundInFolderException: when not skipping and either
        ``args.corpora`` or the listed validation corpora are empty
    """
    root = self._get_tempdir('preprocessed_corpora')
    train_dir, valid_dir, extracted_valid_dir = [
        os.path.join(root, name)
        for name in ('train', 'validation', 'extracted_validation')]

    if skip:
        args.processed_train_corpora = BilingualCorpus.list(
            self.source_lang, self.target_lang, train_dir)
        args.processed_valid_corpora = BilingualCorpus.list(
            self.source_lang, self.target_lang, valid_dir)
        return

    if not args.corpora:
        raise CorpusNotFoundInFolderException(
            'Could not find any valid %s > %s segments in your input.'
            % (self.source_lang, self.target_lang))

    # Test/dev extraction paths are only supplied when splitting is enabled.
    if self._split_train:
        test_data_path = self._engine.test_data_path
        dev_data_path = extracted_valid_dir
    else:
        test_data_path = dev_data_path = None

    preprocessor = self._training_preprocessor
    args.processed_train_corpora = preprocessor.process(
        args.corpora, train_dir,
        log=log, test_data_path=test_data_path, dev_data_path=dev_data_path)

    # Without a dev path, validation data is read from self._validation_path.
    valid_corpora = BilingualCorpus.list(
        self.source_lang, self.target_lang,
        dev_data_path or self._validation_path)
    if not valid_corpora:
        raise CorpusNotFoundInFolderException(
            'Could not find any valid %s > %s segments for validation.'
            % (self.source_lang, self.target_lang))

    args.processed_valid_corpora = preprocessor.process(
        valid_corpora, valid_dir, log=log)
def _build(self, resume, listener):
    """Run every step of the training schedule, reporting to ``listener``.

    The schedule is loaded from (``resume`` true) or stored to a
    'checkpoint.json' file in the 'training' temp folder, and stored again
    after each completed step. Each step is called with ``skip`` set to
    whether the schedule already marks it as completed.

    :param resume: continue a previous run; the engine folder and log file
        are kept (log opened in append mode) instead of being recreated
    :param listener: optional observer; when truthy it receives
        ``on_training_begin``, ``on_hw_constraint_violated``,
        ``on_step_begin``/``on_step_end`` (non-hidden steps only) and
        ``on_training_end``
    :raises CorpusNotFoundInFolderException: when no bilingual corpus is
        found in ``self._roots``
    """
    engine = self._engine
    self._temp_dir = engine.get_tempdir('training', ensure=(not resume))

    checkpoint_file = os.path.join(self._temp_dir, 'checkpoint.json')
    # A resumed run reads the checkpoint back; a fresh run writes the initial one.
    sync_checkpoint = self._schedule.load if resume else self._schedule.store
    sync_checkpoint(checkpoint_file)

    src = engine.source_lang
    tgt = engine.target_lang

    # Split what is found under the roots into bilingual and monolingual lists.
    bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(
        src, tgt, roots=self._roots)

    # Training is impossible without at least one bilingual corpus.
    bilingual_count = len(bilingual_corpora)
    if bilingual_count == 0:
        raise CorpusNotFoundInFolderException(
            'Could not find %s-%s corpora in path %s'
            % (src.upper(), tgt.upper(), ', '.join(self._roots)))

    # Start from an empty engine folder unless resuming into an existing one.
    if not (os.path.isdir(engine.path) and resume):
        shutil.rmtree(engine.path, ignore_errors=True)
        os.makedirs(engine.path)

    # Route the logging configuration to the engine's training log file.
    log_path = engine.get_logfile('training', append=resume)
    open_mode = 'ab' if resume else 'wb'
    log_handle = open(log_path, open_mode)
    logging.basicConfig(
        format='%(asctime)-15s [%(levelname)s] - %(message)s',
        level=logging.DEBUG,
        stream=log_handle)
    logger = logging.getLogger('EngineBuilder')

    try:
        logger.info(
            'Training started: engine=%s, bilingual=%d, monolingual=%d, langpair=%s-%s'
            % (engine.name, bilingual_count, len(monolingual_corpora), src, tgt))

        if listener:
            listener.on_training_begin(
                self._schedule.visible_steps(), engine,
                bilingual_corpora, monolingual_corpora)

        # A hardware constraint violation is reported, not treated as fatal.
        try:
            self._check_constraints()
        except EngineBuilder.HWConstraintViolated as e:
            if listener:
                listener.on_hw_constraint_violated(e.cause)

        args = EngineBuilder.__Args()
        args.bilingual_corpora = bilingual_corpora
        args.monolingual_corpora = monolingual_corpora

        # ~~~~~~~~~~~~~~~~~~~~~ RUN ALL STEPS ~~~~~~~~~~~~~~~~~~~~~
        # Every step is invoked; steps already completed get skip=True.
        for step_index, step in enumerate(self._schedule, 1):
            already_done = self._schedule.is_completed(step.id)

            if listener and not step.is_hidden():
                listener.on_step_begin(step.id, step.name)

            logger.info(
                'Training step "%s" (%d/%d) started'
                % (step.id, step_index, len(self._schedule)))

            began_at = time.time()
            step(self, args, skip=already_done, log=log_handle,
                 delete_on_exit=self._delete_on_exit)
            elapsed_seconds = int(time.time() - began_at)

            if listener and not step.is_hidden():
                listener.on_step_end(step.id, step.name)

            logger.info(
                'Training step "%s" completed in %d s'
                % (step.id, elapsed_seconds))

            # Persist progress after each step.
            self._schedule.step_completed(step.id)
            self._schedule.store(checkpoint_file)

        if listener:
            listener.on_training_end(engine)

        if self._delete_on_exit:
            engine.clear_tempdir('training')
    except:
        # Log whatever went wrong to the training log, then propagate it.
        logger.exception('Unexpected exception')
        raise
    finally:
        log_handle.close()
def _build(self, resume): self._temp_dir = self._engine.get_tempdir('training', ensure=(not resume)) checkpoint_path = os.path.join(self._temp_dir, 'checkpoint.json') if resume: self._schedule.load(checkpoint_path) else: self._schedule.store(checkpoint_path) corpora = BilingualCorpus.list(self.source_lang, self.target_lang, self.roots) if len(corpora) == 0: raise CorpusNotFoundInFolderException( 'Could not find %s > %s corpora in path %s' % (self.source_lang, self.target_lang, ', '.join(self.roots))) # if no old engines (i.e. engine folders) can be found, create a new one from scratch # if we are not trying to resume an old one, create from scratch anyway if not os.path.isdir(self._engine.path) or not resume: shutil.rmtree(self._engine.path, ignore_errors=True) os.makedirs(self._engine.path) # Create a new logger for the building activities, log_file = self._engine.get_logfile('training', append=resume) log_stream = open(log_file, 'ab' if resume else 'wb') logging.basicConfig( format='%(asctime)-15s [%(levelname)s] - %(message)s', level=logging.DEBUG, stream=log_stream) logger = logging.getLogger('EngineBuilder') # Start the engine building (training) phases steps_count = len(self._schedule.visible_steps()) log_line_len = 70 try: logger.log( logging.INFO, 'Training started: engine=%s, corpora=%d, lang_pair=%s-%s' % (self._engine.name, len(corpora), self.source_lang, self.target_lang)) print '\n=========== TRAINING STARTED ===========\n' print 'ENGINE: %s' % self._engine.name print 'CORPORA: %d corpora' % len(corpora) print 'LANGS: %s > %s' % (self.source_lang, self.target_lang) print # Check if all requirements are fulfilled before actual engine training try: self._check_constraints() except EngineBuilder.HWConstraintViolated as e: print '\033[91mWARNING\033[0m: %s\n' % e.cause args = EngineBuilder.__Args() args.corpora = corpora # ~~~~~~~~~~~~~~~~~~~~~ RUN ALL STEPS ~~~~~~~~~~~~~~~~~~~~~ # Note: if resume is true, a step is only run if it was not in the previous 
attempt step_index = 1 for method in self._schedule: if not method.is_hidden(): print('INFO: (%d of %d) %s... ' % (step_index, steps_count, method.name)).ljust(log_line_len), skip = self._schedule.is_completed(method.id) self._step_start_time = time.time() logger.log( logging.INFO, 'Training step "%s" (%d/%d) started' % (method.id, step_index, len(self._schedule))) start_time = time.time() method(self, args, skip=skip, log=log_stream, delete_on_exit=self._delete_on_exit) elapsed_time_str = self._pretty_print_time(time.time() - start_time) if not method.is_hidden(): step_index += 1 print 'DONE (in %s)' % elapsed_time_str logger.log( logging.INFO, 'Training step "%s" completed in %s' % (method.id, elapsed_time_str)) self._schedule.step_completed(method.id) self._schedule.store(checkpoint_path) print '\n=========== TRAINING SUCCESS ===========\n' print 'You can now start, stop or check the status of the server with command:' print '\t./mmt start|stop|status ' + ('' if self._engine.name == 'default' else '-e %s' % self._engine.name) print if self._delete_on_exit: self._engine.clear_tempdir('training') except Exception: logger.exception('Unexpected exception') raise finally: log_stream.close()