def onerror(e):
    """Handle a failed email send, retrying once without the attachment.

    The failure is always logged.  When ``check_attachment_error(e)`` is
    truthy a second message is sent through ``es.send_email``; otherwise
    ``e`` is re-raised.
    """
    logger.log("Error sending email")
    if not check_attachment_error(e):
        # not an attachment problem: nothing to retry, propagate it
        raise e
    logger.log("Trying to send without attachment")
    es.send_email("email2")
def iteration_trainstep(self, iteration_info, grad_mod=None):
    """Run one training iteration: accumulate gradients, then step once.

    Subbatches are pulled from ``self.train_iterator`` and processed until
    the iterator reports, through ``take_step()``, that a gradient step is
    due.  The sum of the per-subbatch outputs is recorded on
    ``iteration_info`` and its ``batch_length`` is passed to ``self.step``
    as the denominator.

    Args:
        iteration_info: receives the accumulated output via
            ``set_train_info``.
        grad_mod: forwarded unchanged to ``self.step``.
    """
    batches = self.train_iterator
    # starts as a plain int, so the outputs must support ``0 + output``
    accumulated = 0
    step_due = False
    while not step_due:
        # forward pass on the next subbatch with gradients enabled
        loss, subbatch_output = self.process_batch(next(batches),
                                                   enable_grad=True)
        info = batches.iterator_info()
        # gradients accumulate across subbatches until the step below
        self.calculate_grads(loss)
        accumulated += subbatch_output
        # while a step is not yet due the batch count is offset by one
        # before it is compared against the print interval
        if (info.batches_seen + int(not batches.take_step())) \
                % self.tracker.print_every == 0:
            logger.log(indent(info.subbatch_str(), " "), verbosity=2)
        # the iterator decides when the accumulated batch is complete
        step_due = batches.take_step()
    iteration_info.set_train_info(accumulated)
    # one optimizer step for the whole batch, normalised by its length
    self.step(grad_mod=grad_mod, denominator=accumulated.batch_length)
def onerror(e):
    """Email error callback that retries once without attachments.

    When ``check_attachment_error(e)`` is truthy the message is resent via
    ``self.email_sender.send_email`` with ``onerror_base`` as the error
    callback; any other error is handed straight to ``onerror_base``.
    """
    if not check_attachment_error(e):
        onerror_base(e)
        return
    logger.log(error_message + " Trying to send without attachment")
    self.email_sender.send_email(
        str(iteration_info),
        onfinish=onfinish,
        onerror=onerror_base,
    )
def spawn_function(email_sender):
    """Build a summarization model, train it for a short run, then test it.

    ``email_sender`` is accepted but not referenced anywhere in the body.

    NOTE(review): the training dataset is loaded from the same
    ``val_processed.data`` file as the validation dataset - confirm that
    this is intended.
    """
    seed_state()

    # model, wrapped for distributed training when a process group exists
    model = Model()
    if torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
        worldsize = torch.distributed.get_world_size()
        model = LDDP(model.to(rank), worldsize)

    # text pipeline
    vocab = load_vocab('/home/jered/Documents/data/cnn_dataset/vocab', 50000)
    tokenizer = Tokenizer(vocab)
    postprocessor = Postprocessor()
    batcher = TrainSummarizationBatcher(tokenizer)
    val_batcher = TestSummarizationBatcher(tokenizer)

    # datasets
    val_dataset = SummarizationDataset(
        '/home/jered/Documents/data/cnn_dataset/preprocessed/val_processed.data'
    )
    train_dataset = SummarizationDataset(
        '/home/jered/Documents/data/cnn_dataset/preprocessed/val_processed.data'
    )

    # iterators: validation runs for as many iterations as training does
    train_indices = init_indices_iterator(
        len(train_dataset), batch_size=15, random=True, iterations=200)
    batch_iterator = batcher.batch_iterator(
        train_dataset, train_indices, subbatches=None)
    val_indices = init_indices_iterator(
        100, batch_size=15, random=True,
        iterations=len(batch_iterator.indices_iterator))
    val_iterator = batcher.batch_iterator(
        val_dataset, val_indices, subbatches=None)

    # training
    optimizer = Adam(list(model.parameters()))
    tracker = Tracker(
        print_every=10,
        checkpoint_folder='test',
        checkpoint_every=7,
        copy_checkpoint_every=7,
    )
    trainer = Trainer(model, postprocessor, optimizer, batch_iterator,
                      val_iterator=val_iterator, tracker=tracker)
    logger.set_verbosity(2)
    trainer.train()

    # testing
    # NOTE(review): only the log call is placed under log_bool(); the
    # original nesting was lost when the source was flattened - confirm.
    if log_bool():
        logger.log("\n\nTESTING")
    test_indices = init_indices_iterator(100, batch_size=15)
    val_iterator = batcher.batch_iterator(
        val_dataset, test_indices, subbatches=None)
    tester = Tester(model, postprocessor, val_iterator)
    tester.test()
def copy_checkpoint(checkpoint, checkpoint_num,
                    onfinish=lambda: logger.log("done copying checkpoint")):
    """Snapshot the contents of a checkpoint folder into a numbered subfolder.

    Every entry of ``checkpoint`` except ``saved_checkpoints`` is copied
    recursively into ``<checkpoint>/saved_checkpoints/checkpoint<N>``, where
    ``N`` is ``checkpoint_num``.

    Args:
        checkpoint: path of the checkpoint folder to snapshot.
        checkpoint_num: integer used to name the snapshot folder.
        onfinish: zero-argument callable invoked once every copy succeeded.

    Raises:
        subprocess.CalledProcessError: if a ``cp`` invocation fails; in that
            case ``onfinish`` is not called.
    """
    # list the folder before creating the destination so that the snapshot
    # root never ends up among the entries to copy
    entries = set(os.listdir(checkpoint))
    # discard rather than remove: on the very first snapshot the
    # 'saved_checkpoints' folder does not exist yet
    entries.discard('saved_checkpoints')
    saved_checkpoint = os.path.join(checkpoint, 'saved_checkpoints',
                                    'checkpoint%i' % checkpoint_num)
    # creates missing parents and tolerates an already existing snapshot
    os.makedirs(saved_checkpoint, exist_ok=True)
    for entry in sorted(entries):
        # check=True so a failed copy is not reported as a finished snapshot
        subprocess.run(
            ["cp", "-r", os.path.join(checkpoint, entry), saved_checkpoint],
            check=True)
    onfinish()
def __call__(self, iteration_info):
    """Test a copy of the training model and persist the results.

    The weights of ``self.training_model`` are loaded into ``self.model``,
    which is then run through a ``Tester`` over ``self.val_dataset``.  The
    string form of the result is written to ``scores.txt`` inside a new
    ``results_<batches_seen>`` folder and, when an email sender is
    configured, sent by email as well.

    Args:
        iteration_info: supplies ``iterator_info.batches_seen``, used to
            name the results folder.
    """
    logger.log("Running Supervised Testing")
    # test with the current training weights
    self.model.load_state_dict(self.training_model.state_dict())

    # one results folder per tested iteration
    folder_name = 'results_%s' % iteration_info.iterator_info.batches_seen
    results_folder = os.path.join(self.results_folder, folder_name)
    os.mkdir(results_folder)
    self.postprocessor.add_output_dir(results_folder)

    indices = init_indices_iterator(len(self.val_dataset), self.batch_size)
    val_iterator = self.batcher.batch_iterator(
        self.val_dataset,
        indices,
        subbatches=self.subbatches,
        num_workers=self.num_workers,
    )
    total_output_batch = Tester(
        self.model, self.postprocessor, val_iterator).test()

    report = str(total_output_batch)
    scores_path = os.path.join(results_folder, 'scores.txt')
    with open(scores_path, 'w') as f:
        f.write(report)

    if self.email_sender is not None:
        attachments = self.postprocessor.get_summary_attachment_generator()
        self.email_sender("Testing is done!\n\n" + report,
                          attachments=attachments)
    # NOTE(review): placed outside the email branch; the original nesting
    # was lost when the source was flattened - confirm.
    logger.log("Testing is done!")
def register_iteration(self, iteration_info, trainer):
    """Record a finished iteration and run the periodic side tasks.

    ``iteration_info`` is stored on ``self.iteration_info``; when ``dist``
    is initialized it is replaced by the sum of the objects returned by
    ``collect_obj_on_rank0`` (or by ``None`` when that returns ``None``).
    If ``log_bool()`` is truthy, each task below runs when
    ``self.recurring_bool(iteration_info, <interval>)`` is truthy:
    printing, tensorboard writing, checkpoint saving, checkpoint copying
    and emailing.

    Args:
        iteration_info: info object of the iteration just finished; must
            expose ``iterator_info.batches_seen``.
        trainer: object whose ``save_state(folder)`` writes a checkpoint.

    NOTE(review): the nesting under ``log_bool()`` (including the final
    progress-bar update) is reconstructed from a flattened source -
    confirm against the original layout.
    """
    self.iteration_info = iteration_info
    if dist.is_initialized():
        gathered = collect_obj_on_rank0(
            self.iteration_info,
            ranks=self.iteration_info.iterator_info.subbatches.get_ranks())
        self.iteration_info = None if gathered is None else sum(gathered)

    if log_bool():
        if self.recurring_bool(iteration_info, self.print_every):
            logger.log(str(self.iteration_info))

        if len(self.summary_writers) > 0:
            if self.recurring_bool(iteration_info, self.tensorboard_every):
                self.iteration_info.write_to_tensorboard(
                    self.summary_writers)

        if self.checkpoint_folder is not None:
            # save state to file
            if self.recurring_bool(iteration_info, self.checkpoint_every):
                logger.log(
                    "saving checkpoint to %s, batches_seen: %i"
                    % (self.checkpoint_folder,
                       iteration_info.iterator_info.batches_seen))
                trainer.save_state(self.checkpoint_folder)
            # copy checkpoint
            if self.recurring_bool(iteration_info,
                                   self.copy_checkpoint_every):
                logger.log("copying to checkpoint number %i"
                           % iteration_info.iterator_info.batches_seen)
                self.copy_checkpoint_in_thread(
                    iteration_info.iterator_info.batches_seen)
                logger.log("continuing")

        # email
        if self.recurring_bool(iteration_info, self.email_every):
            logger.log(
                "sending email to %s, batches_seen: %i"
                % (self.email_sender.receiver_email,
                   iteration_info.iterator_info.batches_seen))
            if len(self.summary_writers) <= 0:
                attachments = []
            else:
                attachments = create_tensorboard_attachment_generator(
                    self.tensorboard_folder)

            def onfinish():
                # batches_seen is read when the callback runs, not here
                return logger.log(
                    "Done sending email at %i batches_seen"
                    % iteration_info.iterator_info.batches_seen)

            error_message = ("Error sending email at %i batches_seen!"
                             % iteration_info.iterator_info.batches_seen)

            def onerror_base(e):
                logger.log(error_message)
                raise e

            def onerror(e):
                # only attachment errors get a second, attachment-free try
                if not check_attachment_error(e):
                    onerror_base(e)
                    return
                logger.log(error_message
                           + " Trying to send without attachment")
                self.email_sender.send_email(str(iteration_info),
                                             onfinish=onfinish,
                                             onerror=onerror_base)

            # NOTE(review): the email body uses the ``iteration_info``
            # argument, not the collected ``self.iteration_info``.
            self.email_sender(str(iteration_info),
                              attachments=attachments,
                              onfinish=onfinish,
                              onerror=onerror)
            logger.log("continuing")

        # update progress bar
        self.pbar.update()
def copy_checkpoint_in_thread(self, batches_seen):
    """Start a thread that runs ``copy_checkpoint`` for this checkpoint.

    The thread is started and not joined.  ``copy_checkpoint`` receives
    ``self.checkpoint_folder``, ``batches_seen`` and a completion callback
    that logs the checkpoint number.

    Args:
        batches_seen: number passed to ``copy_checkpoint`` and used in the
            completion log message.
    """
    def report_done():
        return logger.log(
            "done copying to checkpoint number %i" % batches_seen)

    worker = Thread(
        target=copy_checkpoint,
        args=(self.checkpoint_folder, batches_seen, report_done),
    )
    worker.start()
def onerror_base(e):
    """Log ``error_message`` and re-raise the error ``e``.

    ``error_message`` is read from the enclosing scope.
    """
    logger.log(error_message)
    raise e
def log_section(self, i, dynamic_parameters):
    """Log a header for section ``i`` followed by its dynamic parameters.

    Args:
        i: section number, formatted with ``%i``.
        dynamic_parameters: object whose ``str()`` form is logged on the
            second line.
    """
    logger.log("Section %i" % i)
    parameters_text = str(dynamic_parameters)
    logger.log("\tDynamic" + parameters_text)
def onerror(e):
    """Email error callback that retries once without attachments.

    When ``check_attachment_error(e)`` is truthy the result text is resent
    through ``email_sender.send_email``; any other error is passed to
    ``default_onerror``.
    """
    if not check_attachment_error(e):
        default_onerror(e)
        return
    logger.log("Trying to send without attachment")
    email_sender.send_email(str(total_output_batch))
def default_onerror(e):
    """Log that sending the email failed, then re-raise ``e``."""
    logger.log("Error sending email!!!")
    raise e
def default_onfinish():
    """Log that the email was sent."""
    logger.log("Done sending email")