def close(self):
    if not log_bool():
        return
    for writer in self.summary_writers.values():
        writer.close()
    # the early return above already guarantees log_bool() here
    self.pbar.exit()
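# log_bool() gates every logging/IO path in this file but is not defined
# here; a minimal sketch (an assumption, not the repo's actual helper) is
# "only the rank-0 process logs" under torch.distributed:
#
# import torch.distributed as dist
#
# def log_bool():
#     return not dist.is_initialized() or dist.get_rank() == 0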
def __init__(self, pbar=None, print_every=1, checkpoint_every=1,
             copy_checkpoint_every=None, checkpoint_folder=None,
             tensorboard_every=1, summary_writers=('train', 'val'),
             needs_graph=True, purge_step=None, email_every=None,
             email_sender=None):
    self.print_every = print_every
    self.iteration_info = None
    if not log_bool():
        self.needs_graph = needs_graph
        return
    self.pbar = pbar if pbar is not None else ProgressBar()
    self.checkpoint_every = checkpoint_every
    self.copy_checkpoint_every = copy_checkpoint_every
    self.checkpoint_folder = checkpoint_folder
    if self.copy_checkpoint_every is not None \
            and self.checkpoint_folder is not None:
        self.saved_checkpoints_folder = os.path.join(
            self.checkpoint_folder, "saved_checkpoints")
        # create the folder in-process rather than shelling out to mkdir
        os.makedirs(self.saved_checkpoints_folder, exist_ok=True)
    # set up tensorboard
    self.tensorboard_every = tensorboard_every
    if checkpoint_folder is None:
        datetime_machine = datetime.now().strftime('%b%d_%H-%M-%S') \
            + '_' + socket.gethostname()
        self.tensorboard_folder = os.path.join('runs', datetime_machine)
    else:
        self.tensorboard_folder = os.path.join(checkpoint_folder,
                                               'tensorboard')
    self.summary_writers = {
        k: SummaryWriter(log_dir=os.path.join(self.tensorboard_folder, k),
                         purge_step=purge_step)
        for k in summary_writers
    }
    self.needs_graph = needs_graph
    # set up email (the early return above means this only runs when
    # log_bool() is true)
    self.email_every = email_every
    if self.email_every is not None:
        if email_sender is None:
            raise ValueError(
                "email_every was given but email_sender is None")
        self.email_sender = email_sender
def add_graph(self, model, batch):
    if not log_bool():
        return
    # unpack the observed batch into parallel key/value tuples
    keys, values = zip(*batch.get_observed().items())
    model = ModelWrapper(model, keys)
    for writer in self.summary_writers.values():
        writer.add_graph(model, values)
    self.needs_graph = False
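# ModelWrapper is not defined in this file. Since SummaryWriter.add_graph
# passes `values` to the wrapped module positionally, a minimal sketch (an
# assumption about the real class) is a thin nn.Module that re-packs the
# positional tensors into the keyword dict the underlying model expects:
#
# import torch.nn as nn
#
# class ModelWrapper(nn.Module):
#     def __init__(self, model, keys):
#         super().__init__()
#         self.model = model
#         self.keys = keys
#
#     def forward(self, *values):
#         # rebuild the {key: tensor} mapping that get_observed() produced
#         return self.model(**dict(zip(self.keys, values)))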
def spawn_function(email_sender):
    seed_state()
    model = Model()
    if torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
        worldsize = torch.distributed.get_world_size()
        # model = LDDP(model.to('cpu'), worldsize)
        model = LDDP(model.to(rank), worldsize)
    tokenizer = Tokenizer(
        load_vocab('/home/jered/Documents/data/cnn_dataset/vocab', 50000))
    postprocessor = Postprocessor()
    batcher = TrainSummarizationBatcher(tokenizer)
    val_batcher = TestSummarizationBatcher(tokenizer)
    val_dataset = SummarizationDataset(
        '/home/jered/Documents/data/cnn_dataset/preprocessed/val_processed.data')
    # NOTE: the train dataset points at the val split here, presumably for a
    # quick smoke test
    train_dataset = SummarizationDataset(
        '/home/jered/Documents/data/cnn_dataset/preprocessed/val_processed.data')
    # batch_iterator = batcher.batch_iterator(
    #     train_dataset,
    #     init_indices_iterator(len(train_dataset), batch_size=15,
    #                           random=True, epochs=2),
    #     subbatches=None)
    batch_iterator = batcher.batch_iterator(
        train_dataset,
        init_indices_iterator(len(train_dataset), batch_size=15, random=True,
                              iterations=200),
        subbatches=None)
    val_iterator = batcher.batch_iterator(
        val_dataset,
        init_indices_iterator(100, batch_size=15, random=True,
                              iterations=len(batch_iterator.indices_iterator)),
        subbatches=None)
    optimizer = Adam(model.parameters())
    tracker = Tracker(print_every=10, checkpoint_folder='test',
                      checkpoint_every=7, copy_checkpoint_every=7)
    # tracker = Tracker(..., email_every=10, email_sender=email_sender)
    trainer = Trainer(model, postprocessor, optimizer, batch_iterator,
                      val_iterator=val_iterator, tracker=tracker)
    logger.set_verbosity(2)
    trainer.train()  # trainer.train(use_pbar=False)
    if log_bool():
        logger.log("\n\nTESTING")
    val_iterator = batcher.batch_iterator(
        val_dataset,
        init_indices_iterator(100, batch_size=15),
        subbatches=None)
    tester = Tester(model, postprocessor, val_iterator)
    tester.test()
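# How spawn_function might be launched is not shown in this file. A minimal
# sketch (an assumption: the wrapper `_worker`, the backend choice, and the
# MASTER_ADDR/MASTER_PORT values are all illustrative) wraps it so
# torch.multiprocessing.spawn can hand each worker its rank and initialize
# the process group before spawn_function runs:
#
# import os
# import torch
# import torch.distributed as dist
# import torch.multiprocessing as mp
#
# def _worker(rank, world_size, email_sender):
#     os.environ.setdefault('MASTER_ADDR', 'localhost')
#     os.environ.setdefault('MASTER_PORT', '29500')
#     backend = 'nccl' if torch.cuda.is_available() else 'gloo'
#     dist.init_process_group(backend, rank=rank, world_size=world_size)
#     spawn_function(email_sender)
#     dist.destroy_process_group()
#
# if __name__ == '__main__':
#     mp.spawn(_worker, args=(2, None), nprocs=2)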
def register_iteration(self, iteration_info, trainer):
    super(Tracker, self).register_iteration(iteration_info, trainer)
    if log_bool() and self.recurring_bool(iteration_info,
                                          self.expensive_val_every):
        self.expensive_val_func(iteration_info)
def register_iteration(self, iteration_info, trainer):
    self.iteration_info = iteration_info
    if dist.is_initialized():
        collected = collect_obj_on_rank0(
            self.iteration_info,
            ranks=self.iteration_info.iterator_info.subbatches.get_ranks())
        if collected is not None:
            self.iteration_info = sum(collected)
        else:
            self.iteration_info = None
    if log_bool():
        if self.recurring_bool(iteration_info, self.print_every):
            logger.log(str(self.iteration_info))
        if len(self.summary_writers) > 0 and \
                self.recurring_bool(iteration_info, self.tensorboard_every):
            self.iteration_info.write_to_tensorboard(self.summary_writers)
        # save state to file
        if self.checkpoint_folder is not None \
                and self.recurring_bool(iteration_info,
                                        self.checkpoint_every):
            logger.log("saving checkpoint to %s, batches_seen: %i" % (
                self.checkpoint_folder,
                iteration_info.iterator_info.batches_seen))
            trainer.save_state(self.checkpoint_folder)
        # copy checkpoint
        if self.checkpoint_folder is not None \
                and self.recurring_bool(iteration_info,
                                        self.copy_checkpoint_every):
            logger.log("copying to checkpoint number %i"
                       % iteration_info.iterator_info.batches_seen)
            self.copy_checkpoint_in_thread(
                iteration_info.iterator_info.batches_seen)
            logger.log("continuing")
        # email
        if self.recurring_bool(iteration_info, self.email_every):
            logger.log("sending email to %s, batches_seen: %i" % (
                self.email_sender.receiver_email,
                iteration_info.iterator_info.batches_seen))
            attachments = [] if len(self.summary_writers) <= 0 else \
                create_tensorboard_attachment_generator(
                    self.tensorboard_folder)

            def onfinish():
                logger.log("Done sending email at %i batches_seen"
                           % iteration_info.iterator_info.batches_seen)

            error_message = "Error sending email at %i batches_seen!" \
                % iteration_info.iterator_info.batches_seen

            def onerror_base(e):
                logger.log(error_message)
                raise e

            def onerror(e):
                if check_attachment_error(e):
                    logger.log(error_message
                               + " Trying to send without attachment")
                    # retry without attachments, failing hard on any
                    # further error
                    self.email_sender.send_email(str(iteration_info),
                                                 onfinish=onfinish,
                                                 onerror=onerror_base)
                else:
                    onerror_base(e)

            self.email_sender(str(iteration_info), attachments=attachments,
                              onfinish=onfinish, onerror=onerror)
            logger.log("continuing")
        # update progress bar (self.pbar only exists on logging ranks)
        self.pbar.update()
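# recurring_bool is used throughout but not defined in this file; a minimal
# sketch (an assumption about the real helper) treats `every=None` as
# "never" and otherwise fires once every `every` batches, which matches the
# None defaults for copy_checkpoint_every and email_every above:
#
# def recurring_bool(self, iteration_info, every):
#     if every is None:
#         return False
#     return iteration_info.iterator_info.batches_seen % every == 0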
def enter(self, *args, **kwargs):
    if log_bool():
        self.pbar.enter(*args, **kwargs)
def enter(self, *args, **kwargs):
    if log_bool():
        self.pbar = self.init_pbar(*args, **kwargs)
        logger.add_progress_bar(tqdm)
def exit(self):
    if log_bool():
        self.pbar.close()
        self.pbar = None
        logger.remove_progress_bar()