Example #1
 def close(self):
     # only the logging process owns summary writers and the progress bar
     if not log_bool():
         return
     for writer in self.summary_writers.values():
         writer.close()
     if log_bool():
         self.pbar.exit()
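The log_bool() guard recurs throughout these snippets; a minimal sketch of such a helper, assuming it simply restricts logging work to rank 0 when torch.distributed is in use (an assumption, not the project's actual definition):

import torch.distributed as dist

def log_bool():
    # log, checkpoint, and drive the progress bar only on rank 0;
    # single-process runs always pass the check
    return not dist.is_initialized() or dist.get_rank() == 0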
Example #2
 def __init__(self,
              pbar=None,
              print_every=1,
              checkpoint_every=1,
              copy_checkpoint_every=None,
              checkpoint_folder=None,
              tensorboard_every=1,
              summary_writers=['train', 'val'],
              needs_graph=True,
              purge_step=None,
              email_every=None,
              email_sender=None):
     self.print_every = print_every
     self.iteration_info = None
     if not log_bool():
         self.needs_graph = needs_graph
         return
     self.pbar = pbar if pbar is not None else ProgressBar()
     self.checkpoint_every = checkpoint_every
     self.copy_checkpoint_every = copy_checkpoint_every
     self.checkpoint_folder = checkpoint_folder
     if self.copy_checkpoint_every is not None\
        and self.checkpoint_folder is not None:
         self.saved_checkpoints_folder = os.path.join(
             self.checkpoint_folder, "saved_checkpoints")
         if not os.path.exists(self.saved_checkpoints_folder):
             subprocess.run(["mkdir", self.saved_checkpoints_folder])
     # set up tensorboard
     self.tensorboard_every = tensorboard_every
     if checkpoint_folder is None:
         datetime_machine = datetime.now().strftime('%b%d_%H-%M-%S')\
                            + '_' + socket.gethostname()
         self.tensorboard_folder = os.path.join('runs', datetime_machine)
     else:
         self.tensorboard_folder = os.path.join(checkpoint_folder,
                                                'tensorboard')
     self.summary_writers = {
         k: SummaryWriter(log_dir=self.tensorboard_folder + '/' + k,
                          purge_step=purge_step)
         for k in summary_writers
     }
     self.needs_graph = needs_graph
     # set up email
     self.email_every = email_every
     if self.email_every is not None and log_bool():
         if email_sender is None:
             raise ValueError("email_every was set but no email_sender was given")
         self.email_sender = email_sender
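copy_checkpoint_in_thread (called in Example #6) is not shown in these examples; a hedged, method-level sketch of what it could do with the saved_checkpoints_folder created above, assuming a background shutil copy keyed by batches_seen (a hypothetical helper, not the project's code):

import os
import shutil
import threading

def copy_checkpoint_in_thread(self, batches_seen):
    # copy the live checkpoint folder to a numbered snapshot in the
    # background so training is not blocked by disk I/O
    dest = os.path.join(self.saved_checkpoints_folder, str(batches_seen))
    thread = threading.Thread(
        target=shutil.copytree,
        args=(self.checkpoint_folder, dest),
        kwargs={'ignore': shutil.ignore_patterns('saved_checkpoints')})
    thread.start()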
Example #3
 def add_graph(self, model, batch):
     if not log_bool():
         return
     # split the observed batch dict into parallel key/value tuples so the
     # wrapped model can be traced with positional inputs
     keys, values = list(
         zip(*((k, v) for k, v in batch.get_observed().items())))
     model = ModelWrapper(model, keys)
     for writer in self.summary_writers.values():
         writer.add_graph(model, values)
     self.needs_graph = False
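ModelWrapper is referenced but not shown; a plausible sketch, assuming it only adapts a model whose forward takes keyword arguments to the positional interface that SummaryWriter.add_graph traces (hypothetical, not the project's implementation):

import torch.nn as nn

class ModelWrapper(nn.Module):
    def __init__(self, model, keys):
        super().__init__()
        self.model = model
        self.keys = keys

    def forward(self, *values):
        # map the positional tracing inputs back to the model's keyword args
        return self.model(**dict(zip(self.keys, values)))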
Example #4
def spawn_function(email_sender):
    seed_state()
    model = Model()
    if torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
        worldsize = torch.distributed.get_world_size()
        #        model = LDDP(model.to('cpu'), worldsize)
        model = LDDP(model.to(rank), worldsize)
    tokenizer = Tokenizer(
        load_vocab('/home/jered/Documents/data/cnn_dataset/vocab', 50000))
    postprocessor = Postprocessor()
    batcher = TrainSummarizationBatcher(tokenizer)
    val_batcher = TestSummarizationBatcher(tokenizer)
    val_dataset = SummarizationDataset(
        '/home/jered/Documents/data/cnn_dataset/preprocessed/val_processed.data'
    )
    # note: this example points the training split at the same val file
    train_dataset = SummarizationDataset(
        '/home/jered/Documents/data/cnn_dataset/preprocessed/val_processed.data'
    )
    #    batch_iterator = batcher.batch_iterator(train_dataset, init_indices_iterator(len(train_dataset), batch_size=15, random=True, epochs=2), subbatches=None)
    batch_iterator = batcher.batch_iterator(train_dataset,
                                            init_indices_iterator(
                                                len(train_dataset),
                                                batch_size=15,
                                                random=True,
                                                iterations=200),
                                            subbatches=None)
    val_iterator = batcher.batch_iterator(
        val_dataset,
        init_indices_iterator(100,
                              batch_size=15,
                              random=True,
                              iterations=len(batch_iterator.indices_iterator)),
        subbatches=None)
    optimizer = Adam(model.parameters())
    tracker = Tracker(
        print_every=10,
        checkpoint_folder='test',
        checkpoint_every=7,
        copy_checkpoint_every=7)  #, email_every=10, email_sender=email_sender)
    trainer = Trainer(model,
                      postprocessor,
                      optimizer,
                      batch_iterator,
                      val_iterator=val_iterator,
                      tracker=tracker)
    logger.set_verbosity(2)
    trainer.train()  #, use_pbar=False)
    if log_bool():
        logger.log("\n\nTESTING")
    val_iterator = batcher.batch_iterator(val_dataset,
                                          init_indices_iterator(100,
                                                                batch_size=15),
                                          subbatches=None)
    tester = Tester(model, postprocessor, val_iterator)
    tester.test()
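A hedged sketch of how spawn_function might be launched across processes, assuming torch.multiprocessing.spawn plus a per-rank wrapper that joins the process group (everything except spawn_function itself is illustrative):

import os
import torch.distributed as dist
import torch.multiprocessing as mp

def _worker(rank, world_size, email_sender):
    # each spawned process joins the group, then runs the training function
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('gloo', rank=rank, world_size=world_size)
    spawn_function(email_sender)
    dist.destroy_process_group()

if __name__ == '__main__':
    mp.spawn(_worker, args=(2, None), nprocs=2)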
Example #5
 def register_iteration(self, iteration_info, trainer):
     super(Tracker, self).register_iteration(iteration_info, trainer)
     if log_bool():
         if self.recurring_bool(iteration_info, self.expensive_val_every):
             self.expensive_val_func(iteration_info)
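recurring_bool is used here and in the next example; a minimal, method-level sketch of the check it might perform, assuming it fires every `every` batches and treats a None interval as disabled (a guess, not the project's definition):

def recurring_bool(self, iteration_info, every):
    # fire every `every` batches; a None interval disables the action
    if every is None:
        return False
    return iteration_info.iterator_info.batches_seen % every == 0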
Example #6
    def register_iteration(self, iteration_info, trainer):
        self.iteration_info = iteration_info
        if dist.is_initialized():
            collected = collect_obj_on_rank0(
                self.iteration_info,
                ranks=self.iteration_info.iterator_info.subbatches.get_ranks())
            if collected is not None:
                self.iteration_info = sum(collected)
            else:
                self.iteration_info = None
        if log_bool():
            if self.recurring_bool(iteration_info, self.print_every):
                logger.log(str(self.iteration_info))
            if len(self.summary_writers) > 0 and\
               self.recurring_bool(iteration_info, self.tensorboard_every):
                self.iteration_info.write_to_tensorboard(self.summary_writers)
            # save state to file
            if self.checkpoint_folder is not None\
               and self.recurring_bool(iteration_info, self.checkpoint_every):
                logger.log("saving checkpoint to %s, batches_seen: %i" %
                           (self.checkpoint_folder,
                            iteration_info.iterator_info.batches_seen))
                trainer.save_state(self.checkpoint_folder)
            # copy checkpoint
            if self.checkpoint_folder is not None\
               and self.recurring_bool(iteration_info, self.copy_checkpoint_every):
                logger.log("copying to checkpoint number %i" %
                           iteration_info.iterator_info.batches_seen)
                self.copy_checkpoint_in_thread(
                    iteration_info.iterator_info.batches_seen)
                logger.log("continuing")
            # email
            if self.recurring_bool(iteration_info, self.email_every):
                logger.log("sending email to %s, batches_seen: %i" %
                           (self.email_sender.receiver_email,
                            iteration_info.iterator_info.batches_seen))
                attachments = [] if len(self.summary_writers) <= 0 else\
                              create_tensorboard_attachment_generator(
                                  self.tensorboard_folder)
                onfinish = lambda: logger.log(
                    "Done sending email at %i batches_seen" % iteration_info.
                    iterator_info.batches_seen)
                error_message = \
                    "Error sending email at %i batches_seen!" %\
                    iteration_info.iterator_info.batches_seen

                def onerror_base(e):
                    logger.log(error_message)
                    raise e

                def onerror(e):
                    if check_attachment_error(e):
                        logger.log(error_message +
                                   " Trying to send without attachment")
                        self.email_sender.send_email(str(iteration_info),
                                                     onfinish=onfinish,
                                                     onerror=onerror_base)
                    else:
                        onerror_base(e)

                self.email_sender(str(iteration_info),
                                  attachments=attachments,
                                  onfinish=onfinish,
                                  onerror=onerror)
                logger.log("continuing")
            # update progress bar
            self.pbar.update()
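collect_obj_on_rank0 is not shown in these examples; a rough sketch of the behaviour the call above relies on, assuming picklable objects are gathered onto rank 0 and every other rank receives None (the use of gather_object and the ranks filter are assumptions):

import torch.distributed as dist

def collect_obj_on_rank0(obj, ranks):
    # gather one object per process onto rank 0; other ranks get None back
    gathered = [None] * dist.get_world_size() if dist.get_rank() == 0 else None
    dist.gather_object(obj, gathered, dst=0)
    if dist.get_rank() != 0:
        return None
    # keep only the entries contributed by the requested ranks
    return [gathered[r] for r in ranks]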
Example #7
 def enter(self, *args, **kwargs):
     if log_bool():
         self.pbar.enter(*args, **kwargs)
Example #8
 def enter(self, *args, **kwargs):
     if log_bool():
         self.pbar = self.init_pbar(*args, **kwargs)
     logger.add_progress_bar(tqdm)
Example #9
 def exit(self):
     if log_bool():
         self.pbar.close()
         self.pbar = None
     logger.remove_progress_bar()
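init_pbar (called in Example #8) is not shown; a minimal, method-level sketch, assuming it simply wraps tqdm so that the update() and close() calls in Examples #6 and #9 work (the parameters are illustrative):

from tqdm import tqdm

def init_pbar(self, total=None, desc='train'):
    # a plain tqdm bar supports the update()/close() calls used elsewhere
    return tqdm(total=total, desc=desc)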