Exemple #1
0
 def onerror(e):
     """Handle a failed email send, retrying without the attachment if possible.

     Logs the failure first. If ``check_attachment_error(e)`` is truthy, a
     fallback email is sent through ``es``; otherwise ``e`` is re-raised.

     Args:
         e: the exception raised by the original send attempt.

     Raises:
         The same exception ``e`` when it is not an attachment error.
     """
     logger.log("Error sending email")
     # Anything other than an attachment problem is not recoverable here.
     if not check_attachment_error(e):
         raise e
     logger.log("Trying to send without attachment")
     es.send_email("email2")
Exemple #2
0
 def iteration_trainstep(self, iteration_info, grad_mod=None):
     """Run one training iteration: accumulate over subbatches, then step.

     Subbatches are pulled from ``self.train_iterator`` until its
     ``take_step()`` returns a truthy value. Each subbatch is processed with
     gradients enabled, its gradients are accumulated through
     ``self.calculate_grads`` and its output batch is added to a running
     total. The total is stored on ``iteration_info`` and its
     ``batch_length`` is used as the denominator of the gradient step.

     Args:
         iteration_info: receives the accumulated training info through
             ``set_train_info``.
         grad_mod: forwarded unchanged to ``self.step``.
     """
     # The total starts as the int 0, so the output batch type has to support
     # being added to it.
     batch_total = 0
     finished = False
     while not finished:
         # Forward pass on the next subbatch, gradients enabled.
         loss, subbatch_output = self.process_batch(
             next(self.train_iterator), enable_grad=True)
         progress = self.train_iterator.iterator_info()
         # Backward pass: gradients accumulate across subbatches.
         self.calculate_grads(loss)
         batch_total += subbatch_output
         # Decide whether this subbatch should be logged. The count is bumped
         # by one while the iterator is not yet asking for a gradient step.
         # NOTE(review): presumably this makes the count refer to the batch
         # in progress - confirm against the iterator's batches_seen.
         seen = progress.batches_seen
         pending = int(not self.train_iterator.take_step())
         if (seen + pending) % self.tracker.print_every == 0:
             subbatch_report = indent(progress.subbatch_str(), "        ")
             logger.log(subbatch_report, verbosity=2)
         # The iterator decides when the whole batch has been consumed.
         finished = self.train_iterator.take_step()
     iteration_info.set_train_info(batch_total)
     # Per the original author's note: the loss is summed over all subbatches
     # on all devices, so the step divides by the number of instances.
     self.step(grad_mod=grad_mod,
               denominator=batch_total.batch_length)
Exemple #3
0
 def onerror(e):
     """Retry a failed email without attachments, or defer to the base handler.

     When ``check_attachment_error(e)`` is truthy, the failure is logged and
     the email is re-sent through ``self.email_sender.send_email`` with no
     attachments, using ``onerror_base`` as the error handler for the retry.
     Any other error goes straight to ``onerror_base``.

     Args:
         e: the exception raised by the original send attempt.
     """
     if not check_attachment_error(e):
         onerror_base(e)
         return
     retry_notice = error_message + " Trying to send without attachment"
     logger.log(retry_notice)
     # The retry carries only the text body; a second failure is final.
     self.email_sender.send_email(str(iteration_info),
                                  onfinish=onfinish,
                                  onerror=onerror_base)
Exemple #4
0
def spawn_function(email_sender):
    """Build a small summarization setup, train it, then run a test pass.

    Seeds the random state, creates the model (wrapped in ``LDDP`` when
    ``torch.distributed`` is initialized), builds training and validation
    iterators, trains with a ``Trainer``, then evaluates with a ``Tester``.

    Args:
        email_sender: not used in this function; the tracker's email options
            are currently disabled.
    """
    vocab_path = '/home/jered/Documents/data/cnn_dataset/vocab'
    val_data_path = (
        '/home/jered/Documents/data/cnn_dataset/preprocessed/val_processed.data'
    )

    seed_state()
    model = Model()
    if torch.distributed.is_initialized():
        # In a distributed run the model is moved to this process's rank.
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
        model = LDDP(model.to(rank), world_size)

    vocab = load_vocab(vocab_path, 50000)
    tokenizer = Tokenizer(vocab)
    postprocessor = Postprocessor()
    batcher = TrainSummarizationBatcher(tokenizer)
    # NOTE(review): val_batcher is built but never used below; the validation
    # and test iterators both come from the training batcher. Confirm intent.
    val_batcher = TestSummarizationBatcher(tokenizer)
    val_dataset = SummarizationDataset(val_data_path)
    # NOTE(review): the training set is loaded from the validation file.
    # Presumably a deliberate shortcut for a quick run - confirm.
    train_dataset = SummarizationDataset(val_data_path)

    train_indices = init_indices_iterator(len(train_dataset),
                                          batch_size=15,
                                          random=True,
                                          iterations=200)
    batch_iterator = batcher.batch_iterator(train_dataset,
                                            train_indices,
                                            subbatches=None)
    # The validation iterator runs as many iterations as the training one.
    val_indices = init_indices_iterator(
        100,
        batch_size=15,
        random=True,
        iterations=len(batch_iterator.indices_iterator))
    val_iterator = batcher.batch_iterator(val_dataset,
                                          val_indices,
                                          subbatches=None)

    optimizer = Adam(list(model.parameters()))
    tracker = Tracker(print_every=10,
                      checkpoint_folder='test',
                      checkpoint_every=7,
                      copy_checkpoint_every=7)
    trainer = Trainer(model,
                      postprocessor,
                      optimizer,
                      batch_iterator,
                      val_iterator=val_iterator,
                      tracker=tracker)
    logger.set_verbosity(2)
    trainer.train()

    if log_bool():
        logger.log("\n\nTESTING")
    # A fresh iterator over the validation data for the final test pass.
    test_indices = init_indices_iterator(100, batch_size=15)
    test_iterator = batcher.batch_iterator(val_dataset,
                                           test_indices,
                                           subbatches=None)
    tester = Tester(model, postprocessor, test_iterator)
    tester.test()
Exemple #5
0
def copy_checkpoint(checkpoint,
                    checkpoint_num,
                    onfinish=lambda: logger.log("done copying checkpoint")):
    """Snapshot a checkpoint folder into ``saved_checkpoints/checkpoint<N>``.

    Every entry of ``checkpoint`` except the ``saved_checkpoints`` folder
    itself is copied into
    ``<checkpoint>/saved_checkpoints/checkpoint<checkpoint_num>``. The
    destination, including ``saved_checkpoints``, is created when missing,
    and copying into an existing snapshot overwrites its files.

    Args:
        checkpoint: path of the live checkpoint folder.
        checkpoint_num: integer used to name the snapshot folder.
        onfinish: zero-argument callable invoked once all entries are copied.

    Raises:
        OSError: if the folder cannot be listed or the destination created.
        shutil.Error: if copying a directory tree fails. ``onfinish`` is not
            called when the copy fails.
    """
    # Imported here so the block does not rely on a file-level import.
    import shutil

    snapshots_dirname = 'saved_checkpoints'
    saved_checkpoint = os.path.join(checkpoint, snapshots_dirname,
                                    'checkpoint%i' % checkpoint_num)
    # List before creating the destination. The snapshot folder is skipped so
    # earlier snapshots are never nested inside new ones; it may not exist
    # yet on the first copy.
    entries = [entry for entry in os.listdir(checkpoint)
               if entry != snapshots_dirname]
    os.makedirs(saved_checkpoint, exist_ok=True)
    for entry in entries:
        source = os.path.join(checkpoint, entry)
        target = os.path.join(saved_checkpoint, entry)
        if os.path.isdir(source):
            shutil.copytree(source, target, dirs_exist_ok=True)
        else:
            shutil.copy2(source, target)
    onfinish()
Exemple #6
0
 def __call__(self, iteration_info):
     """Evaluate the current training weights on the validation dataset.

     Copies the training model's state into ``self.model``, creates a
     results folder named after the number of batches seen, runs a ``Tester``
     over the validation data, writes the total output batch to
     ``scores.txt`` and, when an email sender is configured, emails it.

     Args:
         iteration_info: its ``iterator_info.batches_seen`` names the
             results folder.
     """
     logger.log("Running Supervised Testing")
     # Evaluate with a copy of the weights currently being trained.
     self.model.load_state_dict(self.training_model.state_dict())

     batches_seen = iteration_info.iterator_info.batches_seen
     results_folder = os.path.join(self.results_folder,
                                   'results_%s' % batches_seen)
     os.mkdir(results_folder)
     self.postprocessor.add_output_dir(results_folder)

     indices = init_indices_iterator(len(self.val_dataset), self.batch_size)
     val_iterator = self.batcher.batch_iterator(self.val_dataset,
                                                indices,
                                                subbatches=self.subbatches,
                                                num_workers=self.num_workers)
     total_output_batch = Tester(self.model, self.postprocessor,
                                 val_iterator).test()

     scores_path = os.path.join(results_folder, 'scores.txt')
     with open(scores_path, 'w') as f:
         f.write(str(total_output_batch))

     if self.email_sender is not None:
         attachments = self.postprocessor.get_summary_attachment_generator()
         body = "Testing is done!\n\n" + str(total_output_batch)
         self.email_sender(body, attachments=attachments)
     logger.log("Testing is done!")
Exemple #7
0
    def register_iteration(self, iteration_info, trainer):
        """Record the latest iteration and run the periodic tracking actions.

        Stores ``iteration_info`` on the tracker. When ``dist`` is
        initialized, the info objects are gathered with
        ``collect_obj_on_rank0`` and summed; if the gather returns ``None``
        the stored info becomes ``None``. When ``log_bool()`` is truthy, each
        of the following runs whenever ``self.recurring_bool`` accepts its
        interval: logging, tensorboard writing, checkpoint saving, checkpoint
        copying in a thread, and emailing. The progress bar is then updated.

        NOTE(review): logging and tensorboard use ``self.iteration_info``
        (the collected value), while the interval checks, log messages and
        the email body use the ``iteration_info`` argument. Confirm that this
        split is intended.

        Args:
            iteration_info: info object for the iteration that just finished;
                its ``iterator_info.batches_seen`` appears in log messages.
            trainer: its ``save_state(folder)`` is called to write a
                checkpoint.
        """
        self.iteration_info = iteration_info
        if dist.is_initialized():
            # Gather the info from the ranks reported by the subbatches.
            collected = collect_obj_on_rank0(
                self.iteration_info,
                ranks=self.iteration_info.iterator_info.subbatches.get_ranks())
            if collected is not None:
                self.iteration_info = sum(collected)
            else:
                # Presumably the case on every rank other than rank 0.
                self.iteration_info = None
        if log_bool():
            # print the collected iteration info
            if self.recurring_bool(iteration_info, self.print_every):
                logger.log(str(self.iteration_info))
            # write to tensorboard, only when summary writers are registered
            if len(self.summary_writers) > 0 and\
               self.recurring_bool(iteration_info, self.tensorboard_every):
                self.iteration_info.write_to_tensorboard(self.summary_writers)
            # save state to file
            if self.checkpoint_folder is not None\
               and self.recurring_bool(iteration_info, self.checkpoint_every):
                logger.log("saving checkpoint to %s, batches_seen: %i" %
                           (self.checkpoint_folder,
                            iteration_info.iterator_info.batches_seen))
                trainer.save_state(self.checkpoint_folder)
            # copy checkpoint (delegated to copy_checkpoint_in_thread)
            if self.checkpoint_folder is not None\
               and self.recurring_bool(iteration_info, self.copy_checkpoint_every):
                logger.log("copying to checkpoint number %i" %
                           iteration_info.iterator_info.batches_seen)
                self.copy_checkpoint_in_thread(
                    iteration_info.iterator_info.batches_seen)
                logger.log("continuing")
            # email
            # NOTE(review): there is no explicit check that self.email_sender
            # is set; presumably recurring_bool is falsy when email_every is
            # unset - verify.
            if self.recurring_bool(iteration_info, self.email_every):
                logger.log("sending email to %s, batches_seen: %i" %
                           (self.email_sender.receiver_email,
                            iteration_info.iterator_info.batches_seen))
                # Attach tensorboard output only when summary writers exist.
                attachments = [] if len(self.summary_writers) <= 0 else\
                              create_tensorboard_attachment_generator(
                                  self.tensorboard_folder)
                onfinish = lambda: logger.log(
                    "Done sending email at %i batches_seen" % iteration_info.
                    iterator_info.batches_seen)
                error_message = \
                    "Error sending email at %i batches_seen!" %\
                    iteration_info.iterator_info.batches_seen

                # Final handler: log the failure and re-raise the exception.
                def onerror_base(e):
                    logger.log(error_message)
                    raise e

                # First-attempt handler: on an attachment error, retry once
                # without attachments; everything else goes to onerror_base.
                def onerror(e):
                    if check_attachment_error(e):
                        logger.log(error_message +
                                   " Trying to send without attachment")
                        self.email_sender.send_email(str(iteration_info),
                                                     onfinish=onfinish,
                                                     onerror=onerror_base)
                    else:
                        onerror_base(e)

                self.email_sender(str(iteration_info),
                                  attachments=attachments,
                                  onfinish=onfinish,
                                  onerror=onerror)
                logger.log("continuing")
            # update progress bar
            self.pbar.update()
Exemple #8
0
 def copy_checkpoint_in_thread(self, batches_seen):
     """Start a thread that copies the checkpoint folder to a numbered copy.

     The thread runs ``copy_checkpoint`` on ``self.checkpoint_folder`` with
     ``batches_seen`` as the checkpoint number. This method returns as soon
     as the thread is started; it does not join it.

     Args:
         batches_seen: checkpoint number, also used in the completion log.
     """
     def report_done():
         logger.log("done copying to checkpoint number %i" % batches_seen)

     copier = Thread(target=copy_checkpoint,
                     args=(self.checkpoint_folder, batches_seen, report_done))
     copier.start()
Exemple #9
0
 def onerror_base(e):
     """Log the enclosing scope's ``error_message`` and re-raise ``e``.

     Args:
         e: the exception raised by the failed send attempt.

     Raises:
         The same exception ``e``, always.
     """
     logger.log(error_message)
     raise e
Exemple #10
0
 def log_section(self, i, dynamic_parameters):
     """Log a section header followed by its dynamic parameters.

     Args:
         i: section number, formatted with ``%i``.
         dynamic_parameters: object whose ``str()`` goes on the second line.
     """
     section_header = "Section %i" % i
     logger.log(section_header)
     parameters_line = "\tDynamic" + str(dynamic_parameters)
     logger.log(parameters_line)
 def onerror(e):
     """Retry a failed email without attachments, or use the default handler.

     When ``check_attachment_error(e)`` is truthy, the text of
     ``total_output_batch`` is re-sent through ``email_sender.send_email``
     with no attachments. Any other error is passed to ``default_onerror``.

     Args:
         e: the exception raised by the original send attempt.
     """
     if not check_attachment_error(e):
         default_onerror(e)
         return
     logger.log("Trying to send without attachment")
     email_sender.send_email(str(total_output_batch))
Exemple #12
0
def default_onerror(e):
    """Log that sending an email failed, then re-raise the error.

    Args:
        e: the exception raised by the failed send attempt.

    Raises:
        The same exception ``e``, always.
    """
    logger.log("Error sending email!!!")
    raise e
Exemple #13
0
def default_onfinish():
    """Log that an email was sent successfully."""
    logger.log("Done sending email")