def predict(self, data: list, decode: bool = True) -> list:
    """Given raw data (unprocessed), run the prediction pipeline and return predictions.

    Args:
        data: Raw data, following the expected format for the task.
        decode: Whether the model output should be decoded. Default: True.

    Returns:
        res: Result of running the pipeline.
    """
    assert len(data) > 0, "predict() needs at least one example"
    data_as_batches = self.processor(data, list_input=True, as_batches=True)
    # Run the model batch by batch, then stitch the batch outputs back together
    res = self.processor._from_batches(
        [self.model(U.move_batch(batch, self.device)) for batch in data_as_batches]
    )
    if decode:
        res = self.processor.decode(res, list_input=True)
    return res
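# Hypothetical usage sketch (the names `pipeline` and `load_raw_examples` are
# illustrative, not part of this class): assuming the processor was set up for
# the task, prediction on raw inputs would look like:
#
#     raw_examples = load_raw_examples()                      # task-formatted inputs
#     decoded = pipeline.predict(raw_examples)                # human-readable outputs
#     encoded = pipeline.predict(raw_examples, decode=False)  # undecoded model outputs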
def _evaluate_batches(self, data_as_batches):
    self.evaluator.reset()
    val_loss = None
    for b in data_as_batches:
        res = self.model(U.move_batch(b, self.device))
        val_loss = self.evaluator.update_batch_loss(res)
    if val_loss is None:
        raise ValueError(
            "Iteration over batches did not happen. "
            "Probably because batch_size is larger than the split "
            "and drop_last is true."
        )
    return val_loss
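# Minimal sketch of the evaluator contract assumed by _evaluate_batches
# (inferred from usage here; the real evaluator may track several metrics):
#
#     class MeanLossEvaluator:
#         def reset(self):
#             self.total, self.n = 0.0, 0
#
#         def update_batch_loss(self, res):
#             # `res` is one batch of model outputs; returns the running losses
#             self.total += float(res["loss"])
#             self.n += 1
#             return [self.total / self.n]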
def train_model(self, data: list = None) -> dict:
    """Train the model on the data.

    Args:
        data: The data to be trained on. If not provided, the default
            training split will be used.

    Returns:
        The resulting config after training.
    """

    def log(
        log_time,
        total_time,
        epoch,
        iteration,
        global_step,
        total_train_loss,
        processed_examples,
        prefix="",
    ):
        if not val_batches:
            self.logger.info(
                prefix
                + "Epoch: {}: {}/{} Train duration(s): {:.1f}\t "
                "Train loss (~avg over batches): {:.4f} | {:.1f} Ex. per sec".format(
                    epoch,
                    iteration * self.batch_size,
                    len(train_batches) * self.batch_size,
                    total_time,
                    total_train_loss,
                    processed_examples / total_time,
                )
            )
            results["during_training"][str(epoch)] = {
                "train_loss": float(total_train_loss)
            }
            self.model.eval()
            # Predict a sample from the train split
            train_res = self.predict(
                self.processor.get_sample(
                    indx=0, size=self.sample_limit, split=0, raw=True
                )
            )
            self.processor.save_split(
                train_res, f"train_prediction_{epoch}_{iteration}"
            )
            self.model.save()
        else:
            self.logger.debug(f"Length of val_batches: {len(val_batches)}")
            self.model.eval()
            # Predict a sample from the train split
            train_res = self.predict(
                self.processor.get_sample(
                    indx=0, size=self.sample_limit, split=0, raw=True
                )
            )
            self.processor.save_split(
                train_res, f"train_prediction_{epoch}_{iteration}"
            )
            # Predict a sample from the dev split
            dev_res = self.predict(
                self.processor.get_sample(
                    indx=0, size=self.sample_limit, split=1, raw=True
                )
            )
            self.processor.save_split(dev_res, f"dev_prediction_{epoch}_{iteration}")
            # Evaluate on the validation batches
            val_loss = self._evaluate_batches(val_batches)
            val_loss_str = ",".join(
                "{:.4f}".format(v) for v in val_loss
            )  # TODO: move formatting into the evaluator
            results["during_training"][str(epoch)] = {
                "train_loss": float(total_train_loss),
                "val_loss": val_loss_str,
            }
            self.log_tf(val_loss, global_step, "_validation")
            saved_model = ""
            if self.evaluator.isbetter(val_loss, self.best_loss, is_metric=False):
                self.best_loss = val_loss
                # Found the best model so far; save it
                self.model.save()
                results["best"] = results["during_training"][str(epoch)]
                saved_model = f" Best model saved in: {self.model.model_path}"
            self.logger.info(
                prefix
                + "Epoch: {}: {}/{} Train duration(s): {:.1f}\t "
                "Train loss (~avg over batches): {:.4f}, validation loss: {} "
                "| {:.1f} Ex. per sec | {}".format(
                    epoch,
                    iteration * self.batch_size,
                    len(train_batches) * self.batch_size,
                    total_time,
                    total_train_loss,
                    val_loss_str,
                    processed_examples / total_time,
                    saved_model,
                )
            )
        # Save config
        with open(self.out_path, "w") as f:
            f.write(U.safe_serialize(config))

    # Generate and process data
    config = self.config
    config_pretty = pprint.pformat(config)
    self.logger.info("Configurations: " + "-" * 50)
    self.logger.info(config_pretty)
    self.logger.info("--" * 50)

    val_batches = None
    test_batches = None
    if not data:
        data = self.processor.get_data()
    train_batches = data[0]
    self.epoch_size = len(train_batches) * self.batch_size
    self.total_size = self.epochs * self.epoch_size
    if len(data) > 1:
        val_batches = data[1]
    if len(data) > 2:
        test_batches = data[2]

    self.logger.info("Sample raw example from train:")
    self.logger.info(self.processor.get_sample(0))
    self.logger.info("After encoding:")
    sample_ = self.processor.encode(self.processor.get_sample(0))
    self.logger.info(sample_)
    self.logger.info("After decoding:")
    sample_ = self.processor.decode(sample_)
    self.logger.info(sample_)
    self.logger.info("--" * 50)

    # Start training
    self.best_loss = self.evaluator.get_worst_loss()
    results = config["results"]
    self.logger.info(
        f"Training Size: {len(train_batches) * self.batch_size} examples."
    )
    self.logger.info("==" * 50)
    self.logger.info(" " * 20 + "Starting of Training")
    self.logger.info("==" * 50)

    # Training epochs
    for i in range(self.current_epoch, self.current_epoch + self.epochs):
        self.model.train()  # Set model to train mode
        total_train_loss = 0
        total_train_batches = 0
        processed_examples = 0
        start = datetime.datetime.now()
        iteration_beg = datetime.datetime.now()
        for b_num, b in enumerate(train_batches):
            total_train_batches += 1
            self.model.zero_grad()
            preds = self.model(U.move_batch(b, self.device))
            tloss = self.evaluator(preds)
            # .item() detaches the scalar so the graph is not kept across batches
            total_train_loss += tloss.item()
            tloss.backward()
            if self.clip:
                torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(), self.clip, norm_type=2
                )
            self.optimizer.step()
            global_step = (i * self.epoch_size) + b_num * self.batch_size
            self.log_tf(tloss, global_step, "_train")
            processed_examples += self.batch_size
            if b_num * self.batch_size % config["log_interval"] == 0:
                iteration_done = datetime.datetime.now()
                iter_total_time = (iteration_done - iteration_beg).total_seconds()
                self.model.eval()
                log(
                    datetime.datetime.now().strftime("%m-%d %H:%M"),
                    iter_total_time,
                    i,
                    b_num + 1,
                    global_step,
                    total_train_loss / total_train_batches,
                    processed_examples,
                )
                iteration_beg = datetime.datetime.now()
                self.model.train()  # Back to train mode
                processed_examples = 0
        done = datetime.datetime.now()
        epoch_time = (done - start).total_seconds()
        # TODO: decide dynamically whether the loss should be averaged or summed;
        # this will break if the loss reduction is changed between mean and sum.
        # Might be slightly off if the last batch is not dropped and its size != batch_size.
        total_train_loss /= total_train_batches
        self.logger.info("-" * 50)
        self.model.eval()
        log(
            done.strftime("%m-%d %H:%M"),
            epoch_time,
            i,
            b_num + 1,
            global_step,
            total_train_loss,
            total_train_batches * self.batch_size,  # examples processed this epoch
            prefix="END OF EPOCH | ",
        )
        self.logger.info("-" * 50)
        self.model.train()

    self.logger.info("==" * 50)
    self.logger.info(" " * 20 + "End of Training")
    self.logger.info("==" * 50)

    if test_batches is not None:
        self.model.eval()  # Set model to eval mode
        test_val = self._evaluate_batches(test_batches)
        test_loss_str = ",".join(
            str(v) for v in test_val
        )  # TODO: move formatting into the evaluator
        results["test"] = {"test_loss": test_loss_str}
        self.logger.info("Test metrics: {}".format(test_loss_str))

    # Save config
    with open(self.out_path, "w") as f:
        f.write(U.safe_serialize(config))
    config["results"] = results
    return config
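# Hypothetical end-to-end usage (a sketch; `Trainer` as the class name and the
# constructor argument are assumptions based on the attributes used above):
#
#     trainer = Trainer(config)             # wires up model, processor, evaluator
#     final_config = trainer.train_model()  # trains on the default splits
#     sample = trainer.processor.get_sample(indx=0, size=8, split=1, raw=True)
#     preds = trainer.predict(sample)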