def __validate_one_epoch(self, valid_bundle_list, valid_dt_list, valid_tasks, weighted_instance_loss):
    """Run one validation pass over every dataset in valid_dt_list and check early stopping.

    Puts the classifier in eval mode, accumulates per-task loss/labels via
    __process_loss, averages the loss, computes per-task F1, and — when
    early stopping is enabled and the model carries no hooks — asks each
    task's learning_state whether training should stop.

    :param valid_bundle_list: validation data bundles; if None the whole pass is skipped
    :param valid_dt_list: list of validation datasets, batched one at a time
    :param valid_tasks: dict of task objects; each value accumulates loss/lbl_true/lbl_pred
    :param weighted_instance_loss: forwarded to __process_loss
    :return: the (name, task) item that triggered early stopping, or None
    """
    stopping_valid_task = None
    if valid_bundle_list is not None:
        # NOTE(review): eval() is set here but never restored to train();
        # presumably the training loop re-enables it — confirm.
        self.bert_classifier.eval()
        for cur_task in valid_tasks.items():
            cur_task[1].reset()
        for dt_ind, cur_dt in enumerate(valid_dt_list):
            # sequential mode, no shuffle / no drop-last for validation
            batches = self.generate_batches([cur_dt], self.config, False, False,
                                            self.current_train_epoch, EInputListMode.sequential)
            for ba_ind, cur_batch in enumerate(batches):
                # NOTE(review): forward pass is not wrapped in torch.no_grad();
                # the second argument (False) may already disable grad — verify.
                outcome = self.bert_classifier(cur_batch, False)
                self.__process_loss(outcome, cur_batch, valid_tasks, False, weighted_instance_loss)
                # free GPU memory before the next batch
                self.delete_batch_from_gpu(cur_batch, EInputListMode.sequential)
                del cur_batch, outcome
        # average the accumulated loss and compute F1 per task.
        # size > 0 guard matches __train_one_epoch and avoids a
        # ZeroDivisionError for a task that saw no validation instances.
        for cur_task in valid_tasks.items():
            if cur_task[1].size > 0:
                cur_task[1].loss /= cur_task[1].size
                cur_task[1].f1 = ELib.calculate_f1(cur_task[1].lbl_true, cur_task[1].lbl_pred)
        ################ checks early stopping only if the model does not have hooks
        ## deepcopy() cannot copy hooks! fix it later...
        if self.config.check_early_stopping and len(self.bert_classifier.logs) == 0:
            for cur_task in valid_tasks.items():
                if cur_task[1].learning_state.should_stop(
                        cur_task[1].loss, self.bert_classifier, self.config.device):
                    # move the rejected model off the GPU and roll back to the best snapshot
                    self.bert_classifier.cpu()
                    self.bert_classifier = cur_task[1].learning_state.best_model
                    stopping_valid_task = cur_task
                    break
    return stopping_valid_task
def __train_one_epoch(self, train_dt_list, train_tasks, input_mode, weighted_instance_loss,
                      report_number_of_intervals, train_shuffle, train_drop_last,
                      balance_batch_mode_list):
    """Run one training pass over train_dt_list, with optional multi-model loss syncing.

    For each batch: forward pass, loss accumulation via __process_loss, progress
    printing, then GPU cleanup. Afterwards, if several models share a sync_obj,
    they rendezvous before the per-task loss averages and F1 are computed.

    :param train_dt_list: training datasets handed to generate_batches
    :param train_tasks: dict of task objects accumulating loss/lbl_true/lbl_pred
    :param input_mode: batch layout mode, forwarded to batching and GPU cleanup
    :param weighted_instance_loss: forwarded to __process_loss
    :param report_number_of_intervals: how many progress marks to print per epoch
    :param train_shuffle: shuffle flag for generate_batches
    :param train_drop_last: drop-last flag for generate_batches
    :param balance_batch_mode_list: per-dataset batch-balancing modes
    """
    batches = self.generate_batches(train_dt_list, self.config, train_shuffle, train_drop_last,
                                    self.current_train_epoch, input_mode, balance_batch_mode_list)
    # reset each task's accumulators before the epoch starts
    [cur_task[1].reset() for cur_task in train_tasks.items()]
    for ba_ind, cur_batch in enumerate(batches):
        self.bert_classifier.train_step += 1  # to track the overall number inside the classifier
        # NOTE(review): loops until self.delay_optimizer is cleared — presumably
        # another thread (via the sync_obj machinery below) resets it; confirm
        # it cannot spin forever when delay_optimizer stays True.
        while True:
            outcome = self.bert_classifier(cur_batch, False)
            self.__process_loss(outcome, cur_batch, train_tasks, True, weighted_instance_loss)
            if not self.delay_optimizer:
                break
        # periodic progress printout (percent marks on one line)
        if ELib.progress_made(ba_ind, cur_batch['batch_count'], report_number_of_intervals):
            print(ELib.progress_percent(ba_ind, cur_batch['batch_count']), end=' ', flush=True)
        # free GPU memory held by the batch before moving on
        self.delete_batch_from_gpu(cur_batch, input_mode)
        del cur_batch, outcome
    ## in case there are multiple models and their losses are heavy (in terms of memory)
    ## you can call 'self.sync_obj.lock_loss_calculation.acquire()' in 'self.custom_train_loss_func()'
    ## This way the losses are calculated one by one and after that the models are re-synched
    if self.sync_obj is not None and self.sync_obj.lock_loss_calculation.locked():
        ## wait for the other models to arrive
        # first model to arrive after a full rendezvous resets the shared counter
        if self.sync_obj.sync_counter == self.sync_obj.model_count:
            self.sync_obj.reset()
        self.sync_obj.sync_counter += 1
        self.sync_obj.lock_loss_calculation.release()
        # busy-wait until every model has checked in
        while self.sync_obj.sync_counter < self.sync_obj.model_count:
            self.sleep()
        # pprint(vars(self))
        # ELib.PASS()
    ## if there are multiple models avoid double printing the newline
    if self.sync_obj is None:
        print()
    elif self.model_id == 0:
        print()
    ## calculate the metric averages in the epoch
    for cur_task in train_tasks.items():
        if cur_task[1].size > 0:  # skip tasks that received no instances
            cur_task[1].loss /= cur_task[1].size
            cur_task[1].f1 = ELib.calculate_f1(cur_task[1].lbl_true, cur_task[1].lbl_pred)
    ELib.PASS()