def test(model, dataloader, params):
    """Evaluate `model` on the test split and return a populated Metrics object.

    Runs one pass over the test iterator under no_grad, accumulating token-level
    predictions into `metrics` and the mean batch loss into a running average
    (stored on `metrics.loss`).
    """
    val_data = tqdm(
        dataloader.data_iterator(data_type='test', batch_size=params.batch_size),
        total=(dataloader.size()[0] // params.batch_size))
    metrics = Metrics()
    loss_avg = RunningAverage()
    # FIX: eval() is loop-invariant -- the original re-set it on every batch.
    model.eval()
    with torch.no_grad():
        for data, labels in val_data:
            data = torch.tensor(data, dtype=torch.long).to(params.device)
            labels = torch.tensor(labels, dtype=torch.long).to(params.device)
            # Token id 0 is treated as padding -- TODO confirm tokenizer pad id.
            batch_masks = data != 0
            loss, logits = model(data, attention_mask=batch_masks, labels=labels)
            predicted = logits.max(2)[1]  # argmax over the tag dimension
            metrics.update(batch_pred=predicted.cpu().numpy(),
                           batch_true=labels.cpu().numpy(),
                           batch_mask=batch_masks.cpu().numpy())
            # torch.mean() handles the multi-GPU case where loss is a vector.
            loss_avg.update(torch.mean(loss).item())
            val_data.set_postfix(type='VAL', loss='{:05.3f}'.format(loss_avg()))
    metrics.loss = loss_avg()
    return metrics
def run_train(self, dataset, args):
    """Training loop skeleton: forward each batch through the BERT model,
    average multi-GPU losses, scale for gradient accumulation, and backprop.

    NOTE(review): no optimizer.step()/zero_grad() appears in this block, so
    gradients accumulate without any parameter update -- confirm the update
    happens elsewhere (cf. the sibling run_train that steps every
    `gradient_accumulation_steps` iterations).
    """
    model, tokenizer = self.bert, self.tokenizer
    batch_size = args.batch_size
    model.train()  # enable dropout etc. for training
    train_examples = dataset.train_dataloader
    # Initialize Optimizer
    # NOTE(review): if train_dataloader already yields batches, then
    # len(train_examples) is a batch count and dividing by batch_size again
    # undercounts the total iterations -- verify against init_optimizer.
    num_train_iters = args.epochs * len(
        train_examples) / batch_size / args.gradient_accumulation_steps
    self.init_optimizer(args, num_train_iters)
    train_avg_loss = RunningAverage()
    for epoch in range(args.epochs):
        print('Epoch {}'.format(epoch))
        train_bar = tqdm(
            enumerate(train_examples),
            total=len(train_examples),
            desc="Training",
        )
        for step, batch in train_bar:
            # Move every tensor in the batch dict onto the GPU.
            inputs = {k: v.to('cuda') for k, v in batch.items()}
            loss = model(inputs['input_ids'], inputs['token_type_ids'],
                         labels=inputs['labels'])
            if args.n_gpus > 1:
                # Presumably DataParallel returns one loss per GPU -- average.
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                # Scale so accumulated gradients match a full-batch update.
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            train_avg_loss.update(loss.item())
def validate(model, val_set, params):
    """Run one no-grad pass over `val_set`, returning a Metrics object whose
    `.loss` attribute holds the running mean of the batch losses."""
    loader = DataLoader(val_set, batch_size=params.batch_size,
                        collate_fn=KeyphraseData.collate_fn)
    progress = tqdm(loader, total=(len(val_set) // params.batch_size))
    metrics = Metrics()
    avg_loss = RunningAverage()
    with torch.no_grad():
        model.eval()
        for batch_data, batch_labels, batch_mask in progress:
            batch_data = batch_data.to(params.device)
            batch_labels = batch_labels.to(params.device)
            batch_mask = batch_mask.to(params.device)
            loss, logits = model(batch_data, attention_mask=batch_mask,
                                 labels=batch_labels)
            preds = logits.max(2)[1]
            metrics.update(batch_pred=preds.cpu().numpy(),
                           batch_true=batch_labels.cpu().numpy(),
                           batch_mask=batch_mask.cpu().numpy())
            avg_loss.update(torch.mean(loss).item())
            progress.set_postfix(type='VAL', loss='{:05.3f}'.format(avg_loss()))
    metrics.loss = avg_loss()
    return metrics
def train(model, dataloader, optimizer, scheduler, params):
    """Train `model` for params.epoch_num epochs, validating after each epoch.

    Saves periodic checkpoints every `params.save_freq` epochs (when enabled)
    and a "best" checkpoint whenever validation loss improves.
    """
    print("Starting training...")
    # FIX: use +inf instead of an arbitrary ceiling (100) so the first
    # validation loss is always recorded as the best.
    best_val_loss = float('inf')
    stats = Stats(params.save_dir, params.tag)
    for epoch in range(params.epoch_num):
        loss_avg = RunningAverage()
        train_data = tqdm(
            dataloader.data_iterator(data_type='train',
                                     batch_size=params.batch_size),
            total=(dataloader.size()[0] // params.batch_size))
        optimizer.zero_grad()
        model.zero_grad()
        # FIX: train() is loop-invariant within an epoch -- set it once here
        # (it must be re-set each epoch because validate() switches to eval).
        model.train()
        for data, labels in train_data:
            data = torch.tensor(data, dtype=torch.long).to(params.device)
            labels = torch.tensor(labels, dtype=torch.long).to(params.device)
            # Token id 0 is treated as padding -- TODO confirm tokenizer pad id.
            batch_masks = (data != 0)
            output = model(data, attention_mask=batch_masks, labels=labels)
            loss = torch.mean(output[0])
            loss.backward()
            # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           params.max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            optimizer.zero_grad()
            # update the average loss
            loss_avg.update(loss.item())
            train_data.set_postfix(type='TRAIN', epoch=epoch,
                                   loss='{:05.3f}'.format(loss_avg()))
        metrics = validate(model, dataloader, params)
        print('After {} epochs: F1={}, Loss={}'.format(epoch, metrics.f1(),
                                                       metrics.loss))
        stats.update(metrics, epoch, loss_avg())
        stats.save()
        if epoch % params.save_freq == 0 and params.save_checkpoints:
            save_checkpoint({'epoch': epoch,
                             'state_dict': model.state_dict(),
                             'optim_dict': optimizer.state_dict()},
                            is_best=False, tag=params.tag, epoch=epoch,
                            score=metrics.f1(), checkpoint=params.save_dir)
        if metrics.loss < best_val_loss:
            best_val_loss = metrics.loss
            save_checkpoint({'epoch': epoch,
                             'state_dict': model.state_dict(),
                             'optim_dict': optimizer.state_dict()},
                            is_best=True, tag=params.tag, epoch='generic',
                            score='epic', checkpoint=params.save_dir)
def one_epoch(self, mode, epoch_num):
    """Run a single epoch in either 'train' or 'test' mode.

    Returns the (accuracy, loss) RunningAverage objects for the epoch.
    Raises ValueError for any other mode string.
    """
    if mode not in ['train', 'test']:
        raise ValueError("Unknown value {} for mode".format(mode))
    print("{}ing... epoch: {}".format(mode, epoch_num))
    # Select the dataloader and per-iteration function for this mode.
    if mode == 'train':
        self.model.train()
        loader = self.train_data
        step_fn = self.one_train_iteration
    else:
        self.model.eval()
        loader = self.test_data
        step_fn = self.one_test_iteration
    avg_acc = RunningAverage()
    avg_loss = RunningAverage()
    with tqdm(total=len(loader)) as bar:
        for batch, target in loader:
            if self.train_params['use_gpu']:
                gpu = self.train_params['gpu_id']
                batch = batch.cuda(gpu)
                target = target.cuda(gpu)
            batch, target = Variable(batch), Variable(target)
            batch = batch.float()
            step_loss, step_acc = step_fn(batch, target)
            avg_loss.update(step_loss)
            avg_acc.update(step_acc)
            bar.set_postfix(
                run_param="Epoch{} Loss:{:.2f} Acc:{:.2f}".format(
                    epoch_num, avg_loss(), avg_acc()))
            bar.update()
    return avg_acc, avg_loss
def evaluate():
    """Calculates loss and prediction accuracy given torch dataloader"""
    # Evaluation mode disables dropout.
    md.eval()
    avg_loss = RunningAverage()
    avg_acc = RunningAverage()
    with torch.no_grad():
        bar = tqdm(test_dl, ascii=True, leave=False)
        for inp, target in bar:
            inp = inp.to(device)
            target = target.to(device)
            # Model consumes the transposed batch (seq-first layout).
            out = md(inp.t())
            batch_loss = criterion(out.view(-1), target.float())
            avg_loss.update(batch_loss.item())
            # Binary prediction at the 0.5 threshold.
            pred = out.view(-1) > 0.5
            hits = pred == target.byte()
            avg_acc.update(torch.sum(hits).item() / len(hits))
            bar.set_postfix(loss=f'{avg_loss():05.3f}',
                            acc=f'{avg_acc():05.2f}')
    return avg_loss(), avg_acc()
def val(dataset, model, args, mode):
    """Validation pass for a recurrent model: stream batches, thread the
    (h, c) state across batches, and print average loss/accuracy.

    NOTE(review): `mode` is accepted but never used. `epoch`, `device`,
    `criterion`, and `accuracy` are read from module globals -- confirm they
    are defined at call time. There is also no torch.no_grad() here, so
    autograd graphs are built during validation -- consider wrapping.
    """
    model.eval()  # disable dropout
    loader = DataLoader(dataset, batch_size=args.batch_size)
    dataloader_iter = iter(loader)
    # Initial hidden/cell state sized by the fixed sequence length.
    state_h, state_c = model.init_state(args.sequence_length)
    loss_avg = RunningAverage()
    acc_avg = RunningAverage()
    while True:
        try:
            X, y = next(dataloader_iter)
        except RuntimeError:
            # Best-effort: skip batches that fail to load.
            continue
        except StopIteration:
            break
        # State is carried over between batches (stateful evaluation).
        y_pred, (state_h, state_c) = model(X.to(device),
                                           (state_h.to(device),
                                            state_c.to(device)))
        loss = criterion(y_pred.transpose(1, 2), y.long().to(device))
        loss_avg.update(loss.item())
        acc = accuracy(y_pred.transpose(1, 2), y.long().to(device))
        acc_avg.update(acc)
    print({
        'epoch': epoch,
        'val_loss': '{:05.4f}'.format(loss_avg()),
        'accuracy': '{:05.3f}'.format(acc_avg())
    })
def train():
    """One training epoch over `train_dl` with gradient clipping and optional
    pruning; returns (avg loss, avg accuracy, sparsity)."""
    # Training mode enables dropout.
    md.train()
    avg_loss = RunningAverage()
    avg_acc = RunningAverage()
    avg_prec = RunningAverage()    # created but not updated (tracking disabled)
    avg_recall = RunningAverage()  # created but not updated (tracking disabled)
    sparsity = 0.0
    info = {
        'loss': None,
        'acc': None,
    }
    bar = tqdm(train_dl, ascii=True, leave=False)
    for inp, target in bar:
        inp = inp.to(device)
        target = target.to(device)
        # Forward/backward on the transposed batch (seq-first layout).
        md.zero_grad()
        out = md(inp.t())
        batch_loss = criterion(out.view(-1), target.float())
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(md.parameters(), args.clip)
        optimizer.step()
        if args.prune:
            pruner.step()
        # Running statistics.
        avg_loss.update(batch_loss.item())
        pred = out.view(-1) > 0.5
        hits = pred == target.byte()
        avg_acc.update(torch.sum(hits).item() / len(hits))
        info['loss'] = f'{avg_loss():05.3f}'
        info['acc'] = f'{avg_acc():05.2f}'
        if args.prune:
            sparsity = pruner.log()
            info['spar'] = f'{sparsity:.2f}'
        bar.set_postfix(**info)
    return avg_loss(), avg_acc(), sparsity
def train_one_epoch(model, datagen, loss_fn, optimizer):
    """Run a single training epoch over paired-image batches and return the
    running average of the batch losses."""
    model.train()
    running = RunningAverage()
    with tqdm(total=len(datagen)) as bar:
        for left, right, targets in datagen:
            left = left.to(DEVICE)
            right = right.to(DEVICE)
            targets = targets.to(DEVICE)
            optimizer.zero_grad()
            predictions = model(left, right)
            batch_loss = loss_fn(predictions, targets)
            batch_loss.backward()
            optimizer.step()
            bar.set_postfix(loss=batch_loss.cpu().item())
            bar.update()
            running.update(batch_loss.cpu().item())
    return running()
def evaluate():
    """Calculates loss and prediction accuracy given torch dataloader"""
    # Evaluation mode disables dropout.
    md.eval()
    avg_loss = RunningAverage()
    avg_acc = RunningAverage()
    # Confusion counts aggregated over the whole test set.
    T_P = 0
    F_P = 0
    T_N = 0
    F_N = 0
    F__P = 0
    with torch.no_grad():
        bar = tqdm(test_dl, ascii=True, leave=False)
        for inp, target in bar:
            inp = inp.to(device)
            target = target.to(device)
            out = md(inp.t())
            # Loss on the flattened logits.
            batch_loss = criterion(out.view(-1), target.float())
            avg_loss.update(batch_loss.item())
            # Accuracy at the 0.5 decision threshold.
            pred = out.view(-1) > 0.5
            hits = pred == target.byte()
            t_p, f_p, t_n, f_n, f__p = confusion(pred, target.byte())
            T_P += t_p
            F_P += f_p
            T_N += t_n
            F_N += f_n
            F__P += f__p
            avg_acc.update(torch.sum(hits).item() / len(hits))
            bar.set_postfix(loss=f'{avg_loss():05.3f}',
                            acc=f'{avg_acc():05.2f}')
    # Guard against zero denominators when no positives were predicted/found.
    avg_prec = 0.0 if T_P == 0 else T_P / (T_P + F_P)
    avg_recall = 0.0 if T_P == 0 else T_P / (T_P + F_N)
    if avg_prec + avg_recall == 0.0:
        f1_score = 0.0
    else:
        f1_score = 2 * (avg_prec * avg_recall) / (avg_prec + avg_recall)
    return avg_loss(), avg_acc(), avg_prec, avg_recall, f1_score
def main():
    """Entry point: seed RNGs for reproducibility, build loaders/model, and
    run the train/eval loop, checkpointing on improved validation accuracy."""
    args = get_args()
    wandb.init()
    wandb.config.update(args)
    seed = 42
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # BUG FIX: the original set `torch.backends.deterministic`, which is not a
    # real flag (it just creates an unused attribute). The cuDNN determinism
    # switch is torch.backends.cudnn.deterministic.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    loaded_model = False
    [train_loader, valid_loader, model, optimizer] = initialize(args, loaded_model)
    scaler = torch.cuda.amp.GradScaler()
    wandb.watch(model)
    best_acc = 0
    run_avg = RunningAverage()
    for epoch in range(1, args.epochs_number + 1):
        # Reset per-epoch running statistics.
        run_avg.reset_train()
        run_avg.reset_val()
        train(args, model, train_loader, epoch, optimizer, scaler, run_avg)
        val_acc = evaluation(args, model, valid_loader, epoch, run_avg)
        if best_acc < val_acc:
            best_acc = val_acc
            save_checkpoint(model, optimizer, args, epoch)
def train(self):
    """Seq2seq training loop: optimize encoder/decoder with Adam, log the loss
    to TensorBoard, and checkpoint whenever the epoch's running loss average
    improves on the best seen so far."""
    set_logger(os.path.join(self.log_dir, 'train.log'), terminal=False)
    epochs = self.hps.num_epochs
    print_every = self.hps.print_every
    log_every = self.hps.log_summary_every
    lr = self.hps.learning_rate
    loss_avg = RunningAverage()
    summary_writer = SummaryWriter(log_dir=self.summ_dir)
    # FIX: +inf instead of the 1e3 sentinel, so the first epoch is always
    # recorded as the new best even with a pathologically large loss.
    current_best_loss = float('inf')
    encoder_optimizer = optim.Adam(self.encoder.parameters(), lr=lr)
    decoder_optimizer = optim.Adam(self.decoder.parameters(), lr=lr)
    training_pairs = self.dl
    # FIX: `reduce=False` is deprecated in torch.nn losses; reduction='none'
    # is the supported equivalent (per-element losses, no reduction).
    criterion = nn.NLLLoss(reduction='none')
    if self.hps.resume:
        log('- load ckpts...')
        self.load_state_dict()
    for epoch in trange(epochs, desc='epochs'):
        loss_avg.reset()
        with tqdm(total=len(training_pairs)) as progress_bar:
            for language_pair, mask_pair in training_pairs:
                language_pair = language_pair.to(self.device)
                mask_pair = mask_pair.to(self.device)
                loss = self.train_single(language_pair, mask_pair,
                                         encoder_optimizer, decoder_optimizer,
                                         criterion)
                loss_avg.update(loss.item())
                self.global_step += 1
                if self.global_step % log_every == 0:
                    summary_writer.add_scalar('loss_value', loss,
                                              global_step=self.global_step)
                if self.global_step % print_every == 0:
                    log('global step: {}, loss average: {:.3f}'.format(
                        self.global_step, loss_avg()))
                progress_bar.set_postfix(loss_avg=loss_avg())
                progress_bar.update()
        if loss_avg() < current_best_loss:
            log('new best loss average found, saving modules...')
            current_best_loss = loss_avg()
            state = {
                'encoder': self.encoder.state_dict(),
                'decoder': self.decoder.state_dict(),
                'global_step': self.global_step,
                'epoch': epoch,
                'loss_avg': loss_avg()
            }
            torch.save(state, os.path.join(self.ckpt_dir, 'best.pth.tar'))
def run_train(self, dataset, ontology, args):
    """Train the BERT state-tracking model on examples built from dialogue
    turns, with optional random oversampling of positives, gradient
    accumulation, and dev/test evaluation + best-model saving per epoch.
    """
    model, tokenizer = self.bert, self.tokenizer
    batch_size = args.batch_size
    self.train()
    # Generate training examples: one flat list over all turns.
    turns = list(dataset['train'].iter_turns())
    train_examples = [
        turn_to_examples(t, ontology, tokenizer) for t in turns
    ]
    train_examples = list(itertools.chain.from_iterable(train_examples))
    print('Generated training examples')
    # Random Oversampling
    # Note that: Most of the constructed examples are negative
    # (example[-1] is the label: 0 = negative, 1 = positive).
    if args.random_oversampling:
        negative_examples, positive_examples = [], []
        for example in train_examples:
            if example[-1] == 0:
                negative_examples.append(example)
            if example[-1] == 1:
                positive_examples.append(example)
        nb_negatives, nb_positives = len(negative_examples), len(
            positive_examples)
        # Resample positives (with replacement) to 1/8 of the negative count.
        sampled_positive_examples = random.choices(positive_examples,
                                                   k=int(nb_negatives / 8))
        train_examples = sampled_positive_examples + negative_examples
        print('Did Random Oversampling')
        print('Number of positive examples increased from {} to {}'.format(
            nb_positives, len(sampled_positive_examples)))
    # Initialize Optimizer (total updates across all epochs).
    num_train_iters = args.epochs * len(
        train_examples) / batch_size / args.gradient_accumulation_steps
    self.init_optimizer(args, num_train_iters)
    # Main training loop
    iterations = 0
    best_dev_joint_goal = 0.0
    train_avg_loss = RunningAverage()
    for epoch in range(args.epochs):
        print('Epoch {}'.format(epoch))
        random.shuffle(train_examples)
        pbar = tqdm(range(0, len(train_examples), batch_size))
        for i in pbar:
            iterations += 1
            # Next training batch
            batch = train_examples[i:i + batch_size]
            _, _, input_ids, token_type_ids, labels = list(zip(*batch))
            # Padding and Convert to Torch Tensors
            input_ids, input_masks = pad(input_ids, args.device)
            token_type_ids = pad(token_type_ids, args.device)[0]
            labels = torch.LongTensor(labels).to(args.device)
            # Calculate loss
            loss = model(input_ids, token_type_ids, input_masks, labels=labels)
            if args.n_gpus > 1:
                loss = loss.mean()  # average per-GPU losses
            if args.gradient_accumulation_steps > 1:
                # Scale so accumulated gradients match a full-batch update.
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            train_avg_loss.update(loss.item())
            # Update pbar
            pbar.update(1)
            pbar.set_postfix_str(f'Train Loss: {train_avg_loss()}')
            # parameters update: step only on accumulation boundaries.
            if iterations % args.gradient_accumulation_steps == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()
        # Evaluate on the dev set and the test set
        dev_results = self.run_dev(dataset, ontology, args)
        test_results = self.run_test(dataset, ontology, args)
        print('Evaluations after epoch {}'.format(epoch))
        print(dev_results)
        print(test_results)
        # Save whenever the dev joint-goal metric improves.
        if dev_results['joint_goal'] > best_dev_joint_goal:
            best_dev_joint_goal = dev_results['joint_goal']
            self.save(args.output_dir)
            print('Saved the model')
def validate(args, model, test_loader, criterion_ueff, epoch, epochs, device='cpu'):
    """One validation pass for monocular depth estimation.

    Computes the dense loss per batch, upsamples predictions to ground-truth
    resolution, applies range clamping plus the Garg/Eigen evaluation crops,
    and returns (aggregated error-metric dict, running loss tracker).
    """
    with torch.no_grad():
        val_si = RunningAverage()
        # val_bins = RunningAverage()
        metrics = utils.RunningAverageDict()
        # Only rank 0 shows a progress bar (distributed runs).
        for batch in tqdm(test_loader, desc=f"Epoch: {epoch + 1}/{epochs}. Loop: Validation"
                          ) if is_rank_zero(args) else test_loader:
            img = batch['image'].to(device)
            depth = batch['depth'].to(device)
            # Skip samples flagged as lacking usable ground truth.
            if 'has_valid_depth' in batch:
                if not batch['has_valid_depth']:
                    continue
            depth = depth.squeeze().unsqueeze(0).unsqueeze(0)  # -> (1, 1, H, W)
            bins, pred = model(img)
            mask = depth > args.min_depth
            l_dense = criterion_ueff(pred, depth, mask=mask.to(torch.bool),
                                     interpolate=True)
            # NOTE(review): RunningAverage elsewhere exposes .update(); confirm
            # this class really provides .append().
            val_si.append(l_dense.item())
            # Upsample prediction to ground-truth resolution for evaluation.
            pred = nn.functional.interpolate(pred, depth.shape[-2:],
                                             mode='bilinear',
                                             align_corners=True)
            pred = pred.squeeze().cpu().numpy()
            # Clamp predictions into the eval range; sanitize inf/nan values.
            pred[pred < args.min_depth_eval] = args.min_depth_eval
            pred[pred > args.max_depth_eval] = args.max_depth_eval
            pred[np.isinf(pred)] = args.max_depth_eval
            pred[np.isnan(pred)] = args.min_depth_eval
            gt_depth = depth.squeeze().cpu().numpy()
            valid_mask = np.logical_and(gt_depth > args.min_depth_eval,
                                        gt_depth < args.max_depth_eval)
            # Standard evaluation crops (fractions of image height/width).
            if args.garg_crop or args.eigen_crop:
                gt_height, gt_width = gt_depth.shape
                eval_mask = np.zeros(valid_mask.shape)
                if args.garg_crop:
                    eval_mask[int(0.40810811 * gt_height):int(0.99189189 * gt_height),
                              int(0.03594771 * gt_width):int(0.96405229 * gt_width)] = 1
                elif args.eigen_crop:
                    if args.dataset == 'kitti':
                        eval_mask[int(0.3324324 * gt_height):int(0.91351351 * gt_height),
                                  int(0.0359477 * gt_width):int(0.96405229 * gt_width)] = 1
                    else:
                        # Fixed pixel crop for the non-KITTI dataset.
                        eval_mask[45:471, 41:601] = 1
                valid_mask = np.logical_and(valid_mask, eval_mask)
            metrics.update(
                utils.compute_errors(gt_depth[valid_mask], pred[valid_mask]))
        return metrics.get_value(), val_si
def race(self, driver):
    """
    Let a driver race in a preconfigured quickrace.

    :param driver: a driver object that generates actions based on sensors
    :return: driver fitness value after race
    """
    # Python 2 code: exchanges UDP packets with a TORCS server.
    if not self.connect():
        raise IOError("could not connect to TORCS")
    start_time = timeit.default_timer()
    try:
        print "Start racing..."
        s = None                    # last parsed SensorModel (None until first packet)
        lap_times = []
        cur_lap_time = -10.0        # sentinel below any real curLapTime
        timeout_reached = False
        recovery_lock = 0           # countdown of forced-recovery steps remaining
        max_speed = 0.0
        avg_speed = RunningAverage()
        driver.prepare()
        while True:
            data = self.sock.recv(2048)
            if data.strip().startswith("("):
                # Sensor packet: parse it and ask the driver for an action.
                s = SensorModel(string=data)
                action = driver.get_action(sensors=s)
                # save maximum speed for fitness function
                max_speed = max(max_speed, s['speedX'])
                avg_speed.add_value(float(s['speedX']))
                # AUTORECOVERY: if off track, go backwards until back on track and then some more
                if self.auto_recover and (s.is_off_track() or recovery_lock > 0):
                    action.gear = -1
                    action.accel = 0.4
                    action.clutch = 0.0
                    action.steering = s['angle'] / -2.0
                    if s.is_off_track():
                        recovery_lock = RECOVERY_LOCK
                    else:
                        recovery_lock -= 1
                self.sock.sendto(str(action), self.server_address)
                # curLapTime resets to ~0 when a lap completes: a drop below
                # the previous value marks a finished lap.
                if s['curLapTime'][0] < cur_lap_time:
                    lap_times.append(cur_lap_time)
                    print "lap %i: %0.2f seconds" % (len(lap_times), cur_lap_time)
                cur_lap_time = s['curLapTime'][0]
            else:
                if data.startswith("***shutdown***"):
                    # Record the final (partial) lap if it ran long enough.
                    if s['curLapTime'][0] > 1:
                        lap_times.append(s['curLapTime'][0])
                    print "--- END OF RACE --- finished at position %i, avg/max speed: %0.2f/%0.2f km/h" % (
                        int(s['racePos']), avg_speed.avg, max_speed)
                    break
            # NOTE(review): this compares s['curLapTime'] directly to a number,
            # unlike the [0] indexing used above -- confirm intended.
            if self.timeout is not None and s['curLapTime'] > self.timeout:
                print "--- RACE TIMEOUT REACHED ---"
                timeout_reached = True
                break
        if s is not None:
            print "lap times:", lap_times
            # print "distance raced:", s['distRaced']
            return driver.compute_fitness(last_sensor=s, lap_times=lap_times,
                                          max_speed=max_speed,
                                          average_speed=avg_speed.avg,
                                          timeout_reached=timeout_reached)
        else:
            return 0.0
    except KeyboardInterrupt:
        print "Exit client"
    except Exception as e:
        print "Client Error:", e
    finally:
        #print "race call took %0.1f seconds." % (timeit.default_timer() - start_time)
        self.close()
def test_running_average():
    """Unit test for RunningAverage: train/val accumulators and reset."""
    running_average = RunningAverage()
    # Train-side averages over three updates, then one more.
    for loss, acc in zip([1, 0.5, 0.3], [0.3, 0.5, 0.1]):
        running_average.update_train_loss_avg(loss, 1)
        running_average.update_train_acc_avg(acc, 1)
    assert running_average.train_loss_run_avg == 0.6
    assert running_average.train_acc_run_avg == 0.3
    running_average.update_train_loss_avg(0.2, 1)
    running_average.update_train_acc_avg(0.1, 1)
    assert running_average.train_loss_run_avg == 0.5
    assert running_average.train_acc_run_avg == 0.25
    # Val-side averages over three updates, then one more.
    for loss, acc in zip([1, 0.7, 1.3], [0.3, 0.4, 8.3]):
        running_average.update_val_loss_avg(loss, 1)
        running_average.update_val_acc_avg(acc, 1)
    assert running_average.val_loss_run_avg == 1
    assert running_average.val_acc_run_avg == 3
    running_average.update_val_loss_avg(3, 1)
    running_average.update_val_acc_avg(7, 1)
    assert running_average.val_loss_run_avg == 1.5
    assert running_average.val_acc_run_avg == 4
    # After reset, every accumulator field is zeroed.
    running_average.reset_train()
    running_average.reset_val()
    for attr in ('sum_train_loss', 'sum_train_acc', 'train_loss_counter',
                 'train_acc_counter', 'train_loss_run_avg', 'train_acc_run_avg',
                 'sum_val_loss', 'sum_val_acc', 'val_loss_counter',
                 'val_acc_counter', 'val_loss_run_avg', 'val_acc_run_avg'):
        assert getattr(running_average, attr) == 0
def main(args):
    """Train the day/night baseline model, validate each epoch, and save the
    model weights whenever validation accuracy improves."""
    if args.use_gpu == 1:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_num
        print("Using GPU", args.gpu_num)
    else:
        print("Not using GPU")
    train_dir = "train"
    val_dir = "val"
    long_dtype, float_dtype = get_dtypes(args)
    print("Initializing train dataset")
    train_dset, train_loader = data_loader(args.dataset_folder, train_dir,
                                           args.batch_size)
    print("Initializing val dataset")
    val_dset, val_loader = data_loader(args.dataset_folder, val_dir,
                                       args.batch_size)
    print("Training for %d" % args.num_epochs)
    print("Arguments", args.__dict__)
    model = PredictFromNightBaseline()
    model.type(float_dtype)
    print(model)
    # Only optimize parameters that require gradients.
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()
    max_val_acc = 0.0
    for epoch in range(args.num_epochs):
        gc.collect()
        # ---- Train epoch ----
        model.train()
        loss_avg = RunningAverage()
        acc_avg = RunningAverage()
        with tqdm(total=len(train_loader)) as t:
            for i, train_batch in enumerate(train_loader):
                train_batch = [
                    tensor.cuda() if args.use_gpu else tensor
                    for tensor in train_batch
                ]
                X_day, X_night, Y = train_batch
                out = model(X_day, X_night)
                loss = criterion(out, Y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                acc_avg.update_step(calc_accuracy(out, Y), Y.shape[0])
                loss_avg.update_step(loss.item(), Y.shape[0])
                t.set_postfix(loss='{:05.3f}'.format(loss_avg()),
                              acc='{:05.3f}'.format(acc_avg()))
                t.update()
        # ---- Val metrics ----
        model.eval()
        val_loss = RunningAverage()
        val_acc = RunningAverage()
        # FIX: run validation under no_grad -- the original built autograd
        # graphs for every validation batch, wasting time and memory.
        with torch.no_grad():
            for i, val_batch in enumerate(val_loader):
                val_batch = [
                    tensor.cuda() if args.use_gpu else tensor
                    for tensor in val_batch
                ]
                X_day, X_night, Y = val_batch
                out = model(X_day, X_night)
                loss = criterion(out, Y)
                val_loss.update_step(loss.item(), Y.shape[0])
                val_acc.update_step(calc_accuracy(out, Y), Y.shape[0])
        metrics_string = "Loss: {:05.3f} ; Acc: {:05.3f}".format(loss_avg(),
                                                                 acc_avg())
        val_metrics = "Loss: {:05.3f} ; Acc: {:05.3f}".format(val_loss(),
                                                              val_acc())
        print("Epoch [%d/%d] - Train -" % (epoch + 1, args.num_epochs),
              metrics_string, "- Val -", val_metrics)
        if val_acc() > max_val_acc and args.save_model_weights:
            # BUG FIX: the original never updated max_val_acc, so it saved a
            # checkpoint on every epoch with accuracy > 0, not just on
            # improvement.
            max_val_acc = val_acc()
            # NOTE(review): `model_path` is not defined in this function --
            # presumably a module-level global; confirm.
            torch.save(model.state_dict(), os.path.join(model_path, str(epoch)))
def train(config_name, gene_variant=None):
    """Train a BasicCorefModel according to `config_name`.

    Evaluates on the dev set after every epoch; whenever the dev score
    improves, also evaluates on the test set and saves the model.

    :param config_name: key into the project's config registry
    :param gene_variant: required (non-None) when the config enables gene
        features; stored into the config dict
    """
    # Prepare tokenizer, dataset, and model
    configs = get_configs(config_name, verbose=False)
    if configs['use_gene_features']:
        # FIX: idiomatic identity check (was: assert(not gene_variant is None)).
        assert gene_variant is not None
        configs['gene_variant'] = gene_variant
    tokenizer = BertTokenizer.from_pretrained(configs['transformer'],
                                              do_basic_tokenize=False)
    train_set, dev_set, test_set = load_oneie_dataset(
        configs['base_dataset_path'], tokenizer)
    model = BasicCorefModel(configs)
    # Initialize the optimizer with linear warmup over 10% of total steps.
    num_train_docs = len(train_set)
    epoch_steps = int(math.ceil(num_train_docs / configs['batch_size']))
    num_train_steps = int(epoch_steps * configs['epochs'])
    num_warmup_steps = int(num_train_steps * 0.1)
    optimizer = model.get_optimizer(num_warmup_steps, num_train_steps)
    print('Initialized optimizer')
    # Main training loop: one document per forward pass, gradients accumulated
    # over `batch_size` documents before each optimizer step.
    best_dev_score, iters, batch_loss = 0.0, 0, 0
    for epoch in range(configs['epochs']):
        print('\n')
        progress = tqdm.tqdm(total=epoch_steps, ncols=80,
                             desc='Train {}'.format(epoch))
        accumulated_loss = RunningAverage()
        train_indices = list(range(num_train_docs))
        random.shuffle(train_indices)
        for train_idx in train_indices:
            iters += 1
            inst = train_set[train_idx]
            iter_loss = model(inst, is_training=True)[0]
            # Normalize so the accumulated gradient matches a batch update.
            iter_loss /= configs['batch_size']
            iter_loss.backward()
            batch_loss += iter_loss.data.item()
            if iters % configs['batch_size'] == 0:
                accumulated_loss.update(batch_loss)
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               configs['max_grad_norm'])
                optimizer.step()
                optimizer.zero_grad()
                batch_loss = 0
                # Update progress bar
                progress.update(1)
                progress.set_postfix_str(
                    'Average Train Loss: {}'.format(accumulated_loss()))
        progress.close()
        # Evaluation after each epoch
        print('Evaluation on the dev set', flush=True)
        dev_score = evaluate(model, dev_set, configs)['avg']
        # Save model if it has better dev score
        if dev_score > best_dev_score:
            best_dev_score = dev_score
            # Evaluation on the test set
            print('Evaluation on the test set', flush=True)
            evaluate(model, test_set, configs)
            # Save the model
            save_path = os.path.join(configs['saved_path'], 'model.pt')
            torch.save({'model_state_dict': model.state_dict()}, save_path)
            print('Saved the model', flush=True)
val_loader = data.DataLoader(Dataset(fnms=val_fnms, argumentation=None), batch_size=BATCH_SIZE, num_workers=4) model = bbox_model() model.to(device) loss_fn = torch.nn.MSELoss(reduction='mean') optimizer = torch.optim.Adam(model.parameters(), lr=LR) OnPlateau = 0 EarlyStopping = 0 best_loss = float('inf') for i in range(EPOCHS): log.info('epoch {}'.format(i)) model.train() loss_trn_avg = RunningAverage() with tqdm(total=len(trn_loader)) as t: for imgs, labels in trn_loader: imgs, labels = imgs.to(device), labels.to(device) out = model(imgs) loss = loss_fn(out, labels) optimizer.zero_grad() loss.backward() optimizer.step() t.set_postfix(loss=loss.cpu().item()) t.update() loss_trn_avg.update(loss.cpu().item()) model.eval() loss_val_avg = RunningAverage()