def __init__(
    self,
    log_interval=10,
    lr=1e-5,
    use_cuda=False,
    verbose=0,
    log_tensorboard=False,
    path="rnd_model/",
):
    self.predictor = predictor_generator()
    self.target = target_generator()
    for param in self.target.parameters():
        param.requires_grad = False
    self.target.eval()
    self.log_interval = log_interval
    self.optimizer = torch.optim.Adam(self.predictor.parameters(), lr=lr)
    self.loss_function = torch.nn.MSELoss(reduction='mean')
    self.device = torch.device('cuda' if use_cuda else 'cpu')
    self.target.to(self.device)
    self.predictor.to(self.device)
    self.running_stats = RunningMeanStd()
    self.verbose = verbose
    self.writer = SummaryWriter() if log_tensorboard else None
    self.n_iter = 0
    self.save_path = path
    Path(path).mkdir(parents=True, exist_ok=True)
    self.early_stopping = EarlyStopping(save_dir=self.save_path)
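Every snippet in this section constructs an EarlyStopping helper that is not defined here, and each project passes it slightly different arguments (patience, verbose, delta, path, save_dir, model_file, trace_func, ...). For reference, a minimal sketch of the behaviour these call sites assume (compare the new validation loss against the best seen so far, checkpoint the model on improvement, and set early_stop after `patience` calls without improvement) could look like the following; the exact constructor signature in any given project may differ.

import numpy as np
import torch


class EarlyStopping:
    """Minimal sketch: stop when validation loss has not improved for `patience` calls."""

    def __init__(self, patience=7, verbose=False, delta=0.0, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.path = path
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.inf

    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None or score > self.best_score + self.delta:
            # improvement: remember the score and checkpoint the model
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        # save the model state when validation loss improves
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss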
def train(xtrain, ytrain, xvalid, yvalid, hidden_d, layers, dropout, learning_rate, n_epoch, pic_name, batch_size, device): def setup_seed(seed): torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) np.random.seed(seed) random.seed(seed) torch.backends.cudnn.deterministic = True def plot_loss(train_loss, valid_loss): plt.figure(figsize=(20, 10)) plt.plot(train_loss, 'b', label='train_loss') plt.plot(valid_loss, 'r', label='valid_loss') plt.legend() # plt.show() plt.savefig(RESULT_SAVE_PATH + pic_name + '.jpg') train_dataset = TensorDataset(xtrain, ytrain) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) setup_seed(0) model = Model(input_dim=xtrain.shape[-1], hidden_dim=hidden_d, n_layer=layers, drop_out=dropout).to(device) criterion = torch.nn.L1Loss() optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, eps=1e-4) early_stopping = EarlyStopping(patience=50, verbose=True) train_loss = [] valid_loss = [] for epoch in range(n_epoch): train_loss_tmp = 0 for step, (batch_x, batch_y) in enumerate(train_loader): prediction = model(batch_x) loss = criterion(prediction, batch_y) optimizer.zero_grad() loss.backward() optimizer.step() train_loss_tmp += loss.data train_loss.append(train_loss_tmp / (step + 1)) model.eval() valid_output = model(xvalid) valid_loss_data = criterion(valid_output, yvalid) scheduler.step(valid_loss_data) valid_loss.append(valid_loss_data.data) print('EPOCH: %d, TRAINING LOSS: %f, VALIDATION LOSS: %f' % (epoch, train_loss_tmp / (step + 1), valid_loss_data)) early_stopping(valid_loss_data, model) if early_stopping.early_stop: print('Early stopped.') break model.train() plot_loss(train_loss, valid_loss) model.load_state_dict(torch.load('checkpoint.pt')) return model
def model_train(model, data, optimizer, criterion, epochs=NUM_EPOCHS, patience=20):
    model.train()
    model_type = 'pointwise'
    scores = []
    train_losses = []
    valid_losses = []
    valid_indexes = []
    ndcg_list = []
    X = torch.tensor(data.train.feature_matrix, dtype=torch.float, requires_grad=False)  # gets input
    Y = torch.tensor(data.train.label_vector, requires_grad=False)  # gets correct output
    validation_data = torch.tensor(data.validation.feature_matrix, dtype=torch.float, requires_grad=False)  # validation input
    validation_target = torch.tensor(data.validation.label_vector, requires_grad=False)  # validation correct output

    # initialize the early_stopping object
    early_stopping = EarlyStopping(model_type, patience=patience, verbose=True, delta=0.0001)

    for epoch in tqdm(range(epochs), position=0, leave=True):
        optimizer.zero_grad()  # set gradients to zero
        y_pred = model(X)  # predict labels
        loss = criterion(y_pred, Y)  # calculate loss
        loss.backward()  # backpropagate loss
        optimizer.step()  # update weights
        train_losses.append(loss.item())  # append loss to list to plot

        print("validation ndcg at epoch " + str(epoch))
        model.eval()
        validation_y_pred = model(validation_data)
        validation_scores, validation_indexes = softmax_highest_score(validation_y_pred)
        scores.append(validation_scores)
        # calculate the loss
        valid_loss = criterion(validation_y_pred, validation_target)
        # record validation loss
        valid_losses.append(valid_loss.item())
        valid_indexes.append(validation_indexes)
        results = eval.evaluate(data.validation, validation_scores, print_results=False)
        ndcg_list.append(results['ndcg']['mean'])
        # print('ndcg: ', results["ndcg"])

        print("Epoch {} - train loss: {} - validation loss: {}".format(epoch, loss.item(), valid_loss.item()))  # print loss

        if epoch % 5 == 0:
            # print performance of model on validation data
            epoch_len = len(str(epochs))
            print_msg = (f'[{epoch:>{epoch_len}}/{epochs:>{epoch_len}}] ' +
                         f'train_loss: {loss.item():.5f} ' +
                         f'valid_loss: {valid_loss.item():.5f}')
            print(print_msg)

        # early_stopping checks if validation loss has decreased
        early_stopping(valid_loss.item(), model)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    # load the last checkpoint with the best model
    model.load_state_dict(torch.load('models/{}_checkpoint.pt'.format(model_type)))
    return model, optimizer, scores, train_losses, valid_losses, ndcg_list, validation_indexes
def main():
    # train (test_train) -----------
    # print('[INFO] start training ')
    # train_losses, eval_losses, eval_r2s=[], [], []
    train_losses, eval_losses = [], []
    early_stopping = EarlyStopping(patience=PATIENCE, verbose=True)
    for epoch in range(NUM_EPOCH):
        print('[INFO] start training ')
        model.train()  # enable batch normalization and dropout
        train_loss = 0.0
        # step_loss=0.0
        for step, (_, train_tongue, train_label) in enumerate(train_loader):
            train_tongue, train_label = Variable(train_tongue).cuda(), Variable(train_label).cuda()
            optimizer.zero_grad()  # reset gradients to zero
            output = model(train_tongue)
            loss = loss_func(output, train_label)
            loss.backward()  # backpropagation
            optimizer.step()  # update parameters
            train_loss += float(loss.item() * train_tongue.size(0))
            # print('Epoch:[%d/%d], Step:[%d/%d], Step loss: %.4f' % (epoch + 1, NUM_EPOCH, step + 1, len(train_datasets) // BATCH_SIZE, loss.item()))
            if step % 100 == 99:
                print('Epoch:[%d/%d], Step:[%d/%d], Step loss: %.4f' % (epoch + 1, NUM_EPOCH, step + 1, len(train_datasets) // BATCH_SIZE, loss.item()))
                # print('Epoch:[%d/%d], Step:[%d/%d], Average step loss:%.4f' % (epoch + 1, NUM_EPOCH, step + 1, len(train_datasets) // BATCH_SIZE, step_loss/50))
        train_losses.append(train_loss / len(train_datasets))
        print('=====> Epoch:', epoch + 1, ' | Average epoch train loss: %.4f' % (train_loss / len(train_datasets)))
        adjust_lr(optimizer, epoch)

        # eval -----------
        print('[INFO] start evaluation')
        model.eval()  # disable batch normalization and dropout
        with torch.no_grad():
            # eval_loss,eval_r2 = 0.0, 0.0
            eval_loss = 0.0
            for step, (_, test_tongue, test_label) in enumerate(eval_loader):
                test_tongue, test_label = Variable(test_tongue).cuda(), Variable(test_label).cuda()
                output = model(test_tongue)
                loss = loss_func(output, test_label)
                eval_loss += float(loss.item() * test_tongue.size(0))
            eval_losses.append(eval_loss / len(eval_datasets))
            print('=====> Epoch:', epoch + 1, ' | Average epoch eval loss: %.4f ' % (eval_loss / len(eval_datasets)))
            # print('=====> Epoch:',epoch+1, ' | Average epoch test loss:%.4f ' % (eval_loss/len(test_datasets)), '| average r2 :%.4f ' % (eval_r2/len(test_datasets)))
        print('[INFO] evaluation complete')

        # early_stopping(train_loss/len(train_datasets),model)
        early_stopping(eval_loss / len(eval_datasets), model)
        if early_stopping.early_stop:
            print('[INFO] early stop')
            break
    return train_losses, eval_losses
def fit_siamese(train_loader, val_loader, model, loss_fn, optimizer, scheduler, patience, n_epochs, cuda,
                log_interval, metrics=[], start_epoch=0):
    """
    Loaders, model, loss function and metrics should work together for a given task,
    i.e. the model should be able to process data output of loaders,
    loss function should process target output of loaders and outputs from the model

    Examples: Classification: batch loader, classification model, NLL loss, accuracy metric
    Siamese network: Siamese loader, siamese model, contrastive loss
    Online triplet learning: batch loader, embedding model, online triplet loss
    """
    for epoch in range(0, start_epoch):
        scheduler.step()

    early_stopping = EarlyStopping(patience=patience, verbose=True)

    for epoch in range(start_epoch, n_epochs):
        scheduler.step()

        # Train stage
        train_loss, metrics = train_siamese(train_loader, model, loss_fn, optimizer, cuda, log_interval, metrics)
        Parameters.epoch += 1
        message = 'Epoch: {}/{}. Train set: Average loss: {:.4f}'.format(epoch + 1, n_epochs, train_loss)
        for metric in metrics:
            message += '\t{}: {}'.format(metric.name(), metric.value())

        val_loss, metrics = test_siamese(val_loader, model, loss_fn, cuda, metrics)
        val_loss /= len(val_loader)
        early_stopping(val_loss, model)
        for param in model.parameters():
            print(param.data)
        if early_stopping.early_stop:
            print("Early stopping")
            break

        message += '\nEpoch: {}/{}. Validation set: Average loss: {:.4f}'.format(epoch + 1, n_epochs, val_loss)
        for metric in metrics:
            message += '\t{}: {}'.format(metric.name(), metric.value())

        print(message)
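The docstring above stresses that the loader, the model, and the loss function have to be shape-compatible for the chosen task. Below is a small, self-contained sketch of the siamese case it mentions, with a toy embedding network, a standard contrastive loss, and one hand-made batch of input pairs; every class and tensor here is illustrative rather than taken from the original project.

import torch
import torch.nn as nn
import torch.nn.functional as F


class ToyEmbeddingNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(16, 8)

    def forward(self, x):
        return self.fc(x)


class ToySiameseNet(nn.Module):
    """Applies the same embedding network to both inputs of a pair."""

    def __init__(self, embedding_net):
        super().__init__()
        self.embedding_net = embedding_net

    def forward(self, x1, x2):
        return self.embedding_net(x1), self.embedding_net(x2)


def contrastive_loss(emb1, emb2, target, margin=1.0):
    # target is 1 for similar pairs and 0 for dissimilar pairs
    dist = F.pairwise_distance(emb1, emb2)
    return (target * dist.pow(2) + (1 - target) * F.relu(margin - dist).pow(2)).mean()


# One toy batch, shaped the way a siamese loader would yield it: a pair of inputs plus a target.
x1, x2 = torch.randn(4, 16), torch.randn(4, 16)
target = torch.tensor([1., 0., 1., 0.])
model = ToySiameseNet(ToyEmbeddingNet())
emb1, emb2 = model(x1, x2)
loss = contrastive_loss(emb1, emb2, target)
loss.backward()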
def train(data, mine_net, mine_net_optim, resp=0, cond=1, batch_size=100, iter_num=int(1e+4), log_freq=int(1e+3), avg_freq=int(1e+2), verbose=True, patience=20): # data is x or y result = list() ma_et = 1. #Early Stopping train_losses = [] valid_losses = [] avg_train_losses = [] avg_valid_losses = [] earlyStop = EarlyStopping(patience=patience, verbose=True) trainData, validData = create_dataset(data, batch_size) for i in range(iter_num): #get train data batchTrain = sample_batch(trainData, resp, cond, batch_size=batch_size) mi_lb, ma_et = learn_mine(batchTrain, mine_net, mine_net_optim, ma_et) result.append(mi_lb.detach().cpu().numpy()) train_losses.append(result[-1].item()) if verbose and (i + 1) % (log_freq) == 0: print(result[-1]) batchValid = sample_batch(validData, resp, cond, batch_size=batch_size) mi_lb_valid = valid_mine(batchValid, mine_net) valid_losses.append(mi_lb_valid.item()) if (i + 1) % (avg_freq) == 0: train_loss = np.average(train_losses) valid_loss = np.average(valid_losses) avg_train_losses.append(train_loss) avg_valid_losses.append(valid_loss) print_msg = "[{0}/{1}] train_loss: {2} valid_loss: {3}".format( i, iter_num, train_loss, valid_loss) print(print_msg) train_losses = [] valid_losses = [] earlyStop(valid_loss, mine_net) if (earlyStop.early_stop): print("Early stopping") break mine_net.load_state_dict(torch.load('checkpoint.pt')) return mine_net, avg_train_losses, avg_valid_losses
def train(model, stock_ids): optimizer = torch.optim.Adam(model.parameters(), lr=lr) criterion = torch.nn.MSELoss() scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=patience, verbose=verbose, cooldown=1, min_lr=min_lr, eps=min_lr) earlyStop = EarlyStopping(model_name, models_folder, patience=10) X_train, X_test, y_train, y_test = load_train_data(stock_ids) pbar = tqdm(range(0, max_epoch)) clean_models(model_name, models_folder) for epoch in pbar: optimizer.zero_grad() model.train() # forward + backward + optimize steps = y_train.shape[1] // len(predict_columns) dataset = BasicDataset(X_train, y_train) dataloader = DataLoader(dataset, batch_size=10240, shuffle=True, num_workers=0) total_train_loss = [] for _, items in enumerate(dataloader): train_outputs = model(items[0], steps) train_loss = criterion(train_outputs, items[1]) train_loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 5, norm_type=2) optimizer.step() total_train_loss.append(train_loss) train_loss = torch.mean(torch.stack(total_train_loss)) with torch.no_grad(): model.eval() outputs = model(X_test, steps) validate_loss = criterion(outputs, y_test) if epoch % 100 == 99: earlyStop(validate_loss, model) if earlyStop.early_stop: break scheduler.step(train_loss) pbar.set_description("{0:.6f}, {1:.6f}".format(train_loss, validate_loss)) return model
def update_model(self, HL_replay_buffer, logger): early_stopper = EarlyStopping(patience=7) split = 10.0 state_norm = utils.normalization(HL_replay_buffer.obses, self.all_mean_var[0], self.all_mean_var[1]) action_norm = utils.normalization(HL_replay_buffer.actions, self.all_mean_var[2], self.all_mean_var[3]) delta_state_norm = utils.normalization(HL_replay_buffer.next_obses, self.all_mean_var[4], self.all_mean_var[5]) train_capacity = int(HL_replay_buffer.capacity * (split - 1) / split) test_idxs = np.arange(-int(HL_replay_buffer.capacity / split), 0) state_test = torch.as_tensor(state_norm[test_idxs], device=self.device).float() action_test = torch.as_tensor(action_norm[test_idxs], device=self.device).float() delta_state_test = torch.as_tensor(delta_state_norm[test_idxs], device=self.device).float() for i in range(self.model_update_steps): self.update_step += 1 idxs = np.random.randint(0, train_capacity, size=self.batch_size) # idxs = np.random.randint(0, 1100, size=self.batch_size) state = torch.as_tensor(state_norm[idxs], device=self.device).float() action = torch.as_tensor(action_norm[idxs], device=self.device).float() delta_state = torch.as_tensor(delta_state_norm[idxs], device=self.device).float() pred_delta_state = self.forward_model(state, action) model_loss = F.mse_loss(pred_delta_state, delta_state) self.model_optimizer.zero_grad() model_loss.backward() self.model_optimizer.step() logger.log('train/model_loss', model_loss) logger.dump(self.update_step) if (i + 1) % 100 == 0: pred_delta_state = self.forward_model(state_test, action_test) model_loss = F.mse_loss(pred_delta_state, delta_state_test) logger.log('train/val_loss', model_loss) logger.dump(self.update_step) early_stopper(model_loss) if early_stopper.early_stop: break self.save_data('.')
def train_model(self, last_hidden, last_rnn, train_loader, valid_loader):
    if last_rnn is not None:
        rnn_local = last_rnn
    else:
        rnn_local = GRUNet(len(forecast_strategy.factors_list)).to(self.device)
    optimizer_local = torch.optim.Adam(rnn_local.parameters(), lr=self.LR)  # optimize all cnn parameters
    loss = nn.MSELoss().to(self.device)
    early_stopping = EarlyStopping(patience=20, verbose=True, trace_func=logging.info)
    rnn_local, h_state = self.train_rnn(early_stopping, last_hidden, loss, optimizer_local, rnn_local,
                                        train_loader, valid_loader)
    return early_stopping, h_state, loss, rnn_local
def main():
    train_losses, eval_losses = [], []
    train_lips_losses, train_tongue_losses, train_lipstongue_losses = [], [], []
    early_stopping = EarlyStopping(patience=PATIENCE, verbose=True)
    for epoch in range(NUM_EPOCH):
        print('[INFO] start training ')
        autoencoder.train()
        train_loss, train_lips_loss, train_tongue_loss, train_lipstongue_loss = 0.0, 0.0, 0.0, 0.0
        for step, (train_lips, train_tongue, train_label) in enumerate(train_loader):
            train_lips, train_tongue, train_label = Variable(train_lips).cuda(), Variable(train_tongue).cuda(), Variable(train_label).cuda()
            optimizer.zero_grad()
            output, output_lips, output_tongue = autoencoder(train_lips, train_tongue)
            loss_lips = loss_func1(output_lips, train_lips)
            loss_tongue = loss_func2(output_tongue, train_tongue)
            loss_lipstongue = loss_func3(output, train_label)
            loss = loss_lipstongue + loss_lips + loss_tongue
            loss.backward()
            optimizer.step()
            train_loss += float(loss.item() * train_lips.size(0))
            train_lips_loss += float(loss_lips.item() * train_lips.size(0))
            train_tongue_loss += float(loss_tongue.item() * train_lips.size(0))
            train_lipstongue_loss += float(loss_lipstongue.item() * train_lips.size(0))
            if step % 100 == 99:
                print('Epoch:[%d/%d], Step:[%d/%d], Step loss: %.4f' % (epoch + 1, NUM_EPOCH, step + 1, len(train_datasets) // BATCH_SIZE, loss.item()))
        train_losses.append(train_loss / len(train_datasets))
        train_lips_losses.append(train_lips_loss / len(train_datasets))
        train_tongue_losses.append(train_tongue_loss / len(train_datasets))
        train_lipstongue_losses.append(train_lipstongue_loss / len(train_datasets))
        print('=====> Epoch:', epoch + 1, ' | Average epoch train loss total: %.4f' % (train_loss / len(train_datasets)))

        print('[INFO] start evaluation')
        autoencoder.eval()
        with torch.no_grad():
            eval_loss = 0.0
            for step, (test_lips, test_tongue, test_label) in enumerate(eval_loader):
                test_lips, test_tongue, test_label = Variable(test_lips).cuda(), Variable(test_tongue).cuda(), Variable(test_label).cuda()
                output, output_lips, output_tongue = autoencoder(test_lips, test_tongue)
                loss_lips = loss_func1(output_lips, test_lips)
                loss_tongue = loss_func2(output_tongue, test_tongue)
                loss_lipstongue = loss_func3(output, test_label)
                loss = loss_lipstongue + loss_lips + loss_tongue
                eval_loss += float(loss.item() * test_lips.size(0))
            eval_losses.append(eval_loss / len(eval_datasets))
            print('=====> Epoch:', epoch + 1, ' | Average epoch eval loss: %.4f ' % (eval_loss / len(eval_datasets)))
        print('[INFO] evaluation complete')

        # early_stopping(train_loss/len(train_datasets),autoencoder)
        early_stopping(eval_loss / len(eval_datasets), autoencoder)
        if early_stopping.early_stop:
            print('[INFO] early stop')
            break
    # torch.save(encoder.state_dict(),'./autoencoder.pth')
    return train_losses, eval_losses, train_lips_losses, train_tongue_losses, train_lipstongue_losses
def train(model, optimizer, criterion, lr_scheduler, train_loader, valid_loader, test_loader, config): device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") early_stopping = EarlyStopping(patience=5, verbose=True) valid_losses = [] print( f"Number of mini-batches: {len(train_loader)} for batch_size {BATCH_SIZE}" ) for epoch in range(20): running_loss = 0.0 model.train() for i, data in enumerate(train_loader, 0): inputs, labels = data[0].to(device), data[1].to(device) optimizer.zero_grad() outputs = model(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() running_loss += loss.item() if i % 200 == 0: print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 200)) running_loss = 0.0 test(model, test_loader) if config["use_lr_decay"]: print( f"Decreasing learning rate to {lr_scheduler.get_lr()}, i.e. {config['lr_decay_rate']**(epoch+1)*100}%" ) lr_scheduler.step() torch.save(model.state_dict(), f"model_epoch{epoch}.h5") wandb.save(f"model_epoch{epoch}.h5") model.eval() for data in valid_loader: inputs, labels = data[0].to(device), data[1].to(device) output = model(inputs) loss = criterion(output, labels) valid_losses.append(loss.item()) valid_loss = np.average(valid_losses) early_stopping(valid_loss, model) if early_stopping.early_stop: print("Early stopping") break print('Finished training.') test(model, test_loader, compute_confusion_matrix=True)
def train_model(self): dur = [] valid_losses = [] avg_train_losses = [] avg_valid_losses = [] epoch_train_loss = [] optimizer = torch.optim.Adam(self.parameters(), lr=1e-2) early_stopping = EarlyStopping(patience=self.params["patience"], verbose=True) for epoch in range(self.params["n_epochs"]): if epoch >= 3: t0 = time.time() self.train() logits = self() logp = F.log_softmax(logits, 1) loss = F.nll_loss(logp[self.train_mask], self.labels[self.train_mask]) epoch_train_loss.append(loss.item()) optimizer.zero_grad() loss.backward() optimizer.step() if epoch >= 3: dur.append(time.time() - t0) acc = calculate_accuracy(*self.evaluate()) epoch_train_loss_mean = np.mean(epoch_train_loss) print( "Epoch {:05d} | loss {:.4f} | Test Acc {:.4f} | Time(s) {:.4f}" .format(epoch, loss.item(), acc, np.mean(dur))) with torch.no_grad(): logits = self() logp = F.log_softmax(logits, 1) loss_val = F.nll_loss(logp[self.test_mask], self.labels[self.test_mask]) valid_losses.append(loss_val.item()) valid_loss = np.average(valid_losses) epoch_len = len(str(self.params["n_epochs"])) train_losses = [] valid_losses = [] early_stopping(valid_loss, self) if early_stopping.early_stop: print("Early stopping") break return self, avg_train_losses, avg_valid_losses
def __init__(self, model: nn.Module, train_dataloader: DataLoader, valid_dataloader, optimizer: Optimizer,
             loss: AbstractLoss, early_stopping_patience=7, model_backup_destination="./", resume=False,
             gradient_clipping_value=None):
    self.model: nn.Module = model
    self.train_dataloader: DataLoader = train_dataloader
    self.valid_dataloader: DataLoader = valid_dataloader
    self.optimizer: Optimizer = optimizer
    # Loss used only for benchmarking against other runs, in case the loss function
    # from which backprop is computed changes
    self.benchmark_MSE_loss: AbstractLoss = BatchSegmentMSELoss()
    # Custom loss is used for backpropagating
    self.custom_loss: AbstractLoss = loss
    self.gradient_clipping_value = gradient_clipping_value
    self.model_backup_destination = self._get_backup_destination(model_backup_destination, model,
                                                                 train_dataloader, optimizer, loss)
    self.early_stopper = EarlyStopping(patience=early_stopping_patience, verbose=True,
                                       destination_path=self.model_backup_destination)
    if resume:
        CometLogger.print("Resuming the training of {}".format(self.model_backup_destination))
        CometLogger.print("Overriding the Model and Optimizer's state dictionaries with the checkpoint's dicts")
        self.model.load_state_dict(self.early_stopper.load_model_checkpoint())
        self.optimizer.load_state_dict(self.early_stopper.load_optimizer_checkpoint())
def train(model, stock_id): optimizer = torch.optim.Adam(model.parameters(), lr=lr) criterion = torch.nn.MSELoss() scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=patience, verbose=verbose, cooldown=1, min_lr=min_lr, eps=1e-05) X_train, X_test, y_train, y_test = load_train_data(stock_id) pbar = tqdm(range(0, max_epoch)) earlyStop = EarlyStopping(get_model_name(stock_id), models_folder, patience=4, delta=0.00001) clean_models(get_model_name(stock_id), models_folder) for epoch in pbar: optimizer.zero_grad() # forward + backward + optimize train_outputs = model(X_train) train_loss = criterion(train_outputs, y_train) train_loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 10, norm_type=2) optimizer.step() with torch.no_grad(): outputs = model(X_test) validate_loss = criterion(outputs, y_test) if epoch % 100 == 99: earlyStop(validate_loss, model) if earlyStop.early_stop: break pbar.set_description("{0}:{1:.6f}, {2:.6f}".format( stock_id, train_loss, validate_loss)) scheduler.step(validate_loss) return model
def train_model(args, model, training_data, validation_data): logger.info("training model") optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) loss_history = [] loss_val_history = [] f1_history = [] f1_val_history = [] # initialize the early_stopping object early_stopping = EarlyStopping(verbose=False, patience=args.patience, epsilon=args.epsilon) for i in range(1, args.num_epochs + 1): loss, loss_val, f1, f1_val = train_epoch(i, model, training_data, optimizer, args, validation_data) loss_history.append(loss) loss_val_history.append(loss_val) f1_history.append(f1) f1_val_history.append(f1_val) print("Epoch # %d" % i) print("Loss : Training- %.5f" % loss, ", Validation- %.5f" % loss_val) #print("Validation loss in epoch %d is:" % i, loss_val) print("F1 score : Training- %.5f" % f1, ", Validation- %.5f" % f1_val) #print("Validation f1 in epoch %d is:" % i, f1_val) # early stopping early_stopping(loss_val, model) if early_stopping.early_stop: print("Early stopping") break if args.plot_loss: stats = { 'loss_history': loss_history, 'loss_val_history': loss_val_history } plot_loss(stats) stats = {'f1_history': f1_history, 'f1_val_history': f1_val_history} plot_f1(stats) if args.is_save_model: save_model(args, model)
def multi_train(model, train_loader, validate_loader, train_batch_size=16, validate_batch_size=32, patience=3, epoches=30, lr=1e-5, weight=None): # for storing individual gene mse loss temp_lis = [] # for storing total mse loss during training train_loss_lis = [] # for storing total mse loss during validate validate_loss_lis = [] # for storing individual mse loss during validation val_loss_ind_lis = [] # define early stopping early_stopping = EarlyStopping(patience=patience, verbose=False) for i in range(epoches): train_loss = train(model, train_loader, train_batch_size, lr=lr, weight=weight) train_loss_lis.append(train_loss) val_loss, val_loss_ind = validate(model, validate_loader, validate_batch_size) validate_loss_lis.append(val_loss) val_loss_ind_lis.append(val_loss_ind) early_stopping(val_loss, model) if early_stopping.early_stop: print("Early stopping") break return train_loss_lis, validate_loss_lis, val_loss_ind_lis
def main_worker(epochs, best_model_spec, checkpoints_files, args, device):
    global ARCH_NAME
    ARCH_NAME = best_model_spec["model_name"]
    data_transforms = transforms.Compose([ToTensorRescale()])
    train_dataset = CatDogsDataset(CAT_TRAIN_PATH, DOG_TRAIN_PATH, transform=data_transforms)
    val_dataset = CatDogsDataset(CAT_VAL_PATH, DOG_VAL_PATH, transform=data_transforms)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
    model_name = best_model_spec["model_name"]
    start_epoch = 0
    model, optimizer = build_model(model_name, best_model_spec)
    model.to(device)
    losses_dict = {'train': {}, 'test': {}, 'accuracy': {}}
    criterion = nn.NLLLoss()
    if LOAD_CHECKPOINT:
        model, start_epoch, optimizer, losses_dict = load_checkpoint(model, optimizer, checkpoints_files[-1])
    early_stopping = EarlyStopping(patience=4, verbose=True)
    for e in range(start_epoch, epochs):
        print("{} out of {}".format(e + 1, epochs))
        time.sleep(1)
        model, train_loss = train(train_dataloader, model, criterion, optimizer, epochs, device)
        model, test_loss, test_accuracy = validate(val_dataloader, model, criterion, device)
        current_metrics = [e, train_loss, test_loss, test_accuracy]
        losses_dict["train"][e] = train_loss
        losses_dict["test"][e] = test_loss
        losses_dict["accuracy"][e] = test_accuracy
        # update the stopper with the validation loss; it sets early_stop after `patience` epochs without improvement
        early_stopping(test_loss, model)
        if early_stopping.early_stop:
            break
        if e % 2 == 0:
            checkpoints_files = save_checkpoint(model, optimizer, current_metrics, checkpoints_files, losses_dict)
    return checkpoints_files
def objective(params, epochs=1500): global ITERATION ITERATION += 1 params['loss_weights'] = loss_weights params['num_classes'] = len(loss_weights) params['rnn_hid_size'] = int(params['rnn_hid_size']) data_train, label_count = data_loader.get_loader(filename=datapath, indices=train_idx, batch_size=batch_size) data_val = data_loader.get_loader(filename=datapath, indices=val_idx, batch_size=batch_size) model = myrits.Model() print(params) model.set_params(**params) model = model.cuda() optimizer = optim.Adam(model.parameters(), lr=model.lr) early_stopping = EarlyStopping(patience=20, verbose=True, save_mode=1, runname='run_{}'.format(ITERATION), save_path=args.savepath) start = timer() val_loss = float('Inf') accuracy = 0.0 for epoch in range(1, epochs+1): time_glob = time.time() train(model, data_train, optimizer, epoch) stop_early, val_loss = evaluate(model, data_val, epoch, early_stopping, ITERATION) time_ep = time.time() - time_glob print('Epoch time {}'.format(time_ep)) if stop_early: break run_time = timer() - start of_connection = open(out_file, 'a') writer = csv.writer(of_connection) writer.writerow([val_loss, params, epoch, ITERATION, run_time]) of_connection.close() return {'loss': val_loss, 'params': params, 'iteration': ITERATION, 'train_time': run_time, 'status': STATUS_OK}
def train(self, n_epochs=5): self.change_model() self.model = self.model.to(self.device) train_losses = [] val_losses = [] pth = './gz2hub_checkpoints/gz2hubcheckpoint' early_stopping = EarlyStopping(patience=10, verbose=True, path=pth) print('Training beginning') for epoch in range(n_epochs): train_loss = self.train_phase(tr='train') train_losses.append(train_loss) print("Epoch: {} Train Loss: {}".format(epoch + 1, train_loss)) val_loss = self.train_phase(tr='val') self.scheduler.step(val_loss) val_losses.append(val_loss) print("Epoch: {} Val Loss: {}".format(epoch + 1, val_loss)) early_stopping(val_loss, self.model, epoch) if early_stopping.early_stop: ep = epoch - 10 self.model.load_state_dict( torch.load( './gz2hub_checkpoints/gz2hubcheckpoint{}.pt'.format( ep))) print("Early stopping") break pickle.dump(train_losses, open('./losses/gz2hub_train', 'wb')) pickle.dump(val_losses, open('./losses/gz2hub_val', 'wb')) torch.save(self.model, self.savePath) print('Model saved: ' + self.savePath) plt.plot(train_losses, label='Training loss') plt.plot(val_losses, label='Validation loss') plt.legend(frameon=False) plt.show() print("Training complete") return None
def train(self, n_epochs=5): self.model = self.model.to(self.device) train_losses = [] val_losses = [] pth = './gz2_checkpoints/gz2checkpoint' early_stopping = EarlyStopping(patience=5, verbose=True, path=pth) print("Training beginning") for epoch in range(n_epochs): train_loss = self.train_phase(tr='train') train_losses.append(train_loss) print("[TST] Epoch: {} Train Loss: {}".format( epoch + 1, train_loss)) torch.cuda.empty_cache() val_loss = self.train_phase(tr='val') val_losses.append(val_loss) print("[TST] Epoch: {} Val Loss: {}".format(epoch + 1, val_loss)) early_stopping(val_loss, self.model, epoch) if early_stopping.early_stop: ep = epoch - 10 self.model.load_state_dict( torch.load( './gz2_checkpoints/gz2checkpoint{}.pt'.format(ep))) print("Early stopping") break pickle.dump(train_losses, open('./losses/gz2_train', 'wb')) pickle.dump(val_losses, open('./losses/gz2_val', 'wb')) torch.save(self.model, self.savePath) print('Model saved: ' + self.savePath) plt.plot(train_losses, label='Training loss') plt.plot(val_losses, label='Validation loss') plt.legend(frameon=False) plt.show() print("Training complete, evaluation on unseen set beginning") unseen_loss = self.unseen_phase() print('Unseen loss: {}'.format(unseen_loss)) return None
def validation(epoch):
    global log, best_loss, best_acc
    train_loader, val_loader = tumor_dataset()
    net.eval()
    val_loss = 0
    correct = 0
    total = 0
    # NOTE: the stopper is re-created on every validation call, so its patience counter never
    # accumulates across epochs; it would need to persist outside this function to ever trigger.
    early_stop = EarlyStopping(patience=10, verbose=True)
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(val_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    if best_acc < 100. * correct / total:
        best_acc = 100. * correct / total
        print_log('Update best acc : {:<5.3f}'.format(best_acc), log)
    if (val_loss / (batch_idx + 1)) < best_loss:
        best_loss = (val_loss / (batch_idx + 1))
        print_log('Save best model | Loss : {}| Acc : {}'.format(val_loss / (batch_idx + 1), 100. * correct / total), log)
        torch.save(net, './{}/{}_model.pth'.format(log_folder, args.teacher_backbone))
        torch.save(net.state_dict(), './{}/{}_weight.pth'.format(log_folder, args.teacher_backbone))
    print_log('Teacher Val |Batch_idx:{:<3d}|Val Loss :{:<8.3f}|Val Acc:{:<8.3f}'.format(batch_idx, (val_loss / (batch_idx + 1)), 100. * correct / total), log)
    writer.add_scalar('val/loss', (val_loss / (batch_idx + 1)), epoch)
    writer.add_scalar('val/acc', (100. * correct / total), epoch)
    early_stop(val_loss, net)
    if early_stop.early_stop:
        print_log("Early stop", log)
        writer.close()
        log.close()
def train(self, model_file, pretrain_file, get_loss_CNN, get_loss_Attn_LSTM, evalute_CNN_SSL, pseudo_labeling,evalute_Attn_LSTM,evalute_CNN,evalute_Attn_LSTM_SSL, generating_lexiocn, data_parallel=False): """ Train Loop """ self.model.train() # train mode self.load3(model_file, pretrain_file) self.model2.train() # train mode model = self.model.to(self.device) model2 = self.model2.to(self.device) t = self.kkk if(self.dataName == 'IMDB'): rnn_save_name = "./IMDB_model_save/checkpoint_RNN"+str(t)+".pt" cnn_save_name = "./IMDB_model_save/checkpoint_CNN"+str(t)+".pt" result_name = "./result/result_IMDB.txt" pseudo_name = "./result/pseudo_train_set_IMDB.txt" elif(self.dataName == "AGNews"): rnn_save_name = "./AGNews_model_save/checkpoint_RNN"+str(t)+".pt" cnn_save_name = "./AGNews_model_save/checkpoint_CNN"+str(t)+".pt" result_name = "./result/result_AGNews.txt" pseudo_name = "./result/pseudo_train_set_AGNews.txt" elif(self.dataName == "DBpedia"): rnn_save_name = "./DBpedia_model_save/checkpoint_RNN"+str(t)+".pt" cnn_save_name = "./DBpedia_model_save/checkpoint_CNN"+str(t)+".pt" result_name = "./result/result_DBpedia.txt" pseudo_name = "./result/pseudo_train_set_DBpedia.txt" elif(self.dataName == "yahoo"): rnn_save_name = "./yahoo_model_save/checkpoint_RNN"+str(t)+".pt" cnn_save_name = "./yahoo_model_save/checkpoint_CNN"+str(t)+".pt" result_name = "./result/result_yahoo.txt" pseudo_name = "./result/pseudo_train_set_yahoo.txt" num_a=0 global_step = 0 # global iteration steps regardless of epochs global_step3 = 0 before = -50 curTemp=0 print("self.cfg.n_epochs#:", self.cfg.n_epochs) ddf = open(result_name,'a', encoding='UTF8') ddf.write("############################################"+str(t)+": ramdom_samplimg###########################################"+'\n') ddf.close() ddf = open(pseudo_name,'a', encoding='UTF8') ddf.write("############################################"+str(t)+": ramdom_samplimg###########################################"+'\n') ddf.close() for e in range(self.cfg.n_epochs): if(e==0): temp=987654321 early_stopping = EarlyStopping(patience=10, verbose=True) valid_losses = [] while(1): self.optimizer = optim.optim4GPU(self.cfg, model, len(self.data_iter3_b)) global_step = 0 # global iteration steps regardless of epochs global_step3 = 0 loss_sum = 0. # the sum of iteration losses to get average loss in every epoch iter_bar = tqdm(self.data_iter3_b, desc='Iter (loss=X.XXX)') model.train() for i, batch in enumerate(iter_bar): batch = [t.to(self.device) for t in batch] self.optimizer.zero_grad() loss = get_loss_CNN(model, batch, global_step).mean() # mean() for Data Parallelism loss.backward() self.optimizer.step() global_step += 1 loss_sum += loss.item() iter_bar.set_description('Iter (loss=%5.3f)'%loss.item()) print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1))) model.eval()# evaluation mode loss_sum = 0. 
global_step3 = 0 iter_bar_dev = tqdm(self.dataset_dev_b, desc='Iter (loss=X.XXX)') self.optimizer = optim.optim4GPU(self.cfg, model, len(self.dataset_dev_b)) for i, batch in enumerate(iter_bar_dev): batch = [t.to(self.device) for t in batch] loss = get_loss_CNN(model, batch,global_step3).mean() # mean() for Data Parallelism valid_losses.append(loss.item()) global_step3 += 1 loss_sum += loss.item() iter_bar_dev.set_description('Iter (loss=%5.3f)'%loss.item()) print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1))) valid_loss = np.average(valid_losses) loss_min=early_stopping(valid_loss, model,"./model_save/checkpoint_BERT_real.pt") valid_losses = [] if early_stopping.early_stop: print("Early stopping") break model.load_state_dict(torch.load("./model_save/checkpoint_BERT_real.pt")) print("Early stopping") model.eval()# evaluation mode p=[] l=[] p3=[] p2=[] iter_bar = tqdm(self.data_iter2_b, desc='Iter (f1-score=X.XXX)') for batch in iter_bar: batch = [t.to(self.device) for t in batch] with torch.no_grad(): # evaluation without gradient calculation label_id, y_pred1 = evalute_CNN(model, batch) # accuracy to print softmax = nn.Softmax() y_pred3 = softmax(y_pred1) #print("y_pred3#:", y_pred3) y_pred33, y_pred1 = torch.max(y_pred3, 1) print(y_pred1) p2.append(np.ndarray.flatten(y_pred3[:, 1].data.cpu().numpy())) p.append(np.ndarray.flatten(y_pred1.data.cpu().numpy())) l.append(np.ndarray.flatten(label_id.data.cpu().numpy())) result2 = 0 iter_bar.set_description('Iter(roc=%5.3f)'%result2) p2 = [item for sublist in p2 for item in sublist] p = [item for sublist in p for item in sublist] l = [item for sublist in l for item in sublist] p=np.array(p) l=np.array(l) F1score = f1_score(l,p,average='micro') accur = accuracy_score(l,p) ddf = open(result_name,'a', encoding='UTF8') ddf.write(str(t)+": "+ str(num_a)+"aucr: "+str(accur)+"f1-score: "+str(F1score)+'\n') ddf.close() num_a+=1 temp=987654321 early_stopping = EarlyStopping(patience=30, verbose=True) valid_losses = [] while(1): model2.train() loss_sum = 0 global_step3 = 0 iter_bar3 = tqdm(self.data_iter3, desc='Iter (loss=X.XXX)') for i, batch in enumerate(iter_bar3): batch = [t.to(self.device) for t in batch] loss = get_loss_Attn_LSTM(model2, batch, global_step3).mean() # mean() for Data Parallelism self.optimizer2.zero_grad() loss.backward() self.optimizer2.step() global_step3 += 1 loss_sum += loss.item() iter_bar3.set_description('Iter (loss=%5.3f)'%loss.item()) if global_step3 % self.cfg.save_steps == 0: # save self.save(global_step3) if self.cfg.total_steps and self.cfg.total_steps < global_step3: print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1))) print('The Total Steps have been reached.') self.save(global_step3) # save and finish when global_steps reach total_steps return print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1))) model2.eval() loss_sum = 0. 
global_step3 = 0 iter_bar_dev = tqdm(self.dataset_dev, desc='Iter (loss=X.XXX)') for i, batch in enumerate(iter_bar_dev): batch = [t.to(self.device) for t in batch] loss = get_loss_Attn_LSTM(model2, batch, global_step3).mean() # mean() for Data Parallelism valid_losses.append(loss.item()) global_step3 += 1 loss_sum += loss.item() iter_bar_dev.set_description('Iter (loss=%5.3f)'%loss.item()) if global_step3 % self.cfg.save_steps == 0: # save self.save(global_step3) if self.cfg.total_steps and self.cfg.total_steps < global_step3: print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1))) print('The Total Steps have been reached.') self.save(global_step3) # save and finish when global_steps reach total_steps return print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1))) valid_loss = np.average(valid_losses) loss_min=early_stopping(valid_loss, model2,"./model_save/checkpoint_LSTM_real.pt") valid_losses = [] if early_stopping.early_stop: print("Early stopping") break model2.eval() p=[] l=[] p3=[] iter_bar4 = tqdm(self.data_iter2, desc='Iter (f1-score=X.XXX)') global_step3=0 for batch in iter_bar4: batch = [t.to(self.device) for t in batch] with torch.no_grad(): # evaluation without gradient calculation label_id, y_pred1 = evalute_Attn_LSTM(model2, batch, global_step3,len(iter_bar4))# accuracy to print _, y_pred3 = y_pred1.max(1) global_step3+=1 p2=[] l2=[] for i in range(0,len(y_pred3)): p3.append(np.ndarray.flatten(y_pred3[i].data.cpu().numpy())) l.append(np.ndarray.flatten(label_id[i].data.cpu().numpy())) p2.append(np.ndarray.flatten(y_pred3[i].data.cpu().numpy())) l2.append(np.ndarray.flatten(label_id[i].data.cpu().numpy())) p2 = [item for sublist in p2 for item in sublist] l2 = [item for sublist in l2 for item in sublist] result2 = f1_score(l2, p2,average='micro') iter_bar4.set_description('Iter(roc=%5.3f)'%result2) p3 = [item for sublist in p3 for item in sublist] l = [item for sublist in l for item in sublist] p=np.array(p) l=np.array(l) results2 = accuracy_score(l, p3) F1score = f1_score(l,p3,average='micro') ddf = open(result_name,'a', encoding='UTF8') ddf.write(str(t)+": "+str(num_a)+"aucr: "+str(results2)+"f1-score: "+str(F1score)+'\n') ddf.close() num_a+=1 elif(e%2==1): global_step1 = 0 model2.eval() labell=[] iter_bar = tqdm(self.data_iter, desc='Iter (loss=X.XXX)') for batch in iter_bar: batch = [t.to(self.device) for t in batch] with torch.no_grad(): # evaluation without gradient calculation label_id, y_pred1 = generating_lexiocn(model2, batch,global_step1,len(iter_bar),e) # accuracy to print global_step1+=1 global_step1 = 0 model.eval() labell=[] iter_bar = tqdm(self.data_iter_b, desc='Iter (loss=X.XXX)') for batch in iter_bar: batch = [t.to(self.device) for t in batch] with torch.no_grad(): # evaluation without gradient calculation label_id, y_pred1 = evalute_CNN_SSL(model, batch,global_step1) # accuracy to print global_step1+=1 global_step1 = 0 model2.eval() sen = [] labell=[] iter_bar = tqdm(self.data_iter, desc='Iter (loss=X.XXX)') for batch in iter_bar: batch = [t.to(self.device) for t in batch] with torch.no_grad(): # evaluation without gradient calculation label_id, y_pred1,result_label,result3,data_temp, data_temp_b, data_iter_temp_na, data_iter_temp_na_b = pseudo_labeling( model2,batch,global_step1,len(iter_bar),e) # accuracy to print global_step1+=1 self.data_iter_temp = data_temp self.data_iter_temp_b = data_temp_b self.data_iter = data_iter_temp_na self.data_iter_b = data_iter_temp_na_b #print(result3) num_good=0 
num_label=0 num_label1=0 ddf = open(pseudo_name,'a', encoding='UTF8') for i in range(0, len(result3)): sen.append(result3[i]) num_label=0 num_label1=0 num_good = 0 for i in range(0, len(result3)): if(result3[i] != -1): num_good +=1 if(result3[i] == result_label[i]): num_label+=1 ddf.write(str(t)+" " +"number of good :"+str(num_good)+" ") ddf.write("number of label :"+str(num_label)+" ") ddf.write("\n") ddf.close() print("num_good#:", num_good) print("before#:", before) if(num_good < self.stopNum): curTemp+=1 else: curTemp=0 if(curTemp>=2): break elif(e%2==0 ): self.model.train() # train mode self.load3(model_file, pretrain_file) model = self.model.to(self.device) b=0 early_stopping = EarlyStopping(patience=1, verbose=True) valid_losses = [] bb=987654321 while(1): self.optimizer = optim.optim4GPU(self.cfg, model, len(self.data_iter_temp_b)) iter_bar = tqdm(self.data_iter_temp_b, desc='Iter (loss=X.XXX)') model.train() global_step = 0 global_step3 = 0 valid_losses2 = [] for i, batch in enumerate(iter_bar): batch = [t.to(self.device) for t in batch] self.optimizer.zero_grad() loss = get_loss_CNN(model, batch, global_step).mean() # mean() for Data Parallelism valid_losses2.append(loss.item()) loss.backward() self.optimizer.step() global_step += 1 loss_sum += loss.item() iter_bar.set_description('Iter (loss=%5.3f)'%loss.item()) print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1))) valid_loss2 = np.average(valid_losses2) bb= min(bb, valid_loss2.item()) valid_losses2 = [] model.eval()# evaluation mode loss_sum = 0. global_step3 = 0 iter_bar_dev = tqdm(self.dataset_dev_b, desc='Iter (loss=X.XXX)') self.optimizer = optim.optim4GPU(self.cfg, model, len(self.dataset_dev_b)) for i, batch in enumerate(iter_bar_dev): batch = [t.to(self.device) for t in batch] loss = get_loss_CNN(model, batch,global_step3).mean() # mean() for Data Parallelism valid_losses.append(loss.item()) global_step3 += 1 loss_sum += loss.item() iter_bar_dev.set_description('Iter (loss=%5.3f)'%loss.item()) print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1))) valid_loss = np.average(valid_losses) loss_min=early_stopping(valid_loss, model,cnn_save_name) valid_losses = [] if early_stopping.early_stop: print("Early stopping") break model.load_state_dict(torch.load(cnn_save_name)) model.eval()# evaluation mode self.model.eval()# evaluation mode p=[] l=[] p3=[] p2=[] iter_bar = tqdm(self.data_iter2_b, desc='Iter (f1-score=X.XXX)') for batch in iter_bar: batch = [t.to(self.device) for t in batch] with torch.no_grad(): # evaluation without gradient calculation label_id, y_pred1 = evalute_CNN(model, batch) # accuracy to print softmax = nn.Softmax() y_pred3 = softmax(y_pred1) y_pred33, y_pred1 = torch.max(y_pred3, 1) p2.append(np.ndarray.flatten(y_pred3[:, 1].data.cpu().numpy())) p.append(np.ndarray.flatten(y_pred1.data.cpu().numpy())) l.append(np.ndarray.flatten(label_id.data.cpu().numpy())) result2 = 0 iter_bar.set_description('Iter(roc=%5.3f)'%result2) p2 = [item for sublist in p2 for item in sublist] p = [item for sublist in p for item in sublist] l = [item for sublist in l for item in sublist] p=np.array(p) l=np.array(l) F1score = f1_score(l,p,average='micro') accur = accuracy_score(l,p) ddf = open(result_name,'a', encoding='UTF8') ddf.write(str(t)+": "+str(num_a)+"aucr: "+str(accur)+"f1-score: "+str(F1score)+'\n') ddf.close() num_a+=1 valid_losses = [] temp = 987654321 early_stopping = EarlyStopping(patience=10, verbose=True) while(1): model2.train() l=0 l_sum=0 loss_sum = 0 
global_step3 = 0 iter_bar3 = tqdm(self.data_iter_temp, desc='Iter (loss=X.XXX)') for i, batch in enumerate(iter_bar3): batch = [t.to(self.device) for t in batch] loss = get_loss_Attn_LSTM(model2, batch, global_step3).mean() # mean() for Data Parallelism self.optimizer2.zero_grad() loss.backward() self.optimizer2.step() global_step3 += 1 loss_sum += loss.item() iter_bar3.set_description('Iter (loss=%5.3f)'%loss.item()) print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1))) model2.eval() loss_sum = 0. global_step3 = 0 iter_bar_dev = tqdm(self.dataset_dev, desc='Iter (loss=X.XXX)') for i, batch in enumerate(iter_bar_dev): batch = [t.to(self.device) for t in batch] loss = get_loss_Attn_LSTM(model2, batch, global_step3).mean() # mean() for Data Parallelism valid_losses.append(loss.item()) global_step3 += 1 loss_sum += loss.item() iter_bar_dev.set_description('Iter (loss=%5.3f)'%loss.item()) print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1))) valid_loss = np.average(valid_losses) loss_min=early_stopping(valid_loss, model2,rnn_save_name) valid_losses = [] if early_stopping.early_stop: print("Early stopping") break model2.load_state_dict(torch.load(rnn_save_name)) model2.eval() p=[] l=[] p3=[] iter_bar4 = tqdm(self.data_iter2, desc='Iter (f1-score=X.XXX)') for batch in iter_bar4: batch = [t.to(self.device) for t in batch] with torch.no_grad(): label_id, y_pred1 = evalute_Attn_LSTM_SSL(model2, batch) _, y_pred3 = y_pred1.max(1) p2=[] l2=[] for i in range(0,len(y_pred3)): p3.append(np.ndarray.flatten(y_pred3[i].data.cpu().numpy())) l.append(np.ndarray.flatten(label_id[i].data.cpu().numpy())) p2.append(np.ndarray.flatten(y_pred3[i].data.cpu().numpy())) l2.append(np.ndarray.flatten(label_id[i].data.cpu().numpy())) p2 = [item for sublist in p2 for item in sublist] l2 = [item for sublist in l2 for item in sublist] result2 = f1_score(l2, p2,average='micro') iter_bar4.set_description('Iter(roc=%5.3f)'%result2) p3 = [item for sublist in p3 for item in sublist] l = [item for sublist in l for item in sublist] p=np.array(p) l=np.array(l) results2 = accuracy_score(l, p3) F1score = f1_score(l,p3,average='micro') ddf = open(result_name,'a', encoding='UTF8') ddf.write(str(t)+": "+str(num_a)+"aucr: "+str(results2)+"f1-score: "+str(F1score)+'\n') ddf.close() num_a+=1
print_step = 250
# save_steps = print_step
if not eval_model:
    write_train_para(writer, config)
    logger.info('------Training START--------')
    running_avg_loss, running_avg_rl_loss = 0, 0
    sum_total_reward = 0
    step = 0
    step = load_step + step
    start_ep = int(load_step / save_steps)
    # initialize the early_stopping object
    early_stopping = EarlyStopping(config, logger, vocab, loggerName, patience=3, verbose=True)
    try:
        for epoch in range((start_ep + 1), config.max_epochs + 1):
            for batch in train_loader:
                step += 1
                loss_st = time.time()
                inner_c, package = get_package(batch)
                if inner_c:
                    continue
                parallel_model.module.train()
                mle_loss, pred_probs = train_one(package)
                if config.train_rl:
                    rl_loss, batch_reward = train_one_rl(package, batch)
                if step % print_step == 0:
def train_free(): # Scale and initialize the parameters best_prec1 = 0 configs.TRAIN.epochs = int( math.ceil(configs.TRAIN.epochs / configs.ADV.n_repeats)) configs.ADV.fgsm_step /= configs.DATA.max_color_value configs.ADV.clip_eps /= configs.DATA.max_color_value # Create output folder if not os.path.isdir(os.path.join('trained_models', configs.output_name)): os.makedirs(os.path.join('trained_models', configs.output_name)) # Log the config details logger.info(pad_str(' ARGUMENTS ')) for k, v in configs.items(): print('{}: {}'.format(k, v)) logger.info(pad_str('')) # Create the model if configs.pretrained: print("=> using pre-trained model '{}'".format(configs.TRAIN.arch)) model = models.__dict__[configs.TRAIN.arch](pretrained=True) else: print("=> creating model '{}'".format(configs.TRAIN.arch)) model = models.__dict__[configs.TRAIN.arch]() # Wrap the model into DataParallel model = torch.nn.DataParallel(model).cuda() # Criterion: criterion = nn.CrossEntropyLoss().cuda() # Optimizer: optimizer = torch.optim.SGD(model.parameters(), configs.TRAIN.lr, momentum=configs.TRAIN.momentum, weight_decay=configs.TRAIN.weight_decay) # Resume if a valid checkpoint path is provided if configs.resume: if os.path.isfile(configs.resume): print("=> loading checkpoint '{}'".format(configs.resume)) checkpoint = torch.load(configs.resume) configs.TRAIN.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( configs.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(configs.resume)) # setup data loader transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize(configs.DATA.cifar10_mean, configs.DATA.cifar10_std) ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize(configs.DATA.cifar10_mean, configs.DATA.cifar10_std) ]) train_dataset = torchvision.datasets.CIFAR10(root='../data', train=True, download=True, transform=transform_train) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=configs.DATA.batch_size, shuffle=True, num_workers=configs.DATA.workers, pin_memory=True, sampler=None) testset = torchvision.datasets.CIFAR10(root='../data', train=False, download=True, transform=transform_test) val_loader = torch.utils.data.DataLoader( testset, batch_size=configs.DATA.batch_size, shuffle=False, num_workers=configs.DATA.workers, pin_memory=True) # If in evaluate mode: perform validation on PGD attacks as well as clean samples if configs.evaluate: logger.info(pad_str(' Performing PGD Attacks ')) for pgd_param in configs.ADV.pgd_attack: validate_pgd(val_loader, model, criterion, pgd_param[0], pgd_param[1], configs, logger) validate(val_loader, model, criterion, configs, logger) return early_stopping = EarlyStopping(patience=15, verbose=True) for epoch in range(configs.TRAIN.start_epoch, configs.TRAIN.epochs): adjust_learning_rate(configs.TRAIN.lr, optimizer, epoch, configs.ADV.n_repeats) # train for one epoch do_train_free(train_loader, model, criterion, optimizer, epoch) # evaluate on validation set prec1, stopped, early_stopping = validate(val_loader, model, criterion, configs, logger, early_stopping) # remember best prec@1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'arch': configs.TRAIN.arch, 
'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best, os.path.join('trained_models', configs.output_name)) if (stopped): break # Automatically perform PGD Attacks at the end of training logger.info(pad_str(' Performing PGD Attacks ')) for pgd_param in configs.ADV.pgd_attack: validate_pgd(val_loader, model, criterion, pgd_param[0], pgd_param[1], configs, logger)
experiment.log_parameters(parameters)
experiment.set_name(opts.exp_name)
models_path = "models/"
use_gpu = parameters['use_gpu']
mapping_file = 'models/mapping.pkl'
name = parameters['name']
model_name = models_path + name  # get_name(parameters)
if not os.path.exists(models_path):
    os.makedirs(models_path)
early_stopping = EarlyStopping(patience=20, verbose=True, path=model_name)
lower = parameters['lower']
zeros = parameters['zeros']
tag_scheme = parameters['tag_scheme']
train_sentences = loader.load_sentences(opts.train, lower, zeros)
dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
test_sentences = loader.load_sentences(opts.test, lower, zeros)
test_train_sentences = loader.load_sentences(opts.test_train, lower, zeros)
update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)
update_tag_scheme(test_train_sentences, tag_scheme)
def train_model(model, criterion, optimizer, scheduler, num_epochs=25): global_info = [] since = time.time() best_model_wts = copy.deepcopy(model.state_dict()) best_acc = 0.0 early_stopping = EarlyStopping(patience=25, verbose=True) for epoch in range(num_epochs): local_info = [] print('Epoch {}/{}'.format(epoch, num_epochs - 1)) print('-' * 10) # Each epoch has a training and validation phase for phase in ['train', 'val']: if phase == 'train': model.train() # Set model to training mode else: model.eval() # Set model to evaluate mode if epoch >0: scheduler.step(val_loss) running_loss = 0.0 running_corrects = 0 # Iterate over data. for inputs, labels in dataloaders[phase]: inputs = inputs.to(device) labels = labels.to(device) # zero the parameter gradients optimizer.zero_grad() # forward # track history if only in train with torch.set_grad_enabled(phase == 'train'): outputs = model(inputs) _, preds = torch.max(outputs, 1) loss = criterion(outputs, labels) # backward + optimize only if in training phase if phase == 'train': loss.backward() optimizer.step() # statistics running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) epoch_loss = running_loss / dataset_sizes[phase] if phase == 'val': val_loss = running_loss / dataset_sizes['val'] epoch_acc = running_corrects.double() / dataset_sizes[phase] #(Variable(x).data).cpu().numpy() if phase == 'train': local_info.append(epoch_loss) ea = epoch_acc.cpu().numpy() local_info.append(ea) else: local_info.append(epoch_loss) ea = epoch_acc.cpu().numpy() local_info.append(ea) print('{} Loss: {:.4f} Acc: {:.4f}'.format( phase, epoch_loss, epoch_acc)) # deep copy the model if phase == 'val' and epoch_acc > best_acc: best_acc = epoch_acc best_model_wts = copy.deepcopy(model.state_dict()) lr_get = get_lr(optimizer) print("Current learning rate : {:.8f}".format(lr_get)) global_info.append(local_info) if phase =='val': early_stopping(epoch_loss, model) if early_stopping.early_stop: print("Early stopping") break time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60)) print('Best val Acc: {:4f}'.format(best_acc)) # load best model weights model.load_state_dict(best_model_wts) data = pd.DataFrame(global_info, columns = ['train_loss', 'train_acc', 'val_loss', 'val_acc']) data.to_csv('./csv_save/googlenet_raf.csv', header=True, index=True) return model
gradient_clipping_value = 0
var_len = True
lstm = LSTM(dataset, input_size, hidden_size, num_layers, batch_size=batch_size, dropout=dropout)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)
valid_acc_history = []
use_cuda = False
early_stopping = EarlyStopping(patience=3, verbose=True, delta=0)
for epoch in range(num_epochs):
    print('Epoch:', epoch)
    train_loss_avg = 0
    idx = np.array(np.random.permutation(range(Ntrain)))
    idx_torch = torch.LongTensor(idx)
    train_data = torch.index_select(train_data, 0, idx_torch)
    train_labels = torch.index_select(train_labels, 0, idx_torch)
    Lentrain = Lentrain[idx]
    for i in range(int(np.ceil(Ntrain // batch_size))):
        if (batch_size * (i + 1)) <= Ntrain:
def train_model(model, batch_size, patience, n_epochs):
    # to track the training loss as the model trains
    train_losses = []
    # to track the validation loss as the model trains
    valid_losses = []
    # to track the average training loss per epoch as the model trains
    avg_train_losses = []
    # to track the average validation loss per epoch as the model trains
    avg_valid_losses = []

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=patience, verbose=True)

    for epoch in range(1, n_epochs + 1):
        ###################
        # train the model #
        ###################
        model.train()  # prep model for training
        hidden = model.init_hidden(batch_size)
        for batch, (data, target) in enumerate(train_loader):
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            print('data', data)
            print('tar', target)
            # forward pass: compute predicted outputs by passing inputs to the model
            # output, hidden = model(Variable(data).float(), hidden)
            output, hidden = model(Variable(data).float(), hidden)
            # calculate the loss
            loss = criterion(output, Variable(target).view(-1))
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward(retain_graph=True)
            # perform a single optimization step (parameter update)
            optimizer.step()
            # record training loss
            train_losses.append(loss.item())

        ######################
        # validate the model #
        ######################
        model.eval()  # prep model for evaluation
        for data, target in valid_loader:
            # forward pass: compute predicted outputs by passing inputs to the model
            output, hidden = model(Variable(data).float(), hidden)
            # calculate the loss
            loss = criterion(output, Variable(target).view(-1))
            # record validation loss
            valid_losses.append(loss.item())

        # print training/validation statistics
        # calculate average loss over an epoch
        train_loss = np.average(train_losses)
        valid_loss = np.average(valid_losses)
        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)

        epoch_len = len(str(n_epochs))
        print_msg = (f'[{epoch:>{epoch_len}}/{n_epochs:>{epoch_len}}] ' +
                     f'train_loss: {train_loss:.5f} ' +
                     f'valid_loss: {valid_loss:.5f}')
        print(print_msg)
        file.write(print_msg)
        file.write('***************')

        # clear lists to track next epoch
        train_losses = []
        valid_losses = []

        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        early_stopping(valid_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

    # load the last checkpoint with the best model
    model.load_state_dict(torch.load('checkpoint.pt'))

    return model, avg_train_losses, avg_valid_losses
def main(args):
    dataset_name = args.dataset
    model_name = args.model
    n_inner_iter = args.adaptation_steps
    batch_size = args.batch_size
    save_model_file = args.save_model_file
    load_model_file = args.load_model_file
    lower_trial = args.lower_trial
    upper_trial = args.upper_trial
    is_test = args.is_test
    stopping_patience = args.stopping_patience
    epochs = args.epochs
    fast_lr = args.learning_rate
    slow_lr = args.meta_learning_rate
    noise_level = args.noise_level
    noise_type = args.noise_type
    resume = args.resume

    first_order = False
    inner_loop_grad_clip = 20
    task_size = 50
    output_dim = 1
    checkpoint_freq = 10
    horizon = 10  # test

    meta_info = {
        "POLLUTION": [5, 50, 14],
        "HR": [32, 50, 13],
        "BATTERY": [20, 50, 3]
    }

    assert model_name in ("FCN", "LSTM"), "Model was not correctly specified"
    assert dataset_name in ("POLLUTION", "HR", "BATTERY")

    window_size, task_size, input_dim = meta_info[dataset_name]
    grid = [0., noise_level]
    output_directory = "output/"

    train_data_ML = pickle.load(
        open("../../Data/TRAIN-" + dataset_name + "-W" + str(window_size) + "-T" + str(task_size) + "-ML.pickle", "rb"))
    validation_data_ML = pickle.load(
        open("../../Data/VAL-" + dataset_name + "-W" + str(window_size) + "-T" + str(task_size) + "-ML.pickle", "rb"))
    test_data_ML = pickle.load(
        open("../../Data/TEST-" + dataset_name + "-W" + str(window_size) + "-T" + str(task_size) + "-ML.pickle", "rb"))

    for trial in range(lower_trial, upper_trial):

        output_directory = "../../Models/" + dataset_name + "_" + model_name + "_MAML/" + str(trial) + "/"
        save_model_file_ = output_directory + save_model_file
        save_model_file_encoder = output_directory + "encoder_" + save_model_file
        load_model_file_ = output_directory + load_model_file
        checkpoint_file = output_directory + "checkpoint_" + save_model_file.split(".")[0]

        try:
            os.mkdir(output_directory)
        except OSError as error:
            print(error)

        with open(output_directory + "/results2.txt", "a+") as f:
            f.write("Learning rate: %f \n" % fast_lr)
            f.write("Meta-learning rate: %f \n" % slow_lr)
            f.write("Adaptation steps: %f \n" % n_inner_iter)
            f.write("Noise level: %f \n" % noise_level)

        if model_name == "LSTM":
            model = LSTMModel(batch_size=batch_size, seq_len=window_size, input_dim=input_dim,
                              n_layers=2, hidden_dim=120, output_dim=output_dim)

        model2 = LinearModel(120, 1)
        optimizer = torch.optim.Adam(list(model.parameters()) + list(model2.parameters()), lr=slow_lr)
        loss_func = mae
        # loss_func = nn.SmoothL1Loss()
        # loss_func = nn.MSELoss()

        initial_epoch = 0
        # torch.backends.cudnn.enabled = False
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        meta_learner = MetaLearner(model2, optimizer, fast_lr, loss_func, first_order,
                                   n_inner_iter, inner_loop_grad_clip, device)
        model.to(device)

        early_stopping = EarlyStopping(patience=stopping_patience, model_file=save_model_file_encoder, verbose=True)
        early_stopping2 = EarlyStopping(patience=stopping_patience, model_file=save_model_file_, verbose=True)

        if resume:
            checkpoint = torch.load(checkpoint_file)
            model.load_state_dict(checkpoint["model"])
            meta_learner.load_state_dict(checkpoint["meta_learner"])
            initial_epoch = checkpoint["epoch"]
            best_score = checkpoint["best_score"]
            counter = checkpoint["counter_stopping"]
            early_stopping.best_score = best_score
            early_stopping2.best_score = best_score
            early_stopping.counter = counter
            early_stopping2.counter = counter

        total_tasks, task_size, window_size, input_dim = train_data_ML.x.shape
        accum_mean = 0.0

        for epoch in range(initial_epoch, epochs):

            model.zero_grad()
            meta_learner._model.zero_grad()

            # train
            batch_idx = np.random.randint(0, total_tasks - 1, batch_size)
            # for batch_idx in range(0, total_tasks - 1, batch_size):
            x_spt, y_spt = train_data_ML[batch_idx]
            x_qry, y_qry = train_data_ML[batch_idx + 1]
            x_spt, y_spt = to_torch(x_spt), to_torch(y_spt)
            x_qry = to_torch(x_qry)
            y_qry = to_torch(y_qry)

            # data augmentation
            epsilon = grid[np.random.randint(0, len(grid))]
            if noise_type == "additive":
                y_spt = y_spt + epsilon
                y_qry = y_qry + epsilon
            else:
                y_spt = y_spt * (1 + epsilon)
                y_qry = y_qry * (1 + epsilon)

            train_tasks = [Task(model.encoder(x_spt[i]), y_spt[i]) for i in range(x_spt.shape[0])]
            val_tasks = [Task(model.encoder(x_qry[i]), y_qry[i]) for i in range(x_qry.shape[0])]

            adapted_params = meta_learner.adapt(train_tasks)
            mean_loss = meta_learner.step(adapted_params, val_tasks, is_training=True)
            # accum_mean += mean_loss.cpu().detach().numpy()
            # progressBar(batch_idx, total_tasks, 100)
            # print(accum_mean / (batch_idx + 1))

            # test
            val_error = test(validation_data_ML, meta_learner, model, device, noise_level)
            test_error = test(test_data_ML, meta_learner, model, device, 0.0)
            print("Epoch:", epoch)
            print("Val error:", val_error)
            print("Test error:", test_error)

            early_stopping(val_error, model)
            early_stopping2(val_error, meta_learner)

            # checkpointing
            if epoch % checkpoint_freq == 0:
                torch.save(
                    {
                        "epoch": epoch,
                        "model": model.state_dict(),
                        "meta_learner": meta_learner.state_dict(),
                        "best_score": early_stopping2.best_score,
                        "counter_stopping": early_stopping2.counter
                    }, checkpoint_file)

            if early_stopping.early_stop:
                print("Early stopping")
                break

        model.load_state_dict(torch.load(save_model_file_encoder))
        model2.load_state_dict(torch.load(save_model_file_)["model_state_dict"])
        meta_learner = MetaLearner(model2, optimizer, fast_lr, loss_func, first_order,
                                   n_inner_iter, inner_loop_grad_clip, device)

        validation_error = test(validation_data_ML, meta_learner, model, device, noise_level=0.0)
        test_error = test(test_data_ML, meta_learner, model, device, noise_level=0.0)
        validation_error_h1 = test(validation_data_ML, meta_learner, model, device, noise_level=0.0, horizon=1)
        test_error_h1 = test(test_data_ML, meta_learner, model, device, noise_level=0.0, horizon=1)

        model.load_state_dict(torch.load(save_model_file_encoder))
        model2.load_state_dict(torch.load(save_model_file_)["model_state_dict"])
        meta_learner2 = MetaLearner(model2, optimizer, fast_lr, loss_func, first_order,
                                    0, inner_loop_grad_clip, device)
        validation_error_h0 = test(validation_data_ML, meta_learner2, model, device, noise_level=0.0, horizon=1)
        test_error_h0 = test(test_data_ML, meta_learner2, model, device, noise_level=0.0, horizon=1)

        model.load_state_dict(torch.load(save_model_file_encoder))
        model2.load_state_dict(torch.load(save_model_file_)["model_state_dict"])
        meta_learner2 = MetaLearner(model2, optimizer, fast_lr, loss_func, first_order,
                                    n_inner_iter, inner_loop_grad_clip, device)
        validation_error_mae = test(validation_data_ML, meta_learner2, model, device, 0.0)
        test_error_mae = test(test_data_ML, meta_learner2, model, device, 0.0)
        print("test_error_mae", test_error_mae)

        with open(output_directory + "/results2.txt", "a+") as f:
            f.write("Test error: %f \n" % test_error)
            f.write("Validation error: %f \n" % validation_error)
            f.write("Test error h1: %f \n" % test_error_h1)
            f.write("Validation error h1: %f \n" % validation_error_h1)
            f.write("Test error h0: %f \n" % test_error_h0)
            f.write("Validation error h0: %f \n" % validation_error_h0)
            f.write("Test error mae: %f \n" % test_error_mae)
            f.write("Validation error mae: %f \n" % validation_error_mae)

        print(test_error)
        print(validation_error)
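
# The MAML script above moves numpy task batches onto the device through a to_torch()
# helper that is not part of this excerpt. A plausible minimal version, assuming
# float32 inputs and a module-level device (hypothetical, for illustration only):
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def to_torch(array):
    # convert a numpy array (or any array-like) to a float tensor on the target device
    return torch.as_tensor(array, dtype=torch.float32, device=device)
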
def train(args):
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)
    np.random.seed(123)
    random.seed(123)
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True

    console = Console()

    opt1 = optim.AdamW(
        params=args.model.parameters(),
        lr=args.learning_rate,
        weight_decay=args.weight_decay,
    )

    if args.use_count:
        pass
    else:
        # criterion1 = nn.BCEWithLogitsLoss(reduction='mean')  # input: logit, target in {0, 1}.
        criterion1 = nn.MSELoss(reduction='mean')

    writer = SummaryWriter(f'./runs/{args.experiment}')
    early_stopping = EarlyStopping(patience=10, verbose=False, path=f'./parameter/{args.experiment}.pth')

    steps_per_epoch = len(args.train_loader)
    for epoch in range(1, args.epochs + 1):
        total_loss = 0
        args.model.train()
        with tqdm(total=steps_per_epoch, leave=False, dynamic_ncols=True) as pbar:
            for i, batch in enumerate(args.train_loader):
                x = batch['input'].to(args.device)
                y = batch['target'].to(args.device)

                opt1.zero_grad()
                pred = args.model(x)
                recon_loss = criterion1(pred, y)
                recon_loss.backward()
                opt1.step()

                # opt2.zero_grad()
                # pred, z_fake = args.model(x)
                # z_real = Normal(loc=torch.zeros_like(z_fake), scale=1).sample()
                # c_fake_loss = criterion2(args.model.C(z_fake), y_fake)
                # c_real_loss = criterion2(args.model.C(z_real), y_real)
                # c_loss = 0.5 * (c_fake_loss + c_real_loss)
                # c_loss.backward()
                # nn.utils.clip_grad_norm_(args.model.C.parameters(), 1.)
                # opt2.step()

                pbar.update(1)

                # train_G_adv_loss += g_adv_loss.item()
                # train_G_recon_loss += recon_loss.item()
                # train_C_fake_loss += c_fake_loss.item()
                # train_C_real_loss += c_real_loss.item()
                total_loss += recon_loss.item()  # accumulate a float so the graph is not kept alive

        # avg_adv_loss = train_G_adv_loss / steps_per_epoch
        # avg_recon_loss = train_G_recon_loss / steps_per_epoch
        avg_recon_loss = total_loss / steps_per_epoch
        # avg_fake_loss = train_C_fake_loss / steps_per_epoch
        # avg_real_loss = train_C_real_loss / steps_per_epoch

        early_stopping(avg_recon_loss, args.model)
        if early_stopping.early_stop:
            print('Early stopping')
            break

        console.print(f"Train [{epoch:>04}]/[{args.epochs:>04}]: ", end='', style="Bold Cyan")
        # console.print(f"adv_loss:{avg_adv_loss:.4f}", sep=' | ', style='Bold Blue')
        console.print(f"recon_loss:{avg_recon_loss:.4f}", sep=' | ', style='Bold Blue')
        # console.print(f"fake_loss:{avg_fake_loss:.4f}", sep=' | ', style='Bold Blue')
        # console.print(f"real_loss:{avg_real_loss:.4f}", sep=' | ', style='Bold Blue')

        # writer.add_scalar(tag='adv_loss', scalar_value=avg_adv_loss, global_step=epoch)
        writer.add_scalar(tag='recon_loss', scalar_value=avg_recon_loss, global_step=epoch)
        # writer.add_scalar(tag='fake_loss', scalar_value=avg_fake_loss, global_step=epoch)
        # writer.add_scalar(tag='real_loss', scalar_value=avg_real_loss, global_step=epoch)

        if epoch % 10 == 0:
            torch.save(
                args.model.state_dict(),
                os.path.join("D:\프로젝트\메타플레이\jinsoo\parameter", f"{args.experiment}_epoch_{epoch:04d}.pt"))
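
# After training, the best weights live at the path handed to EarlyStopping above.
# Restoring them might look like the sketch below, which assumes the helper saved a
# plain state_dict (as the other snippets in this collection do); load_best is a
# hypothetical helper name, not part of the original script:
import torch

def load_best(model, experiment, device):
    # load the checkpoint written by EarlyStopping(path=f'./parameter/{experiment}.pth')
    state = torch.load(f'./parameter/{experiment}.pth', map_location=device)
    model.load_state_dict(state)
    model.eval()
    return model
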