def main():
    # get the datasets
    train_dataset = get_dataset(
        root="/media/fanyang/workspace/DataSet/MARS/bbox_train",
        txt_file=os.path.join(Proj_Dir, 'data/train.txt'))
    val_dataset = get_dataset(
        root="/media/fanyang/workspace/DataSet/MARS/bbox_train",
        txt_file=os.path.join(Proj_Dir, 'data/test.txt'))

    # prepare the model
    resnet = resnet50(pretrained=False)

    # two criteria: cross-entropy for classification, siamese loss for metric learning
    resnet.criterion = nn.CrossEntropyLoss()
    resnet.criterion2 = SiameseLoss()
    resnet.fc = nn.Linear(in_features=2048, out_features=625)
    resnet.load_state_dict_diy(
        torch.load(
            "/home/fanyang/PycharmProjects/PersonReID_CL/ckpt/no-cl-model-resnet50/last-layer-finetuned-model.pkl"
        ))

    # resnet.optimizer = optim.Adam(params=resnet.fc.parameters(), lr=1e-3)
    resnet.optimizer = optim.SGD(params=resnet.parameters(), lr=5e-4)
    resnet.cuda()

    # record the training process
    writer_dir = "ckpt/siamese-model-resnet50"
    saver_dir = writer_dir
    writer = SummaryWriter(log_dir=os.path.join(Proj_Dir, writer_dir))

    routine = Routine2Criteion(model=resnet, saver_dir=saver_dir, writer=writer)

    while True:
        train_loader = get_loader(dataset=train_dataset, batch_size=BATCH_SIZE)
        val_loader = get_loader(dataset=val_dataset, batch_size=BATCH_SIZE,
                                shuffle=False, drop_last=False)
        routine.train_one_epoch(loader=train_loader, record_n_times_per_epoch=400)
        # adjust the learning rate once per epoch
        adjust_learning_rate(resnet.optimizer)
        routine.validation(loader=val_loader)
def main():
    # get the datasets
    train_dataset = get_dataset(
        root="/media/fanyang/workspace/DataSet/MARS/bbox_train",
        txt_file=os.path.join(Proj_Dir, 'data/train.txt'))
    val_dataset = get_dataset(
        root="/media/fanyang/workspace/DataSet/MARS/bbox_train",
        txt_file=os.path.join(Proj_Dir, 'data/test.txt'))

    # prepare the model
    model = Inceptionv2()

    # two criteria: cross-entropy for classification, center loss for feature clustering
    model.criterion = nn.CrossEntropyLoss()
    model.criterion2 = CenterLoss(feature_len=1024, num_classes=625)
    # model.load_state_dict(
    #     torch.load(
    #         "/home/fanyang/PycharmProjects/PersonReID_CL/ckpt/no-cl-model-resnet50/last-layer-finetuned-model.pkl"))
    model.cuda()

    model.optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
    # model.optimizer = optim.SGD(params=model.parameters(), lr=1e-3)

    # record the training process
    writer_dir = "ckpt/cl-model-inceptionv2"
    saver_dir = writer_dir
    writer = SummaryWriter(log_dir=os.path.join(Proj_Dir, writer_dir))

    routine = Routine2Criteion(model=model, saver_dir=saver_dir, writer=writer)

    while True:
        train_loader = get_loader(dataset=train_dataset, batch_size=BATCH_SIZE)
        val_loader = get_loader(dataset=val_dataset, batch_size=BATCH_SIZE,
                                shuffle=False, drop_last=False)
        routine.train_one_epoch(loader=train_loader, record_n_times_per_epoch=400)
        # adjust the learning rate once per epoch
        adjust_learning_rate(model.optimizer)
        routine.validation(loader=val_loader)
def main():
    root = "/home/fanyang/PycharmProjects/SignLanguage/data/tctodd"
    id_sign, sign_id = id_2_sign(root)
    train_samples, dev_samples = load_dataset(root, sign_id, hold_id=8)
    train_dataset = SignLangDataset(sample_list=train_samples)
    dev_dataset = SignLangDataset(sample_list=dev_samples)

    # model: load a trained network and freeze its weights, so only the
    # synthetic input below gets optimized
    model = SimpleNN1DCNN()
    model.load_state_dict(
        torch.load('/home/fanyang/PycharmProjects/SignLanguage/ckpt/model.pkl'))
    model.cuda()
    for param in model.parameters():
        param.requires_grad = False

    vis = visdom.Visdom()
    min_val, max_val = get_min_max_from_dataset(train_dataset)
    # min: -0.4535120129585266, max: 1.0
    print("min:{}, max:{}".format(min_val, max_val))
    vis.heatmap(X=train_dataset[0][0], opts={'xmin': min_val, 'xmax': max_val})

    # synthesize a [22, 90] input that maximizes the logit of label 0;
    # build the Variable on the GPU directly so it stays a leaf node, and
    # initialize through .data to bypass autograd
    inputs = Variable(torch.randn(1, 1, 22, 90).cuda(), requires_grad=True)
    nn.init.normal(inputs.data, std=.001)

    model.optimizer = optim.SGD(params=[inputs], lr=1e-4)

    for i in count():
        logits = model(inputs)
        logits = torch.squeeze(logits)
        model.optimizer.zero_grad()
        # a gradient of -1 turns SGD's descent step into ascent on logits[0]
        logits[0].backward(torch.FloatTensor([-1.]).cuda())
        model.optimizer.step()

        if (i + 1) % 50000 == 0:
            vis.heatmap(X=torch.squeeze(inputs).cpu().data,
                        opts={'xmin': min_val, 'xmax': max_val,
                              'title': '%d-step' % (i + 1)})
            print("step:%d" % (i + 1),
                  "prob:%.7f" % F.softmax(logits)[0].cpu().data.numpy()[0])
            adjust_learning_rate(model.optimizer)
def main():
    # get the datasets
    train_dataset = get_dataset(
        root="/media/fanyang/workspace/DataSet/MARS/bbox_train",
        txt_file=os.path.join(Proj_Dir, 'data/train.txt'))
    val_dataset = get_dataset(
        root="/media/fanyang/workspace/DataSet/MARS/bbox_train",
        txt_file=os.path.join(Proj_Dir, 'data/test.txt'))

    # prepare the model
    inceptionv2 = Inceptionv2()
    inceptionv2.criterion = nn.CrossEntropyLoss()
    inceptionv2.cuda()
    # inceptionv2.load_state_dict(
    #     torch.load(
    #         "/home/fanyang/PycharmProjects/PersonReID_CL/ckpt/model-inceptionv3-transform-input.pkl"))
    inceptionv2.optimizer = optim.Adam(params=inceptionv2.parameters(), lr=1e-3)
    # inceptionv2.optimizer = optim.SGD(params=inceptionv2.parameters(), lr=5e-4)

    # record the training process
    writer_dir = "ckpt/no-cl-model-inceptionv2"
    saver_dir = writer_dir
    writer = SummaryWriter(log_dir=os.path.join(Proj_Dir, writer_dir))

    routine = Routine(model=inceptionv2, saver_dir=saver_dir, writer=writer)

    while True:
        train_loader = get_loader(dataset=train_dataset, batch_size=BATCH_SIZE)
        val_loader = get_loader(dataset=val_dataset, batch_size=BATCH_SIZE,
                                shuffle=False, drop_last=False)
        routine.train_one_epoch(loader=train_loader, record_n_times_per_epoch=400)
        # adjust the learning rate once per epoch
        adjust_learning_rate(inceptionv2.optimizer)
        routine.validation(loader=val_loader)
def main():
    root = "/home/fanyang/PycharmProjects/SignLanguage/data/tctodd"
    id_sign, sign_id = id_2_sign(root)

    # 9-fold cross-validation: hold out one split at a time
    for hold_id in range(9):
        train_samples, dev_samples = load_dataset(root, sign_id, hold_id=hold_id)
        train_dataset = SignLangDataset(sample_list=train_samples)
        dev_dataset = SignLangDataset(sample_list=dev_samples)

        # model
        model = SimpleNN1DCNN()
        model.cuda()
        model.criterion = nn.CrossEntropyLoss()
        model.optimizer = optim.Adam(params=model.parameters())

        writer = SummaryWriter('ckpt/hold-id-%d' % hold_id)
        routine = Routine(model=model,
                          saver_dir='ckpt/hold-id-%d' % hold_id,
                          writer=writer)

        # the epoch counter gets its own name so it does not shadow the fold index
        for epoch in count():
            train_loader = SignLangDataLoader(dataset=train_dataset,
                                              batch_size=1, shuffle=True)
            dev_loader = SignLangDataLoader(dataset=dev_dataset,
                                            batch_size=1, shuffle=True)
            routine.train_one_epoch(train_loader)
            tools.adjust_learning_rate(model.optimizer)
            routine.validation(dev_loader)
            if epoch == 100:
                break
def main():
    root = "/home/fanyang/PycharmProjects/SignLanguage/data/robotfailuer"
    file_name = 'lp1.data.txt'
    file_path = os.path.join(root, file_name)
    dataset = load_data_robot(file_name=file_path)
    print("num data in dataset ", len(dataset))

    # 5-fold cross-validation
    for bin_id in range(5):
        train_data, dev_data = k_fold(dataset=dataset, bin_id=bin_id)
        train_batch, train_label = train_data
        dev_batch, dev_label = dev_data
        train_batch, dev_batch = normalize_data(train_batch, dev_batch)

        # model
        model = SimpleNN1DCNNRobot(num_classes=4)
        model.cuda()
        model.criterion = nn.CrossEntropyLoss()
        # use SGD instead of Adam
        model.optimizer = optim.SGD(params=model.parameters(), lr=1e-4)

        train_batch = Variable(train_batch).cuda()
        train_label = Variable(train_label).cuda()
        dev_batch = Variable(dev_batch, volatile=True).cuda()
        dev_label = Variable(dev_label, volatile=True).cuda()

        # prepare the writer
        writer_dir = 'ckpt/robot-1-bin-id-%d' % bin_id
        saver_dir = writer_dir
        writer = SummaryWriter(writer_dir)

        for epoch in count():
            # full-batch training step
            model.train()
            logits = model(train_batch, writer)
            loss = model.criterion(logits, train_label)
            model.optimizer.zero_grad()
            loss.backward()
            model.optimizer.step()
            print("epoch:{}, loss:{}".format(epoch, loss.cpu().data.numpy()[0]))

            writer.add_scalar('train/loss', loss.cpu().data.numpy(),
                              global_step=epoch)
            writer.add_scalar('train/accu',
                              tools.accuracy(logits, train_label).cpu().data.numpy(),
                              global_step=epoch)

            torch.save(model.state_dict(),
                       os.path.join(saver_dir, 'record-step-%d-model.pkl' % epoch))
            # keep only the latest 5 parameter checkpoints by deleting the
            # one that is now 5 epochs old (not the one just saved)
            if epoch >= 5:
                os.remove(os.path.join(saver_dir,
                                       'record-step-%d-model.pkl' % (epoch - 5)))

            tools.adjust_learning_rate(model.optimizer)

            # switch the model to evaluation mode
            model.eval()
            logits = model(dev_batch)
            loss = model.criterion(logits, dev_label)
            writer.add_scalar('val/loss', loss.cpu().data.numpy(),
                              global_step=epoch)
            writer.add_scalar('val/accu',
                              tools.accuracy(logits, dev_label).cpu().data.numpy(),
                              global_step=epoch)
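# The single-argument adjust_learning_rate(optimizer) helper called by the
# scripts above is defined elsewhere in the repo; a minimal sketch, assuming a
# simple multiplicative decay (the decay factor here is a hypothetical default):
def adjust_learning_rate(optimizer, decay_rate=0.9):
    """Multiply every parameter group's learning rate by decay_rate."""
    for param_group in optimizer.param_groups:
        param_group['lr'] *= decay_rate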
def train(self, ii, logger):
    train_data, train_loader = self._get_data(flag='train')
    vali_data, vali_loader = self._get_data(flag='val')
    next_data, next_loader = self._get_data(flag='train')
    test_data, test_loader = self._get_data(flag='test')
    if self.args.rank == 1:
        train_data, train_loader = self._get_data(flag='train')

    path = os.path.join(self.args.path, str(ii))
    try:
        os.mkdir(path)
    except FileExistsError:
        pass

    time_now = time.time()
    train_steps = len(train_loader)
    early_stopping = EarlyStopping(patience=self.args.patience,
                                   verbose=True, rank=self.args.rank)

    W_optim, A_optim = self._select_optimizer()
    criterion = self._select_criterion()

    if self.args.use_amp:
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(self.args.train_epochs):
        iter_count = 0
        train_loss = []
        rate_counter = AverageMeter()
        Ag_counter, A_counter, Wg_counter, W_counter = (
            AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter())

        self.model.train()
        epoch_time = time.time()
        for i, (trn_data, val_data, next_data) in enumerate(
                zip(train_loader, vali_loader, next_loader)):
            # move every tensor of the three batches to the device; the index
            # variable must not shadow the batch index i
            for j in range(len(trn_data)):
                trn_data[j] = trn_data[j].float().to(self.device)
                val_data[j] = val_data[j].float().to(self.device)
                next_data[j] = next_data[j].float().to(self.device)
            iter_count += 1

            # architecture step: unrolled bi-level gradient w.r.t. A
            A_optim.zero_grad()
            rate = self.arch.unrolled_backward(
                self.args, trn_data, val_data, next_data,
                W_optim.param_groups[0]['lr'], W_optim)
            rate_counter.update(rate)

            # for r in range(1, self.args.world_size):
            #     for n, h in self.model.named_H():
            #         if "proj.{}".format(r) in n:
            #             if self.args.rank <= r:
            #                 with torch.no_grad():
            #                     dist.all_reduce(h.grad)
            #                     h.grad *= self.args.world_size/r+1
            #             else:
            #                 z = torch.zeros(h.shape).to(self.device)
            #                 dist.all_reduce(z)

            # average the architecture gradients across workers
            for a in self.model.A():
                with torch.no_grad():
                    dist.all_reduce(a.grad)

            a_g_norm = 0
            a_norm = 0
            n = 0
            for a in self.model.A():
                a_g_norm += a.grad.mean()
                a_norm += a.mean()
                n += 1
            Ag_counter.update(a_g_norm / n)
            A_counter.update(a_norm / n)
            A_optim.step()

            # weight step
            W_optim.zero_grad()
            pred, true = self._process_one_batch(train_data, trn_data)
            loss = criterion(pred, true)
            train_loss.append(loss.item())

            if (i + 1) % 100 == 0:
                logger.info("\tR{0} iters: {1}, epoch: {2} | loss: {3:.7f}".format(
                    self.args.rank, i + 1, epoch + 1, loss.item()))
                speed = (time.time() - time_now) / iter_count
                left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                logger.info('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(
                    speed, left_time))
                iter_count = 0
                time_now = time.time()

            if self.args.use_amp:
                scaler.scale(loss).backward()
                scaler.step(W_optim)
                scaler.update()
            else:
                loss.backward()
                w_g_norm = 0
                w_norm = 0
                n = 0
                for w in self.model.W():
                    w_g_norm += w.grad.mean()
                    w_norm += w.mean()
                    n += 1
                Wg_counter.update(w_g_norm / n)
                W_counter.update(w_norm / n)
                W_optim.step()

        logger.info("R{} Epoch: {} W:{} Wg:{} A:{} Ag:{} rate:{}".format(
            self.args.rank, epoch + 1, W_counter.avg, Wg_counter.avg,
            A_counter.avg, Ag_counter.avg, rate_counter.avg))
        logger.info("R{} Epoch: {} cost time: {}".format(
            self.args.rank, epoch + 1, time.time() - epoch_time))

        train_loss = np.average(train_loss)
        vali_loss = self.vali(vali_data, vali_loader, criterion)
        test_loss = self.vali(test_data, test_loader, criterion)

        logger.info(
            "R{0} Epoch: {1}, Steps: {2} | Train Loss: {3:.7f} Vali Loss: {4:.7f} Test Loss: {5:.7f}"
            .format(self.args.rank, epoch + 1, train_steps,
                    train_loss, vali_loss, test_loss))

        early_stopping(vali_loss, self.model, path)
        # gather the early-stop decision from both workers and stop only when
        # every rank agrees
        flag = torch.tensor([1]) if early_stopping.early_stop else torch.tensor([0])
        flag = flag.to(self.device)
        flags = [torch.tensor([1]).to(self.device),
                 torch.tensor([1]).to(self.device)]
        dist.all_gather(flags, flag)
        if flags[0].item() == 1 and flags[1].item() == 1:
            logger.info("Early stopping")
            break

        adjust_learning_rate(W_optim, epoch + 1, self.args)

    best_model_path = path + '/' + '{}_checkpoint.pth'.format(self.args.rank)
    self.model.load_state_dict(torch.load(best_model_path))
    return self.model
def train(self, setting):
    train_data, train_loader = self._get_data(flag='train')
    vali_data, vali_loader = self._get_data(flag='val')
    test_data, test_loader = self._get_data(flag='test')

    path = os.path.join(self.args.checkpoints, setting)
    if not os.path.exists(path):
        os.makedirs(path)

    time_now = time.time()
    train_steps = len(train_loader)
    early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

    model_optim = self._select_optimizer()
    criterion = self._select_criterion()

    if self.args.use_amp:
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(self.args.train_epochs):
        iter_count = 0
        train_loss = []

        self.model.train()
        epoch_time = time.time()
        for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
            iter_count += 1

            model_optim.zero_grad()
            pred, true = self._process_one_batch(
                train_data, batch_x, batch_y, batch_x_mark, batch_y_mark)
            loss = criterion(pred, true)
            train_loss.append(loss.item())

            if (i + 1) % 100 == 0:
                print("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(
                    i + 1, epoch + 1, loss.item()))
                speed = (time.time() - time_now) / iter_count
                left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time))
                iter_count = 0
                time_now = time.time()

            if self.args.use_amp:
                scaler.scale(loss).backward()
                scaler.step(model_optim)
                scaler.update()
            else:
                loss.backward()
                model_optim.step()

        print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time))
        train_loss = np.average(train_loss)
        vali_loss = self.vali(vali_data, vali_loader, criterion)
        test_loss = self.vali(test_data, test_loader, criterion)

        print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format(
            epoch + 1, train_steps, train_loss, vali_loss, test_loss))
        early_stopping(vali_loss, self.model, path)
        if early_stopping.early_stop:
            print("Early stopping")
            break

        adjust_learning_rate(model_optim, epoch + 1, self.args)

    best_model_path = path + '/' + 'checkpoint.pth'
    self.model.load_state_dict(torch.load(best_model_path))
    return self.model
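# EarlyStopping is shared by the train() methods in this file but defined
# elsewhere; a minimal sketch, assuming the usual Informer-style helper that
# checkpoints the model whenever the validation loss improves (names follow
# the call sites above):
class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss, model, path):
        if self.best_loss is None or val_loss < self.best_loss - self.delta:
            # validation loss improved: save a checkpoint and reset the counter
            if self.verbose:
                print('Validation loss decreased ({} --> {}).'.format(
                    self.best_loss, val_loss))
            self.best_loss = val_loss
            torch.save(model.state_dict(), path + '/' + 'checkpoint.pth')
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True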
def train(self, setting):
    # fetch the datasets and loaders via PyTorch's data utilities
    train_data, train_loader = self._get_data(flag='train')
    vali_data, vali_loader = self._get_data(flag='val')
    test_data, test_loader = self._get_data(flag='test')

    path = os.path.join(self.args.checkpoints, setting)
    if not os.path.exists(path):
        os.makedirs(path)

    time_now = time.time()
    train_steps = len(train_loader)
    early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

    model_optim = self._select_optimizer()
    criterion = self._select_criterion()  # how the loss is computed

    if self.args.use_amp:
        scaler = torch.cuda.amp.GradScaler()

    # collect the per-epoch average loss across ALL epochs, so the list must
    # be created before the epoch loop, not inside it
    train_loss_avg_list = []

    # train_epochs defaults to 6
    for epoch in range(self.args.train_epochs):
        iter_count = 0
        train_loss = []

        self.model.train()  # switch the model to training mode
        epoch_time = time.time()
        # iterating the data loader with for-in yields ready-made batches
        for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(tqdm(train_loader)):
            iter_count += 1
            model_optim.zero_grad()  # reset the gradients
            # do not call model.eval() during training

            # the core step: current prediction and the ground truth
            pred, true = self._process_one_batch(
                train_data, batch_x, batch_y, batch_x_mark, batch_y_mark)

            if self.args.interpret is True:
                # prediction when high-attention positions are masked
                mask_attention_pred, true = self._process_one_batch(
                    train_data, batch_x, batch_y, batch_x_mark, batch_y_mark)
                loss = criterion(pred, true, mask_attention_pred)
            else:
                loss = criterion(pred, true)  # compute the loss
            train_loss.append(loss.item())

            if (i + 1) % 100 == 0:
                print("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(
                    i + 1, epoch + 1, loss.item()))
                speed = (time.time() - time_now) / iter_count
                left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time))
                iter_count = 0
                time_now = time.time()

            if self.args.use_amp:
                scaler.scale(loss).backward()
                scaler.step(model_optim)
                scaler.update()
            else:
                loss.backward()     # backpropagation
                model_optim.step()  # parameter update

        print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time))
        train_loss_avg = np.average(train_loss)
        train_loss_avg_list.append(train_loss_avg)
        vali_loss = self.vali(vali_data, vali_loader, criterion)
        test_loss = self.vali(test_data, test_loader, criterion)

        print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format(
            epoch + 1, train_steps, train_loss_avg, vali_loss, test_loss))
        early_stopping(vali_loss, self.model, path)
        if early_stopping.early_stop:
            print("Early stopping")
            break

        adjust_learning_rate(model_optim, epoch + 1, self.args)

        # LINE notification
        if self.args.notify:
            send_line_notify(message="Epoch: {} cost time: {}".format(
                epoch + 1, time.time() - epoch_time)
                + "Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format(
                    epoch + 1, train_steps, train_loss_avg, vali_loss, test_loss))

    # checkpoint.pth was written by early_stopping whenever the validation loss improved
    best_model_path = path + '/' + 'checkpoint.pth'
    self.model.load_state_dict(torch.load(best_model_path))

    folder_path = './results/' + setting + '/'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    # save the loss history
    np.save(folder_path + '/' + 'train_loss_avg_list.npy', train_loss_avg_list)
    return self.model
def train(self, setting):
    train_data, train_loader = self._get_data(flag='train')
    valid_data, valid_loader = self._get_data(flag='val')
    print(f'number of batches in train data={len(train_loader)}')
    print(f'number of batches in valid data={len(valid_loader)}')

    path = './checkpoints/' + setting
    if not os.path.exists(path):
        os.makedirs(path)

    time_now = time.time()
    train_steps = len(train_loader)
    early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

    model_optim = self._select_optimizer()
    criterion = self._select_criterion(self.args.data)

    best_utility = 0
    for epoch in range(self.args.train_epochs):
        iter_count = 0
        train_loss = []
        train_auc = []

        self.model.train()
        # batch_x: (batch_size, seq_len, n_features)
        # batch_y: (batch_size, label_len + pred_len, n_features)
        # batch_x_mark: (batch_size, seq_len)
        # batch_y_mark: (batch_size, label_len + pred_len)
        for i, (s_begin, s_end, r_begin, r_end,
                batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
            iter_count += 1
            model_optim.zero_grad()

            batch_x = batch_x.float().to(self.device)
            batch_y = batch_y.float()
            batch_x_mark = batch_x_mark.float().to(self.device)
            batch_y_mark = batch_y_mark.float().to(self.device)

            # decoder input: zero placeholders for the prediction window,
            # prefixed with the known label_len tokens
            dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).float()
            dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp],
                                dim=1).float().to(self.device)

            # encoder - decoder
            if self.args.output_attention:
                y_pred = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0]
            else:
                y_pred = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)

            f_dim = -1 if self.args.features == 'MS' else 0
            y_true = batch_y[:, -self.args.pred_len:, -self.args.c_out:].to(self.device)
            # y_true = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)

            loss = criterion(y_pred, y_true)
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            model_optim.step()

            # binarize predictions and targets for the AUC metric
            y_pred = np.where(y_pred.sigmoid().detach().cpu().numpy() >= 0.5, 1, 0).astype(int)
            y_true = np.where(y_true.sigmoid().detach().cpu().numpy() >= 0.5, 1, 0).astype(int)
            # y_true = y_true.detach().cpu().numpy().astype(int)
            train_auc.append(roc_auc_score(np.median(y_true, axis=1),
                                           np.median(y_pred, axis=1)))

            loss = loss.item()
            train_loss.append(loss)

            if (i + 1) % 100 == 0:
                print(f'\ttrain_iters={i+1} | epoch={epoch+1} | '
                      f'batch_loss={loss:.4f} | running_loss={np.mean(train_loss):.4f} | '
                      f'running_auc={np.mean(train_auc):.4f}')
                speed = (time.time() - time_now) / iter_count
                left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                print(f'\tspeed={speed:.4f}s/batch | left_time={left_time:.4f}s')
                iter_count = 0
                time_now = time.time()

            # release the batch tensors aggressively to limit GPU memory growth
            del batch_x, batch_x_mark, batch_y, batch_y_mark, dec_inp, y_true, y_pred
            gc.collect()
            torch.cuda.empty_cache()
            torch.cuda.reset_max_memory_allocated(self.device)

        returns = self.evaluate(valid_data, valid_loader, criterion)
        valid_loss, valid_preds, valid_trues, v_start, v_end = returns

        valid_preds = np.median(valid_preds, axis=1)
        print(pd.DataFrame(valid_preds).describe())
        valid_preds = np.where(valid_preds >= 0.5, 1, 0).astype(int)
        valid_trues = np.median(valid_trues, axis=1)
        valid_trues = np.where(valid_trues >= 0.5, 1, 0).astype(int)
        valid_auc = roc_auc_score(valid_trues, valid_preds)

        valid_u_score = utility_score_bincount(
            date=valid_data.data_stamp[v_start:v_end],
            weight=valid_data.weight[v_start:v_end],
            resp=valid_data.resp[v_start:v_end],
            action=valid_preds)
        max_u_score = utility_score_bincount(
            date=valid_data.data_stamp[v_start:v_end],
            weight=valid_data.weight[v_start:v_end],
            resp=valid_data.resp[v_start:v_end],
            action=valid_trues)
        best_utility = max(best_utility, valid_u_score)

        print(f'epoch={epoch+1} | '
              f'average_train_loss={np.mean(train_loss):.4f} | average_valid_loss={valid_loss:.4f} | '
              f'valid_utility={valid_u_score:.4f}/{max_u_score:.4f} | valid_auc={valid_auc:.4f}')

        early_stopping(valid_auc, self.model, path)
        if early_stopping.early_stop:
            print("Early stopping")
            print(f"Best utility score is {best_utility:.4f}")
            break

        adjust_learning_rate(model_optim, epoch + 1, self.args)

    best_model_path = path + '/' + 'checkpoint.pth'
    self.model.load_state_dict(torch.load(best_model_path))
    return self.model
def train(self, setting):
    train_data, train_loader = self._get_data(flag='train')
    vali_data, vali_loader = self._get_data(flag='val')
    test_data, test_loader = self._get_data(flag='test')

    total_para, trainable_para = self._get_number_parameters()
    print('Total number of parameters: {:d}'.format(total_para))
    print('Number of trainable parameters: {:d}'.format(trainable_para))

    path = './checkpoints/' + setting
    if not os.path.exists(path):
        os.makedirs(path)

    time_now = time.time()
    train_steps = len(train_loader)
    early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

    model_optim = self._select_optimizer()
    criterion = self._select_criterion()

    for epoch in range(self.args.train_epochs):
        iter_count = 0
        train_loss = []

        self.model.train()
        for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
            iter_count += 1
            model_optim.zero_grad()

            batch_x = batch_x.double().to(self.device)
            batch_y = batch_y.double()
            batch_x_mark = batch_x_mark.double().to(self.device)
            batch_y_mark = batch_y_mark.double().to(self.device)

            # decoder input
            dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).double()
            dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp],
                                dim=1).double().to(self.device)

            # encoder - decoder
            outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
            batch_y = batch_y[:, -self.args.pred_len:, :].to(self.device)
            loss = criterion(outputs, batch_y)
            train_loss.append(loss.item())

            if (i + 1) % 100 == 0:
                print("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(
                    i + 1, epoch + 1, loss.item()))
                speed = (time.time() - time_now) / iter_count
                left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time))
                iter_count = 0
                time_now = time.time()

            loss.backward()
            model_optim.step()

        train_loss = np.average(train_loss)
        vali_loss = self.vali(vali_data, vali_loader, criterion)
        test_loss = self.vali(test_data, test_loader, criterion)

        print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}"
              .format(epoch + 1, train_steps, train_loss, vali_loss, test_loss))
        early_stopping(vali_loss, self.model, path)
        if early_stopping.early_stop:
            print("Early stopping")
            break

        adjust_learning_rate(model_optim, epoch + 1, self.args)

    best_model_path = path + '/' + 'checkpoint.pth'
    self.model.load_state_dict(torch.load(best_model_path))
    return self.model
print(net_glob)
net_glob.train()

# training
train_loader = DataLoader(dataset_train, batch_size=64, shuffle=True)
optimizer = optim.SGD(net_glob.parameters(), lr=args.lr, momentum=args.momentum)

list_loss = []
global_acc_tests = []
local_acc_tests = []
net_glob.train()
for epoch in range(args.epochs):
    batch_loss = []
    # step-decay schedule: shrink the lr every args.per_epoch epochs
    lr = args.lr * (args.decay_rate ** (epoch // args.per_epoch))
    adjust_learning_rate(optimizer, lr)
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(args.device), target.to(args.device)
        optimizer.zero_grad()
        output = net_glob(data)
        loss = F.cross_entropy(output[-1], target)
        loss.backward()
        optimizer.step()
        batch_loss.append(loss.item())
    loss_avg = sum(batch_loss) / len(batch_loss)
    print('\nTrain loss:', loss_avg)
    list_loss.append(loss_avg)

# testing
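# In the script above, adjust_learning_rate(optimizer, lr) receives the
# already-computed rate; a minimal sketch of that variant, assuming it simply
# overwrites every parameter group (the helper itself is defined elsewhere):
def adjust_learning_rate(optimizer, lr):
    """Set every parameter group's learning rate to lr."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr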
def train(self, setting):
    print('prepare data...')
    train_data_loaders, vali_data_loaders, test_data_loaders = self._get_data()
    print('Number of data loaders:', len(train_data_loaders))

    path = './checkpoints/' + setting
    if not os.path.exists(path):
        os.makedirs(path)

    time_now = time.time()
    early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

    model_optim = self._select_optimizer()
    criterion = self._select_criterion()

    for epoch in range(self.args.train_epochs):
        iter_count = 0
        self.model.train()
        # iterate the individual data loaders one after another
        for index in range(len(train_data_loaders)):
            train_loader = train_data_loaders[index]
            train_loss = []
            begin_ = time.time()
            for i, (batch_x, batch_y) in enumerate(train_loader):
                iter_count += 1
                model_optim.zero_grad()
                batch_x = batch_x.double()  # .to(self.device)
                batch_y = batch_y.double()
                outputs = self.model(batch_x).view(-1, 24)
                batch_y = batch_y[:, -self.args.pred_len:, -1].view(-1, 24)  # .to(self.device)
                loss = criterion(outputs, batch_y)  # + 0.1*corr
                train_loss.append(loss.item())
                loss.backward()
                model_optim.step()
            print('INDEX Finished', index, 'train loss',
                  np.average(train_loss), 'COST', time.time() - begin_)

        train_loss = np.average(train_loss)
        vali_loss, mae, score = self.test('1')
        early_stopping(-score, self.model, path)
        print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} score: {4:.7f}"
              .format(epoch + 1, 0, np.average(train_loss), vali_loss, score))
        if early_stopping.early_stop:
            print("Early stopping")
            break

        adjust_learning_rate(model_optim, epoch + 1, self.args)

    best_model_path = path + '/' + 'checkpoint.pth'
    self.model.load_state_dict(torch.load(best_model_path))
    print('Model is saved at', best_model_path)
    self.model.eval()
    return self.model
def main():
    # load hyper-parameters
    lr = conf_yolov3['lr']
    momentum = conf_yolov3['momentum']
    weight_decay = conf_yolov3['weight_decay']
    batch_size = conf_yolov3['batch_size']
    epoch = conf_yolov3['epoch']
    gamma = conf_yolov3['gamma']

    # load data
    # data_path = '/mnt/storage/project/data/VOCdevkit/VOC2007'
    data_path = '~/datasets/VOC/VOCdevkit/VOC2007'
    data_set = LoadVocDataSets(data_path, 'trainval', AnnotationTransform(),
                               PreProcess(resize=(416, 416)))

    # define the network
    yolov3 = YOLOV3(conf_yolov3).cuda()
    print(yolov3)

    # define the loss functions, one per detection scale
    yolo_losses = [YOlOLoss(conf_yolov3, i).cuda() for i in range(3)]

    # define the optimizer
    optimizer = optim.SGD(yolov3.parameters(), lr=lr, momentum=momentum,
                          weight_decay=weight_decay)

    # set the iteration numbers
    epoch_size = len(data_set) // batch_size
    max_iter = epoch_size * epoch
    adjust = 0

    # start iterating
    for iteration in range(max_iter):
        if iteration % epoch_size == 0:
            # recreate the batch iterator at the start of every epoch
            batch_iter = iter(DataLoader(data_set, batch_size, shuffle=True,
                                         num_workers=6,
                                         collate_fn=detection_collate))

        # automatically adjust the lr; count how many times it has fired
        if (iteration / float(epoch_size)) % (epoch / 3) == 0:
            lr_ = adjust_learning_rate(lr, optimizer, gamma, epoch, adjust,
                                       iteration, epoch_size)
            adjust += 1

        # time the step
        load_t0 = time.time()

        # fetch a batch
        images, targets = next(batch_iter)
        images = images.cuda()
        targets = [anno.cuda() for anno in targets]

        # forward
        outputs = yolov3(images)

        # backward
        optimizer.zero_grad()
        loss = []
        for i, output in enumerate(outputs):
            loss.append(yolo_losses[i](output, targets))
        total_loss = sum(loss)
        total_loss.backward()
        optimizer.step()
        load_t1 = time.time()

        if iteration % 10 == 0:
            print('Epoch:' + repr(iteration // epoch_size)
                  + ' || epochiter: ' + repr(iteration % epoch_size) + '/' + repr(epoch_size)
                  + ' || Total iter ' + repr(iteration)
                  + ' || Loss: %.4f ||' % total_loss
                  + ' Batch time: %.4f sec. ||' % (load_t1 - load_t0)
                  + ' LR: %.8f' % optimizer.param_groups[0]['lr'])
def train(self, setting):
    print(self.model)
    print(sum(p.numel() for p in self.model.parameters()))

    data_dir = "/mnt/ufs18/home-052/surunze/biostat_project/archive_1/transcheckkernels1200/dataset/"
    train_data, train_loader = self._get_data(flag='train', data_dir=data_dir)
    print("train data loaded")
    vali_data, vali_loader = self._get_data(flag='val', data_dir=data_dir)
    print("valid data loaded")
    test_data, test_loader = self._get_data(flag='test', data_dir=data_dir)
    print("test data loaded")

    # inspect the shapes of the first few samples
    s = train_data
    print("train data train data", len(s[0]))
    print("train data train data", s[1][0].shape)
    for k in range(4):
        print("train data train data",
              s[0][k].shape, s[1][k].shape, s[2][k].shape, s[3][k].shape)

    path = os.path.join(self.args.checkpoints, setting)
    if not os.path.exists(path):
        os.makedirs(path)

    time_now = time.time()
    train_steps = len(train_loader)
    early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

    model_optim = self._select_optimizer()
    criterion = self._select_criterion()

    if self.args.use_amp:
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(self.args.train_epochs):
        iter_count = 0
        train_loss = []

        self.model.train()
        epoch_time = time.time()
        for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
            iter_count += 1
            model_optim.zero_grad()

            batch_x = batch_x.float().to(self.device)
            batch_y = batch_y.float().to(self.device)
            batch_x_mark = batch_x_mark.float().to(self.device)
            batch_y_mark = batch_y_mark.float().to(self.device)

            # decoder input
            dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).float()
            dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp],
                                dim=1).float().to(self.device)

            # encoder - decoder
            if self.args.use_amp:
                with torch.cuda.amp.autocast():
                    if self.args.output_attention:
                        outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0]
                    else:
                        outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)

                    f_dim = -1 if self.args.features == 'MS' else 0
                    batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)
                    loss = criterion(outputs, batch_y)
                    train_loss.append(loss.item())
            else:
                if self.args.output_attention:
                    outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0]
                else:
                    outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)

                if self.args.inverse:
                    outputs = train_data.inverse_transform(outputs)

                f_dim = -1 if self.args.features == 'MS' else 0
                batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)
                # supervise only the first output channel
                loss = criterion(outputs[:, :, 0], batch_y[:, :, 0])
                train_loss.append(loss.item())

            if (i + 1) % 100 == 0:
                print("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(
                    i + 1, epoch + 1, loss.item()))
                speed = (time.time() - time_now) / iter_count
                left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time))
                iter_count = 0
                time_now = time.time()

            if self.args.use_amp:
                scaler.scale(loss).backward()
                scaler.step(model_optim)
                scaler.update()
            else:
                loss.backward()
                model_optim.step()

        print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time))
        train_loss = np.average(train_loss)
        vali_loss = self.vali(vali_data, vali_loader, criterion)
        test_loss = self.test(setting)

        print("Training Summary", epoch + 1, train_steps, train_loss, vali_loss, test_loss)
        early_stopping(vali_loss, self.model, path)
        if early_stopping.early_stop:
            print("Early stopping")
            break

        adjust_learning_rate(model_optim, epoch + 1, self.args)

    best_model_path = path + '/' + 'checkpoint.pth'
    self.model.load_state_dict(torch.load(best_model_path))
    return self.model
def train(self, setting):
    train_data, train_loader = self._get_data(flag='train')
    vali_data, vali_loader = self._get_data(flag='val')
    test_data, test_loader = self._get_data(flag='test')

    path = os.path.join(self.args.checkpoints, setting)
    if not os.path.exists(path):
        os.makedirs(path)

    time_now = time.time()
    train_steps = len(train_loader)
    early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

    model_optim = self._select_optimizer()
    criterion = self._select_criterion()

    if self.args.use_amp:
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(self.args.train_epochs):
        iter_count = 0
        train_loss = []

        self.model.train()
        epoch_time = time.time()
        for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
            iter_count += 1
            model_optim.zero_grad()

            batch_x = batch_x.float().to(self.device)
            batch_y = batch_y.float()
            batch_x_mark = batch_x_mark.float().to(self.device)
            batch_y_mark = batch_y_mark.float().to(self.device)

            # decoder input
            dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).float()
            dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp],
                                dim=1).float().to(self.device)

            # encoder - decoder
            if self.args.use_amp:
                with torch.cuda.amp.autocast():
                    if self.args.output_attention:
                        outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0]
                    else:
                        outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)

                    f_dim = -1 if self.args.features == 'MS' else 0
                    batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)
                    loss = criterion(outputs, batch_y)
                    train_loss.append(loss.item())
            else:
                if self.args.output_attention:
                    outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0]
                else:
                    outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)

                if self.args.inverse:
                    outputs = train_data.inverse_transform(outputs)

                f_dim = -1 if self.args.features == 'MS' else 0
                batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)
                loss = criterion(outputs, batch_y)
                train_loss.append(loss.item())

            if (i + 1) % 100 == 0:
                print("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(
                    i + 1, epoch + 1, loss.item()))
                speed = (time.time() - time_now) / iter_count
                left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time))
                iter_count = 0
                time_now = time.time()

            if self.args.use_amp:
                scaler.scale(loss).backward()
                scaler.step(model_optim)
                scaler.update()
            else:
                loss.backward()
                model_optim.step()

        print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time))
        train_loss = np.average(train_loss)
        vali_loss = self.vali(vali_data, vali_loader, criterion)
        test_loss = self.vali(test_data, test_loader, criterion)

        print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}"
              .format(epoch + 1, train_steps, train_loss, vali_loss, test_loss))
        early_stopping(vali_loss, self.model, path)
        if early_stopping.early_stop:
            print("Early stopping")
            break

        adjust_learning_rate(model_optim, epoch + 1, self.args)

    best_model_path = path + '/' + 'checkpoint.pth'
    self.model.load_state_dict(torch.load(best_model_path))
    return self.model
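# The adjust_learning_rate(model_optim, epoch, args) called by the train()
# methods above follows the Informer-style schedule; a minimal sketch, assuming
# the common 'type1' rule that halves the learning rate every epoch (the
# 'type2' milestones are likewise an assumption):
def adjust_learning_rate(optimizer, epoch, args):
    if args.lradj == 'type1':
        lr_adjust = {epoch: args.learning_rate * (0.5 ** ((epoch - 1) // 1))}
    elif args.lradj == 'type2':
        # fixed milestones: {epoch: lr} pairs
        lr_adjust = {2: 5e-5, 4: 1e-5, 6: 5e-6, 8: 1e-6, 10: 5e-7, 15: 1e-7, 20: 5e-8}
    else:
        lr_adjust = {}
    if epoch in lr_adjust:
        lr = lr_adjust[epoch]
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        print('Updating learning rate to {}'.format(lr))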
def main():
    lr = 5e-4
    gamma = 0.2
    num_classes = 21
    epoch = 300
    batch_size = 32

    # data_path = '/mnt/storage/project/data/VOCdevkit/VOC2007'
    data_path = '~/datasets/VOC/VOCdevkit/VOC2007'

    # define the data
    data_set = LoadVocDataSets(data_path, 'trainval', AnnotationTransform(),
                               PreProcess())

    # generate the default (prior) boxes
    priors = PriorBox(voc)
    prior_box = priors.forward().cuda()

    # define the network
    ssd = SSD(image_size=300, num_classes=num_classes).cuda()
    ssd.apply(weights_init)
    print(ssd)

    # load a pretrained model
    # ssd.vgg.load_state_dict(torch.load("../premodel/vgg16_reducedfc.pth"))
    # ssd.load_state_dict(torch.load('./ssd_epoches_4524.pth'))

    # define the loss function
    criterion = MultiBoxLoss(num_classes=num_classes, overlap_thresh=0.5,
                             prior_for_matching=True, bkg_label=0,
                             neg_mining=True, neg_pos=3, neg_overlap=0.5,
                             encode_target=False)

    # define the optimizer
    optimizer = optim.SGD(ssd.parameters(), lr=lr, momentum=0.9,
                          weight_decay=5e-4)

    # set the iteration numbers
    epoch_size = len(data_set) // batch_size
    max_iter = epoch_size * epoch
    adjust = 0

    # start iterating
    for iteration in range(max_iter):
        if iteration % epoch_size == 0:
            # create the batch iterator at the start of every epoch
            batch_iter = iter(DataLoader(data_set, batch_size, shuffle=True,
                                         num_workers=6,
                                         collate_fn=detection_collate))
            loc_loss = 0
            conf_loss = 0
            torch.save(ssd.state_dict(),
                       'ssd_epoches_' + repr(iteration) + '.pth')

        # automatically adjust the lr every 50 epochs
        if (iteration / float(epoch_size)) % 50 == 0:
            lr_ = adjust_learning_rate(lr, optimizer, gamma, epoch, adjust,
                                       iteration, epoch_size)
            adjust += 1

        # time the step
        load_t0 = time.time()

        # load a batch
        images, targets = next(batch_iter)
        images = images.cuda()
        targets = [anno.cuda() for anno in targets]

        # forward
        output = ssd.forward(images)

        # backprop
        optimizer.zero_grad()
        loss_l, loss_c = criterion(output, prior_box, targets)  # calculate the loss
        loss = loss_l + loss_c
        loss.backward()
        optimizer.step()

        # accumulate the running losses
        loc_loss += loss_l.item()
        conf_loss += loss_c.item()
        load_t1 = time.time()

        if iteration % 10 == 0:
            print('Epoch:' + repr(iteration // epoch_size)
                  + ' || epochiter: ' + repr(iteration % epoch_size) + '/' + repr(epoch_size)
                  + ' || Total iter ' + repr(iteration)
                  + ' || L: %.4f C: %.4f ||' % (loss_l.item(), loss_c.item())
                  + ' Batch time: %.4f sec. ||' % (load_t1 - load_t0)
                  + ' LR: %.8f' % optimizer.param_groups[0]['lr'])
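# The seven-argument adjust_learning_rate(lr, optimizer, gamma, epoch, adjust,
# iteration, epoch_size) used by the two detection scripts above is defined
# elsewhere; a minimal sketch, assuming plain step decay driven by how many
# times the schedule has already fired (the decay rule itself is an
# assumption, and the extra arguments are accepted only to match the call):
def adjust_learning_rate(lr, optimizer, gamma, epoch, adjust, iteration, epoch_size):
    """Decay the base lr by gamma once per schedule step and apply it."""
    new_lr = lr * (gamma ** adjust)
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
    return new_lr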