def train(train_loader, model, criterion, optimizer, epoch, args):
    model.train()
    total_loss = AverageMeter()
    total_acc = AverageMeter()
    # ------------------------------------------------
    # start iteration over current epoch
    for i, (head_patch, pos, att_gt) in enumerate(train_loader):
        optimizer.zero_grad()
        if args.cuda:
            head_patch = torch.autograd.Variable(head_patch).cuda()
            pos = torch.autograd.Variable(pos).cuda()
            att_gt = torch.autograd.Variable(att_gt).cuda()
        with torch.set_grad_enabled(True):
            # forward and calculate loss
            pred_att = model(head_patch, pos)
            train_loss = criterion(pred_att, att_gt)
            train_loss.backward()
            optimizer.step()
            total_loss.update(train_loss.item(), att_gt.shape[0])
            pred = torch.argmax(pred_att, dim=-1)
            bv = (pred == att_gt.data).float()  # .float() works on CPU and GPU (was torch.cuda.FloatTensor)
            total_acc.update(torch.mean(bv).item(), att_gt.shape[0])
            # assert att_gt.shape[0] == args.batch_size, 'batch size wrong!'
            print('Epoch: {}/{} Iter: {} Training Loss: {:.4f} Training Acc: {:.4f}'
                  .format(epoch, args.epochs - 1, i, train_loss.item(), torch.mean(bv).item()))
        # save tmp checkpoint
        if i % 300 == 0:
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_epoch_acc': [],
                    'avg_epoch_acc': [],
                    'optimizer': optimizer.state_dict(),
                    'args': args
                },
                is_best=False,
                directory=args.resume,
                version='batch_{}'.format(str(i)))
    return total_loss.avg, total_acc.avg
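# `AverageMeter` is used above but not defined in this file; a minimal sketch
# consistent with the .update(val, n) / .avg usage (an assumption, the repo's
# own helper may differ):
class AverageMeter(object):
    """Tracks a running sum/count and exposes the weighted average via .avg."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count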
def train(train_loader, model, criterion, optimizer, epoch, args):
    model.train()
    train_loss_all = list()
    for i, (node_feature, edge_feature, AttMat, gt_label, node_num_rec) in enumerate(train_loader):
        optimizer.zero_grad()
        if args.cuda:
            node_feature = torch.autograd.Variable(node_feature.cuda())
            edge_feature = torch.autograd.Variable(edge_feature.cuda())
            AttMat = torch.autograd.Variable(AttMat.cuda())
            gt_label = torch.autograd.Variable(gt_label.cuda())
            node_num_rec = torch.autograd.Variable(node_num_rec.cuda())
        sigmoid_pred_adj_mat, pred_label = model(node_feature, edge_feature, AttMat, node_num_rec, args)
        # accumulate the loss over every frame, restricted to the valid (non-padded) nodes
        train_loss = 0
        for sq_idx in range(pred_label.size()[1]):
            valid_node_num = node_num_rec[0, sq_idx]
            train_loss += criterion(sigmoid_pred_adj_mat[0, sq_idx, :valid_node_num, :valid_node_num],
                                    AttMat[0, sq_idx, :valid_node_num, :valid_node_num],
                                    pred_label[0, sq_idx, :valid_node_num, :],
                                    gt_label[0, sq_idx, :valid_node_num],
                                    args)
        train_loss_all.append(train_loss.cpu().item())
        # visdom_viz(vis, train_loss_all, win=0, ylabel='training loss over batch', title='HGNN AttMat msg lstm', color='green')
        print('epoch [{}], batch [{}], training loss: {}, lr [{}]'.format(
            epoch, i, train_loss.item(), optimizer.param_groups[0]['lr']))
        train_loss.backward()
        optimizer.step()
        # TODO: why decrease the lr every 300 iterations when it is already decayed
        # every epoch? (a single-scheduler alternative is sketched after this function)
        if i > 0 and i % 300 == 0:
            args.lr *= args.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = args.lr
        if i % 100 == 0:
            utils.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_epoch_error': [],
                'avg_epoch_error': [],
                'optimizer': optimizer.state_dict(),
            }, is_best=False, directory=args.resume, version='batch_{}'.format(str(i)))
    return np.mean(train_loss_all)
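# Sketch: the two manual decay sites (every 300 iterations above, plus every
# epoch in the corresponding main()) can be collapsed into one scheduler.
# StepLR is standard torch.optim.lr_scheduler API; step_size=300 mirrors the
# in-loop decay, and this avoids mutating args.lr by hand.
from torch.optim.lr_scheduler import StepLR

def make_iter_scheduler(optimizer, lr_decay):
    # call scheduler.step() once per batch, after optimizer.step()
    return StepLR(optimizer, step_size=300, gamma=lr_decay)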
def main(args):
    args.cuda = args.use_cuda and torch.cuda.is_available()
    train_set, validate_set, test_set, train_loader, validate_loader, test_loader = get_data.get_data_atomic(args)
    # model = models.Atomic(args)
    model = models.Atomic_edge_only(args)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
    # class indices: {'single': 0, 'mutual': 1, 'avert': 2, 'refer': 3, 'follow': 4, 'share': 5}
    criterion = [
        torch.nn.CrossEntropyLoss(weight=torch.Tensor([0.05, 0.05, 0.25, 0.25, 0.25, 0.15])),
        torch.nn.MSELoss()
    ]
    # {'NA': 0, 'single': 1, 'mutual': 2, 'avert': 3, 'refer': 4, 'follow': 5, 'share': 6}
    scheduler = ReduceLROnPlateau(optimizer, factor=args.lr_decay, patience=1, verbose=True, mode='max')
    # ------------------------
    # use multi-gpu
    if args.cuda and torch.cuda.device_count() > 1:
        print("Now Using ", len(args.device_ids), " GPUs!")
        model = torch.nn.DataParallel(model,
                                      device_ids=args.device_ids,
                                      output_device=args.device_ids[0]).cuda()
        # model = model.cuda()
        criterion[0] = criterion[0].cuda()
        criterion[1] = criterion[1].cuda()
    elif args.cuda:
        model = model.cuda()
        criterion[0] = criterion[0].cuda()
        criterion[1] = criterion[1].cuda()

    if args.load_best_checkpoint:
        loaded_checkpoint = utils.load_best_checkpoint(args, model, optimizer, path=args.resume)
        if loaded_checkpoint:
            args, best_epoch_acc, avg_epoch_acc, model, optimizer = loaded_checkpoint
    if args.load_last_checkpoint:
        loaded_checkpoint = utils.load_last_checkpoint(args, model, optimizer,
                                                       path=args.resume,
                                                       version=args.model_load_version)
        if loaded_checkpoint:
            args, best_epoch_acc, avg_epoch_acc, model, optimizer = loaded_checkpoint

    # ------------------------------------------------------------------------------
    # Start Training!
    since = time.time()
    train_epoch_acc_all = []
    val_epoch_acc_all = []
    best_acc = 0
    avg_epoch_acc = 0
    for epoch in range(args.start_epoch, args.epochs):
        train_epoch_loss, train_epoch_acc = train(train_loader, model, criterion, optimizer, epoch, args)
        train_epoch_acc_all.append(train_epoch_acc)
        val_epoch_loss, val_epoch_acc = validate(validate_loader, model, criterion, epoch, args)
        val_epoch_acc_all.append(val_epoch_acc)
        print('Epoch {}/{} Training Acc: {:.4f} Validation Acc: {:.4f}'.format(
            epoch, args.epochs - 1, train_epoch_acc, val_epoch_acc))
        print('*' * 15)
        scheduler.step(val_epoch_acc)
        is_best = val_epoch_acc > best_acc
        if is_best:
            best_acc = val_epoch_acc
        avg_epoch_acc = np.mean(val_epoch_acc_all)
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_epoch_acc': best_acc,
                'avg_epoch_acc': avg_epoch_acc,
                'optimizer': optimizer.state_dict(),
                'args': args
            },
            is_best=is_best,
            directory=args.resume,
            version='epoch_{}'.format(str(epoch)))
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best Val Acc: {}, Final Avg Val Acc: {}'.format(best_acc, avg_epoch_acc))

    # ----------------------------------------------------------------------------------------------------------
    # test
    loaded_checkpoint = utils.load_best_checkpoint(args, model, optimizer, path=args.resume)
    if loaded_checkpoint:
        args, best_epoch_acc, avg_epoch_acc, model, optimizer = loaded_checkpoint
    test_loader.dataset.round_cnt = {'single': 0, 'mutual': 0, 'avert': 0, 'refer': 0, 'follow': 0, 'share': 0}
    test_loss, test_acc, confmat, top2_acc = test(test_loader, model, criterion, args)
    # save test results
    if not isdir(args.save_test_res):
        os.mkdir(args.save_test_res)
    # pickle needs a binary file handle in Python 3 (was 'w')
    with open(os.path.join(args.save_test_res, 'raw_test_results.pkl'), 'wb') as f:
        pickle.dump([test_loss, test_acc, confmat, top2_acc], f)
    print("Test Acc {}".format(test_acc))
    print("Top 2 Test Acc {}".format(top2_acc))
    # todo: need to change the mode here!
    get_metric_from_confmat(confmat, 'atomic')
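# `utils.save_checkpoint` is called throughout but not shown here; a minimal
# sketch consistent with the save_checkpoint(state, is_best, directory, version)
# calls above (an assumption, the repo's utils module may differ):
import os
import shutil
import torch

def save_checkpoint(state, is_best, directory, version=''):
    if not os.path.isdir(directory):
        os.makedirs(directory)
    ckpt_path = os.path.join(directory, 'checkpoint_{}.pth'.format(version))
    torch.save(state, ckpt_path)
    if is_best:
        # keep a separate copy of the best-so-far model
        shutil.copyfile(ckpt_path, os.path.join(directory, 'model_best.pth'))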
def train(train_loader, model, criterion, optimizer, epoch, args):
    model.train()
    total_loss = AverageMeter()
    total_acc = AverageMeter()
    # todo: check the round cnt is correct!
    train_loader.dataset.round_cnt = {'single': 0, 'mutual': 0, 'avert': 0, 'refer': 0, 'follow': 0, 'share': 0}
    # print(train_loader.dataset.round_cnt)
    # ------------------------------------------------
    # start iteration over current epoch
    for i, (head_batch, pos_batch, attmat_batch, atomic_label_batch) in enumerate(train_loader):
        batch_size = head_batch.shape[0]
        # assert batch_size == args.batch_size, 'wrong batch size! {} {}'.format(batch_size, args.batch_size)
        optimizer.zero_grad()
        if args.cuda:
            heads = torch.autograd.Variable(head_batch).cuda()
            poses = torch.autograd.Variable(pos_batch).cuda()
            attmat_gt = torch.autograd.Variable(attmat_batch).cuda()
            atomic_gt = torch.autograd.Variable(atomic_label_batch).cuda()
            # ID_batch = torch.autograd.Variable(ID_batch).cuda()
        with torch.set_grad_enabled(True):
            # forward and calculate loss
            pred_atomic = model(heads, poses, attmat_gt)  # [N, 6]
            train_loss = 0
            for bid in range(batch_size):
                # todo: check pred_atomic dim, [N, 6, 1, 1, 1]??
                tmp_loss = criterion[0](pred_atomic[bid, :].unsqueeze(0), atomic_gt[bid].unsqueeze(0))
                # print('label loss', criterion[0](sl_pred[nid][bid, :].unsqueeze(0), sl_gt[bid, nid].unsqueeze(0)))
                # print('attmat loss', criterion[1](attmat_pred, attmat_gt))
                total_loss.update(tmp_loss.item(), 1)
                train_loss = train_loss + tmp_loss
                pred = torch.argmax(pred_atomic[bid, :], dim=0)
                bv = (pred == atomic_gt[bid].data).float()
                total_acc.update(bv.item(), 1)
            train_loss.backward()
            # plot_grad_flow(model.named_parameters())
            optimizer.step()
            print('Epoch: {}/{} Iter: {} Training Loss: {:.4f} Total Avg Acc: {:.4f}'
                  .format(epoch, args.epochs - 1, i, train_loss.item(), total_acc.avg))
        # save tmp checkpoint
        if i % 300 == 0:
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_epoch_acc': [],
                    'avg_epoch_acc': [],
                    'optimizer': optimizer.state_dict(),
                    'args': args
                },
                is_best=False,
                directory=args.resume,
                version='batch_{}'.format(str(i)))
    # print(train_loader.dataset.round_cnt)
    return total_loss.avg, total_acc.avg
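# Pitfall in the per-sample loop above: with class weights (as set in main())
# and the default reduction='mean', a batch-of-one CrossEntropyLoss call divides
# by that sample's own weight, so the weights cancel and the summed loop is an
# unweighted loss. A single batched call keeps the weighting. Runnable sketch:
import torch

w = torch.tensor([0.05, 0.05, 0.25, 0.25, 0.25, 0.15])
ce = torch.nn.CrossEntropyLoss(weight=w)  # default reduction='mean'
logits = torch.randn(4, 6)
labels = torch.tensor([0, 2, 5, 1])
loop_sum = sum(ce(logits[b].unsqueeze(0), labels[b].unsqueeze(0)) for b in range(4))
batched = ce(logits, labels)
# loop_sum sums *unweighted* per-sample losses; batched is the weighted mean.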
def main(args):
    args.cuda = args.use_cuda and torch.cuda.is_available()
    train_set, validate_set, test_set, train_loader, validate_loader, test_loader = get_data.get_data_attmat(args)
    model = models.AttMat(args)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = torch.nn.CrossEntropyLoss()
    scheduler = ReduceLROnPlateau(optimizer, factor=args.lr_decay, patience=1, verbose=True, mode='max')
    # ------------------------
    # use multi-gpu
    if args.cuda and torch.cuda.device_count() > 1:
        print("Now Using ", len(args.device_ids), " GPUs!")
        model = torch.nn.DataParallel(model,
                                      device_ids=args.device_ids,
                                      output_device=args.device_ids[0]).cuda()
        criterion = criterion.cuda()
    elif args.cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    if args.load_best_checkpoint:
        loaded_checkpoint = utils.load_best_checkpoint(args, model, optimizer, path=args.resume)
        if loaded_checkpoint:
            args, best_epoch_error, avg_epoch_error, model, optimizer = loaded_checkpoint
    if args.load_last_checkpoint:
        loaded_checkpoint = utils.load_last_checkpoint(args, model, optimizer,
                                                       path=args.resume,
                                                       version=args.model_load_version)
        if loaded_checkpoint:
            args, best_epoch_error, avg_epoch_error, model, optimizer = loaded_checkpoint

    # ------------------------------------------------------------------------------
    # Start Training!
    since = time.time()
    train_epoch_acc_all = []
    val_epoch_acc_all = []
    best_acc = 0
    avg_epoch_acc = 0
    for epoch in range(args.start_epoch, args.epochs):
        train_epoch_loss, train_epoch_acc = train(train_loader, model, criterion, optimizer, epoch, args)
        train_epoch_acc_all.append(train_epoch_acc)
        val_epoch_loss, val_epoch_acc = validate(validate_loader, model, criterion, epoch, args)
        val_epoch_acc_all.append(val_epoch_acc)
        print('Epoch {}/{} Training Acc: {:.4f} Validation Acc: {:.4f}'.format(
            epoch, args.epochs - 1, train_epoch_acc, val_epoch_acc))
        print('*' * 15)
        scheduler.step(val_epoch_acc)
        is_best = val_epoch_acc > best_acc
        if is_best:
            best_acc = val_epoch_acc
        avg_epoch_acc = np.mean(val_epoch_acc_all)
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_epoch_acc': best_acc,
                'avg_epoch_acc': avg_epoch_acc,
                'optimizer': optimizer.state_dict(),
                'args': args
            },
            is_best=is_best,
            directory=args.resume,
            version='epoch_{}'.format(str(epoch)))
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best Val Acc: {}, Final Avg Val Acc: {}'.format(best_acc, avg_epoch_acc))

    # ----------------------------------------------------------------------------------------------------------
    # test
    loaded_checkpoint = utils.load_best_checkpoint(args, model, optimizer, path=args.resume)
    if loaded_checkpoint:
        args, best_epoch_acc, avg_epoch_acc, model, optimizer = loaded_checkpoint
    test_loss, test_acc, one_acc, zero_acc = test(test_loader, model, criterion, args)
    print("Test Acc {}, One Acc {}, Zero Acc {}".format(test_acc, one_acc, zero_acc))
    # save test results
    if not isdir(args.save_test_res):
        os.mkdir(args.save_test_res)
    # pickle needs a binary file handle in Python 3 (was 'w')
    with open(os.path.join(args.save_test_res, 'raw_test_results.pkl'), 'wb') as f:
        pickle.dump([test_loss, test_acc, one_acc, zero_acc], f)
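# `utils.load_best_checkpoint` is not shown; a minimal sketch consistent with
# how its return value is unpacked above (an assumption; note the stored keys
# vary between scripts, e.g. 'best_epoch_acc' vs 'best_epoch_error'):
import os
import torch

def load_best_checkpoint(args, model, optimizer, path):
    best_path = os.path.join(path, 'model_best.pth')
    if not os.path.isfile(best_path):
        return None
    checkpoint = torch.load(best_path)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return (checkpoint.get('args', args),
            checkpoint.get('best_epoch_acc'),
            checkpoint.get('avg_epoch_acc'),
            model, optimizer)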
def main(args):
    args.cuda = args.use_cuda and torch.cuda.is_available()
    train_set, validate_set, test_set, train_loader, validate_loader, test_loader = \
        get_data.get_data_AttMat_msg_lstm(args)
    model_args = {'roi_feature_size': args.roi_feature_size,
                  'edge_feature_size': args.roi_feature_size,
                  'node_feature_size': args.roi_feature_size,
                  'message_size': args.message_size,
                  'link_hidden_size': args.link_hidden_size,
                  'link_hidden_layers': args.link_hidden_layers,
                  'propagate_layers': args.propagate_layers,
                  'big_attr_classes': args.big_attr_class_num,
                  'lstm_hidden_size': args.lstm_hidden_size}
    model = models.AttMat_msg_lstm(model_args, args)
    # TODO: check the grads and then set the learning rate for Adam
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # NOTE: `criterion` is passed to train()/validate() below but is never defined
    # in this function; it must be constructed here before training starts
    # (the commented line below was one attempt).
    # criterion = torch.nn.CrossEntropyLoss()
    if args.cuda:
        model = model.cuda()
        # criterion = criterion.cuda()

    if args.load_best_checkpoint:
        loaded_checkpoint = utils.load_best_checkpoint(args, model, optimizer, path=args.resume)
        if loaded_checkpoint:
            args, best_epoch_error, avg_epoch_error, model, optimizer = loaded_checkpoint
    if args.load_last_checkpoint:
        loaded_checkpoint = utils.load_last_checkpoint(args, model, optimizer,
                                                       path=args.resume,
                                                       version=args.model_load_version)
        if loaded_checkpoint:
            args, best_epoch_error, avg_epoch_error, model, optimizer = loaded_checkpoint

    train_error_history = list()
    train_loss_history = list()
    val_error_history = list()
    val_loss_history = list()
    best_epoch_error = np.inf
    for epoch in range(args.start_epoch, args.epochs):
        train_error_rate_cur_epoch, train_loss_cur_epoch = train(train_loader, model, criterion, optimizer, epoch, args)
        train_error_history.append(train_error_rate_cur_epoch)
        train_loss_history.append(train_loss_cur_epoch)
        val_error_rate_cur_epoch, val_loss_cur_epoch = validate(validate_loader, model, criterion, args)
        val_error_history.append(val_error_rate_cur_epoch)
        val_loss_history.append(val_loss_cur_epoch)
        # TODO: there is no need to decay the lr for Adam every epoch; a scheduler
        # alternative is sketched after this function
        if epoch > 0 and epoch % 1 == 0:
            args.lr *= args.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = args.lr
        is_best = val_error_rate_cur_epoch < best_epoch_error
        best_epoch_error = min(val_error_rate_cur_epoch, best_epoch_error)
        avg_epoch_error = np.mean(val_error_history)
        utils.save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_epoch_error': best_epoch_error,
            'avg_epoch_error': avg_epoch_error,
            'optimizer': optimizer.state_dict(),
        }, is_best=is_best, directory=args.resume, version='epoch_{}'.format(str(epoch)))
        print('best_epoch_error: {}, avg_epoch_error: {}'.format(best_epoch_error, avg_epoch_error))

    # test
    # loaded_checkpoint = utils.load_best_checkpoint(args, model, optimizer, path=args.resume)
    loaded_checkpoint = utils.load_last_checkpoint(args, model, optimizer,
                                                   path=args.resume,
                                                   version=args.model_load_version)
    if loaded_checkpoint:
        args, best_epoch_error, avg_epoch_error, model, optimizer = loaded_checkpoint
    test(test_loader, model, args)
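# Sketch of the per-epoch decay above as a scheduler: ExponentialLR (standard
# torch.optim.lr_scheduler API) multiplies the lr by gamma on every step(),
# which matches args.lr *= args.lr_decay once per epoch without mutating args.lr.
from torch.optim.lr_scheduler import ExponentialLR

def make_epoch_scheduler(optimizer, lr_decay):
    # call scheduler.step() once at the end of each epoch
    return ExponentialLR(optimizer, gamma=lr_decay)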
def train(train_loader, model, criterion, optimizer, epoch, args):
    model.train()
    total_loss = AverageMeter()
    total_acc = AverageMeter()
    # todo: check the round cnt is correct!
    train_loader.dataset.round_cnt = {'SingleGaze': 0, 'GazeFollow': 0, 'AvertGaze': 0, 'MutualGaze': 0, 'JointAtt': 0}
    # print(train_loader.dataset.round_cnt)
    # ------------------------------------------------
    # start iteration over current epoch
    for i, (patches, poses, sl_gt, num_rec, attmat_gt) in enumerate(train_loader):
        # sl_gt [N, 10, max_node_num]
        batch_size = sl_gt.shape[0]
        # assert batch_size == args.batch_size, 'wrong batch size! {} {}'.format(batch_size, args.batch_size)
        optimizer.zero_grad()
        if args.cuda:
            patches = torch.autograd.Variable(patches).cuda()
            poses = torch.autograd.Variable(poses).cuda()
            sl_gt = torch.autograd.Variable(sl_gt).cuda()
            num_rec = torch.autograd.Variable(num_rec).cuda()
            attmat_gt = torch.autograd.Variable(attmat_gt).cuda()
        with torch.set_grad_enabled(True):
            # forward and calculate loss
            # sl_pred [N, 10, 6, 7], attmat_pred [N, 10, 6, 6]
            sl_pred, sl_pred0 = model(patches, poses, num_rec, attmat_gt)
            train_loss = 0
            for bid in range(batch_size):
                for sq_id in range(10):
                    valid_node_num = num_rec[bid, sq_id]
                    for nid in range(valid_node_num):
                        tmp_loss = criterion[0](sl_pred[bid, sq_id, nid, :].unsqueeze(0), sl_gt[bid, sq_id, nid].unsqueeze(0)) \
                                   + criterion[0](sl_pred0[bid, sq_id, nid, :].unsqueeze(0), sl_gt[bid, sq_id, nid].unsqueeze(0))
                        # print('label loss', criterion[0](sl_pred[nid][bid, :].unsqueeze(0), sl_gt[bid, nid].unsqueeze(0)))
                        # print('attmat loss', criterion[1](attmat_pred, attmat_gt))
                        total_loss.update(tmp_loss.item(), 1)
                        train_loss = train_loss + tmp_loss
                        pred = torch.argmax(sl_pred[bid, sq_id, nid, :], dim=-1)
                        bv = (pred == sl_gt[bid, sq_id, nid].data).float()
                        total_acc.update(bv.item(), 1)
                        # auc = utils.MAUC(sl_gt[bid, nid].data, sl_pred[nid][bid, :], 7)
                        # total_auc.update(auc.item(), 1)
                        # assert len(sl_pred[bid, nid, :]) == 7, "wrong class number!"
            train_loss.backward()
            # plot_grad_flow(model.named_parameters())
            optimizer.step()
            print('Epoch: {}/{} Iter: {} Training Loss: {:.4f} Total Avg Acc: {:.4f}'
                  .format(epoch, args.epochs - 1, i, train_loss.item(), total_acc.avg))
        # save tmp checkpoint
        if i % 300 == 0:
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_epoch_acc': [],
                    'avg_epoch_acc': [],
                    'optimizer': optimizer.state_dict(),
                    'args': args
                },
                is_best=False,
                directory=args.resume,
                version='batch_{}'.format(str(i)))
    # print(train_loader.dataset.round_cnt)
    return total_loss.avg, total_acc.avg
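# The triple loop above scores one node at a time. A masked, batched sketch
# (assuming sl_pred is [N, 10, max_nodes, 7], sl_gt is [N, 10, max_nodes], and
# num_rec is [N, 10]); with an unweighted criterion and reduction='sum' this
# matches the summed loop for one prediction head (call it once per head):
import torch

def masked_node_loss(sl_pred, sl_gt, num_rec, ce):
    N, T, max_nodes, C = sl_pred.shape
    # mask[b, t, n] is True for real (non-padded) nodes
    node_idx = torch.arange(max_nodes, device=sl_pred.device)
    mask = node_idx.view(1, 1, -1) < num_rec.unsqueeze(-1)  # [N, T, max_nodes]
    logits = sl_pred[mask]    # [num_valid, C]
    targets = sl_gt[mask]     # [num_valid]
    return ce(logits, targets)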
def main(args):
    args.cuda = args.use_cuda and torch.cuda.is_available()
    train_set, validate_set, test_set, train_loader, validate_loader, test_loader = \
        get_data.get_data_resnet_msgpassing_balanced_lstm(args)
    model_args = {'roi_feature_size': args.roi_feature_size,
                  'edge_feature_size': args.roi_feature_size,
                  'node_feature_size': args.roi_feature_size,
                  'message_size': args.message_size,
                  'link_hidden_size': args.link_hidden_size,
                  'link_hidden_layers': args.link_hidden_layers,
                  'propagate_layers': args.propagate_layers,
                  'big_attr_classes': args.big_attr_class_num,
                  'lstm_hidden_size': args.lstm_hidden_size}
    model = models.HGNN_resnet_msgpassing_balanced_lstm(model_args)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = torch.nn.CrossEntropyLoss()
    if args.cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    if args.load_best_checkpoint:
        loaded_checkpoint = utils.load_best_checkpoint(args, model, optimizer, path=args.resume)
        if loaded_checkpoint:
            args, best_epoch_error, avg_epoch_error, model, optimizer = loaded_checkpoint
    if args.load_last_checkpoint:
        loaded_checkpoint = utils.load_last_checkpoint(args, model, optimizer, path=args.resume)
        if loaded_checkpoint:
            args, best_epoch_error, avg_epoch_error, model, optimizer = loaded_checkpoint

    train_error_history = list()
    train_loss_history = list()
    val_error_history = list()
    val_loss_history = list()
    best_epoch_error = np.inf
    for epoch in range(args.start_epoch, args.epochs):
        train_error_rate_cur_epoch, train_loss_cur_epoch = train(train_loader, model, criterion, optimizer, epoch, args)
        train_error_history.append(train_error_rate_cur_epoch)
        train_loss_history.append(train_loss_cur_epoch)
        val_error_rate_cur_epoch, val_loss_cur_epoch = validate(validate_loader, model, criterion, args)
        val_error_history.append(val_error_rate_cur_epoch)
        val_loss_history.append(val_loss_cur_epoch)
        if epoch > 0 and epoch % 1 == 0:
            args.lr *= args.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = args.lr
        is_best = val_error_rate_cur_epoch < best_epoch_error
        best_epoch_error = min(val_error_rate_cur_epoch, best_epoch_error)
        avg_epoch_error = np.mean(val_error_history)
        utils.save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_epoch_error': best_epoch_error,
            'avg_epoch_error': avg_epoch_error,
            'optimizer': optimizer.state_dict(),
        }, is_best=is_best, directory=args.resume)
        print('best_epoch_error: {}, avg_epoch_error: {}'.format(best_epoch_error, avg_epoch_error))

    # test
    # loaded_checkpoint = utils.load_best_checkpoint(args, model, optimizer, path=args.resume)
    loaded_checkpoint = utils.load_last_checkpoint(args, model, optimizer, path=args.resume)
    if loaded_checkpoint:
        args, best_epoch_error, avg_epoch_error, model, optimizer = loaded_checkpoint
    test(test_loader, model, args)
def train(train_loader, model, criterion, optimizer, epoch, args):
    model.train()
    train_loss_all = list()
    train_error_rate_all = list()
    for i, (node_feature, edge_feature, gt_label, node_num_rec) in enumerate(train_loader):
        optimizer.zero_grad()
        if args.cuda:
            node_feature = torch.autograd.Variable(node_feature.cuda())
            edge_feature = torch.autograd.Variable(edge_feature.cuda())
            gt_label = torch.autograd.Variable(gt_label.cuda())
            node_num_rec = torch.autograd.Variable(node_num_rec.cuda())
        pred_label = model(node_feature, edge_feature, node_num_rec, args)
        # gather predictions and labels for the valid (non-padded) nodes of every frame
        for sq_idx in range(pred_label.size()[1]):
            valid_node_num = node_num_rec[0, sq_idx]
            if sq_idx == 0:
                pred_label_all = pred_label[0, sq_idx, :valid_node_num, :]
                gt_label_all = gt_label[0, sq_idx, :valid_node_num]
            else:
                pred_label_all = torch.cat((pred_label_all, pred_label[0, sq_idx, :valid_node_num, :]), dim=0)
                gt_label_all = torch.cat((gt_label_all, gt_label[0, sq_idx, :valid_node_num]), dim=0)
        # todo: should this evaluate pred_label_all/gt_label_all so padded nodes are excluded?
        error_rate = evaluation(pred_label.unsqueeze(0), gt_label.unsqueeze(0))
        train_error_rate_all.append(error_rate)
        train_loss = criterion(pred_label_all, gt_label_all)
        train_loss_all.append(train_loss.data.cpu().numpy().item())
        visdom_viz(vis, train_loss_all, win=0, ylabel='training loss over batch',
                   title='HGNN Resnet Msgpassing balanced lstm', color='green')
        print('epoch [{}], batch [{}], training loss: {}, training error rate: {}, lr [{}]'.format(
            epoch, i, train_loss.item(), error_rate, optimizer.param_groups[0]['lr']))
        train_loss.backward()
        optimizer.step()
        if i > 0 and i % 300 == 0:
            args.lr *= args.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = args.lr
        if i % 200 == 0:
            utils.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_epoch_error': [],
                'avg_epoch_error': [],
                'optimizer': optimizer.state_dict(),
            }, is_best=False, directory=args.resume)
        del node_feature, edge_feature, gt_label, node_num_rec
    return np.mean(train_error_rate_all), np.mean(train_loss_all)
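# `evaluation` is not defined in this file; a minimal sketch of one plausible
# implementation (an assumption: error rate as the fraction of argmax
# mismatches over all nodes passed in):
import torch

def evaluation(pred_label, gt_label):
    pred = torch.argmax(pred_label, dim=-1)
    return (pred != gt_label).float().mean().item()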
def main(args):
    args.cuda = args.use_cuda and torch.cuda.is_available()
    train_set, validate_set, test_set, train_loader, validate_loader, test_loader = get_data.get_data_resnet_fc(args)
    model = models.HGNN_resnet_fc()
    # TODO: try the step policy for Adam; also consider the step interval
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # mode must match the monitored metric passed to scheduler.step() below:
    # 'max' for acc/auc, 'min' for loss
    scheduler = ReduceLROnPlateau(optimizer, factor=args.lr_decay, patience=1, verbose=True, mode='min')
    # TODO: double-check this loss against the output of the network
    criterion = torch.nn.CrossEntropyLoss()
    # criterion = torch.nn.MSELoss()
    # ------------------------
    # use multi-gpu
    if args.cuda and torch.cuda.device_count() > 1:
        print("Now Using ", len(args.device_ids), " GPUs!")
        # model = model.to(device_ids[0])
        model = torch.nn.DataParallel(model,
                                      device_ids=args.device_ids,
                                      output_device=args.device_ids[0]).cuda()
        criterion = criterion.cuda()
    elif args.cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    if args.load_best_checkpoint:
        loaded_checkpoint = utils.load_best_checkpoint(args, model, optimizer, path=args.resume)
        if loaded_checkpoint:
            args, best_epoch_error, avg_epoch_error, model, optimizer = loaded_checkpoint
    if args.load_last_checkpoint:
        loaded_checkpoint = utils.load_last_checkpoint(args, model, optimizer,
                                                       path=args.resume,
                                                       version=args.model_load_version)
        if loaded_checkpoint:
            args, best_epoch_error, avg_epoch_error, model, optimizer = loaded_checkpoint
    # for param_group in optimizer.param_groups:
    #     param_group['lr'] = args.lr

    # ------------------------------------------------------------------------------
    # Train
    since = time.time()
    train_epoch_loss_all = []
    val_epoch_loss_all = []
    best_loss = np.inf
    avg_epoch_loss = np.inf
    for epoch in range(args.start_epoch, args.epochs):
        train_epoch_loss = train(train_loader, model, criterion, optimizer, epoch, args)
        train_epoch_loss_all.append(train_epoch_loss)
        # visdom_viz(vis, train_epoch_loss_all, win=0, ylabel='Training Epoch Loss', title=args.project_name, color='green')
        val_epoch_loss = validate(validate_loader, model, criterion, epoch, args)
        val_epoch_loss_all.append(val_epoch_loss)
        # visdom_viz(vis, val_epoch_loss_all, win=1, ylabel='Validation Epoch Loss', title=args.project_name, color='blue')
        print('Epoch {}/{} Training Loss: {:.4f} Validation Loss: {:.4f}'.format(
            epoch, args.epochs - 1, train_epoch_loss, val_epoch_loss))
        print('*' * 15)
        # reduce the lr when the validation metric (e.g. auc, loss) stops improving
        scheduler.step(val_epoch_loss)
        is_best = val_epoch_loss < best_loss
        if is_best:
            best_loss = val_epoch_loss
        avg_epoch_loss = np.mean(val_epoch_loss_all)
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_epoch_error': best_loss,
                'avg_epoch_error': avg_epoch_loss,
                'optimizer': optimizer.state_dict()
            },
            is_best=is_best,
            directory=args.resume,
            version='epoch_{}'.format(str(epoch)))
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best Val Loss: {}, Final Avg Val Loss: {}'.format(best_loss, avg_epoch_loss))

    # -------------------------------------------------------------------------------------------------------------
    # test
    loaded_checkpoint = utils.load_best_checkpoint(args, model, optimizer, path=args.resume)
    if loaded_checkpoint:
        args, best_epoch_error, avg_epoch_error, model, optimizer = loaded_checkpoint
    pred_label, gt_label, test_loss = test(test_loader, model, criterion, args)
    print("Test Epoch Loss {}".format(test_loss))
    # save test results
    if not isdir(args.save_test_res):
        os.mkdir(args.save_test_res)
    # pickle needs a binary file handle in Python 3 (was 'w')
    with open(os.path.join(args.save_test_res, 'raw_test_results.pkl'), 'wb') as f:
        pickle.dump([pred_label, gt_label, test_loss], f)
    # todo: check get_test_metric
    recall, precision, F_one_score, acc, avg_acc, ConfMat = get_test_metric(pred_label, gt_label, args)
    print('[====Test results Small Attr====] \n recall: {} \n precision: {} \n F1 score: {} \n'
          ' acc: {} \n avg acc: {} \n Confusion Matrix: \n {}'.format(
              recall, precision, F_one_score, acc, avg_acc, ConfMat))
def train(train_loader, model, criterion, optimizer, epoch, args):
    model.train()
    # train_error_rate_all = list()
    running_loss = 0
    node_cnt = 0
    loss_on = []
    # iterate over training data
    # node_feature (node_num, 3, 224, 224), gt_label (44, 6)
    for i, (node_feature, gt_label) in enumerate(train_loader):
        if args.cuda:
            node_feature = torch.autograd.Variable(node_feature.cuda(args.device_ids[0]))
            gt_label = torch.autograd.Variable(gt_label.cuda(args.device_ids[0]))
        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            # forward and calculate loss
            pred_label = model(node_feature)
            # print("Outside: input_size", node_feature.size(), "output_size", pred_label.size())
            # error_rate = evaluation(pred_label.unsqueeze(0), gt_label.unsqueeze(0))
            # train_error_rate_all.append(error_rate)
            # todo: check the dimension change, see if it's valid or not
            train_loss = criterion(pred_label, torch.argmax(gt_label, dim=-1))
            train_loss.backward()
            optimizer.step()
            loss_on.append(train_loss.item())
            print('Epoch: {}/{} Iter: {} Training Loss: {:.4f}'.format(
                epoch, args.epochs, i, train_loss.item()))
            # if args.visdom:
            #     visdom_viz(vis, loss_on, win=2, ylabel='Train Online Loss', title=args.project_name, color='pink')
            # train_loss = criterion(pred_label.view(-1, 6), gt_label)
            running_loss += train_loss.item() * pred_label.shape[0]
            node_cnt += pred_label.shape[0]
        # save tmp checkpoint
        if i % 300 == 0:
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_epoch_error': [],
                    'avg_epoch_error': [],
                    'optimizer': optimizer.state_dict()
                },
                is_best=False,
                directory=args.resume,
                version='batch_{}'.format(str(i)))
        # Optionally decay the lr during iterations rather than once per epoch; the
        # commented block below uses the 'poly' policy commonly applied per iteration
        # (a sketch of adjust_learning_rate follows this function).
        # todo: lr update criterion
        # if True:
        #     iters_per_epoch = len(train_loader)
        #     lr = adjust_learning_rate(optimizer, epoch, i, iters_per_epoch)
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr
    epoch_loss = running_loss / node_cnt
    return epoch_loss
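# Sketch of the 'poly' policy referenced in the commented block above, assuming
# the usual form lr = base_lr * (1 - iter/max_iter) ** power; the repo's actual
# adjust_learning_rate may differ, and base_lr/total_epochs/power are placeholders:
def adjust_learning_rate(optimizer, epoch, i, iters_per_epoch,
                         base_lr=1e-4, total_epochs=30, power=0.9):
    cur_iter = epoch * iters_per_epoch + i
    max_iter = total_epochs * iters_per_epoch
    lr = base_lr * (1 - cur_iter / max_iter) ** power
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr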