def __init__(self, option, model, train_dataset, valid_dataset, test_dataset=None, weight=None, tasks_num=17):
    """Set up trainer state: device, model placement, dataloaders, loss, optimizer.

    :param option: dict of hyper-parameters; keys used here: 'gpu', 'parallel',
        'batch_size', 'exp_path', 'focalloss', 'lr', 'weight_decay',
        'lr_scheduler_patience'.
    :param model: network to train.
    :param train_dataset: training graphs.
    :param valid_dataset: validation graphs.
    :param test_dataset: optional test graphs; may be ``None``.
    :param weight: per-task class weights, one ``[w_neg, w_pos]`` pair per task.
        Defaults to ``[[1.0, 1.0]]``; declared as ``None`` to avoid a shared
        mutable default argument.
    :param tasks_num: number of prediction tasks.
    """
    # Most important variable
    self.option = option
    if weight is None:
        # FIX: was a mutable default argument (weight=[[1.0, 1.0]])
        weight = [[1.0, 1.0]]
    self.device = torch.device("cuda:{}".format(option['gpu'][0]) if torch.cuda.is_available() else "cpu")
    self.model = DataParallel(model).to(self.device) if option['parallel'] else model.to(self.device)

    # Setting the train valid and test data loader.
    # DataListLoader yields plain lists of graphs, which is what DataParallel expects.
    if self.option['parallel']:
        self.train_dataloader = DataListLoader(train_dataset, batch_size=self.option['batch_size'], shuffle=True)
        self.valid_dataloader = DataListLoader(valid_dataset, batch_size=self.option['batch_size'])
        if test_dataset:
            self.test_dataloader = DataListLoader(test_dataset, batch_size=self.option['batch_size'])
    else:
        self.train_dataloader = DataLoader(train_dataset, batch_size=self.option['batch_size'], shuffle=True)
        self.valid_dataloader = DataLoader(valid_dataset, batch_size=self.option['batch_size'])
        if test_dataset:
            self.test_dataloader = DataLoader(test_dataset, batch_size=self.option['batch_size'])
    self.save_path = self.option['exp_path']

    # Loss: one criterion per task; FocalLoss alpha derived from the negative-class weight.
    if option['focalloss']:
        self.log('Using FocalLoss')
        self.criterion = [FocalLoss(alpha=1 / w[0]) for w in weight]  # alpha 0.965
    else:
        self.criterion = [torch.nn.CrossEntropyLoss(torch.Tensor(w).to(self.device), reduction='mean')
                          for w in weight]

    # Setting the Adam optimizer with hyper-param
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.option['lr'],
                                      weight_decay=option['weight_decay'])
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer, mode='min', factor=0.7,
        patience=self.option['lr_scheduler_patience'], min_lr=1e-6)

    # other bookkeeping
    self.start = time.time()
    self.tasks_num = tasks_num
    self.records = {'trn_record': [], 'val_record': [], 'val_losses': [],
                    'best_ckpt': None, 'val_roc': [], 'val_prc': []}
    self.log(msgs=['\t{}:{}\n'.format(k, v) for k, v in self.option.items()], show=False)
    # BUG FIX: len(None) raised TypeError when test_dataset was omitted
    self.log('train set num:{} valid set num:{} test set num: {}'.format(
        len(train_dataset), len(valid_dataset),
        len(test_dataset) if test_dataset is not None else 0))
    self.log("total parameters:" + str(sum([p.nelement() for p in self.model.parameters()])))
    self.log(msgs=str(model).split('\n'), show=False)
def init_model(model_cls, log_dir_base, fold_no, device_ids=None, use_gpu=False, dp=False, ddp=False, tb_dir='runs', lr=1e-3, weight_decay=1e-2):
    """Instantiate a model, its AdamW optimizer and a TensorBoard writer.

    :param model_cls: model class; called as ``model_cls(writer)``.
    :param log_dir_base: subdirectory name under ``tb_dir`` for TensorBoard logs.
    :param fold_no: kept for signature compatibility; not used here.
    :param device_ids: optional list of GPU ids; first entry is the primary device.
    :param use_gpu: move the model to GPU when True.
    :param dp: wrap the model in ``DataParallel`` (requires ``use_gpu``).
    :param ddp: kept for signature compatibility; not used here.
    :param tb_dir: root directory for TensorBoard runs.
    :param lr: AdamW learning rate.
    :param weight_decay: AdamW weight decay.
    :return: ``(model, optimizer, writer, device_count)``.
    """
    writer = SummaryWriter(log_dir=osp.join(tb_dir, log_dir_base))
    model = model_cls(writer)
    writer.add_text('model_summary', model.__repr__())
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=lr,
                                  betas=(0.9, 0.999),
                                  eps=1e-08,
                                  weight_decay=weight_decay,
                                  amsgrad=False)
    if dp and use_gpu:
        model = model.cuda() if device_ids is None else model.to(device_ids[0])
        model = DataParallel(model, device_ids=device_ids)
    elif use_gpu:
        # BUG FIX: previously this branch indexed device_ids[0] unconditionally
        # and raised TypeError when device_ids was None (its default).
        model = model.cuda() if device_ids is None else model.to(device_ids[0])
    # device_count: all visible GPUs when dp, restricted to device_ids if given
    device_count = torch.cuda.device_count() if dp else 1
    device_count = len(device_ids) if (device_ids is not None and dp) else device_count
    return model, optimizer, writer, device_count
def train_RGNN(tr_dataset, te_dataset, n_epochs, batch_size, lr, z_dim, K, dropout, adj_type, learn_edge, lambda1, lambda2, domain_adaptation, lambda_dat, label_type, ckpt_save_name=None, ckpt_load=None):
    """Train a SymSimGCNNet (RGNN) on EEG graphs, optionally with domain adaptation.

    :param tr_dataset: training dataset (graph list usable by DataListLoader).
    :param te_dataset: evaluation dataset, also used as the "target" domain.
    :param label_type: one of the module-level ``label_types`` ('hard'/'soft'/regression).
    :param ckpt_save_name: file name for the final checkpoint (skipped when None).
    :param ckpt_load: path of a checkpoint to resume from (skipped when None).
    :return: ``(eval_acc_list, macro_f1_list)`` — per-epoch evaluation metrics.
    """
    # log hyper-parameter
    logger.critical('batch_size {}, lr {}, z_dim {}, K {}, dropout {}, adj_type {}, learn_edge {}, lambda1 {},'
                    'lambda2 {}, domain_adaptation {}, lambda_dat {}, label_type {}'
                    .format(batch_size, lr, z_dim, K, dropout, adj_type, learn_edge,
                            lambda1, lambda2, domain_adaptation, lambda_dat, label_type))
    # parameter sanity check
    if label_type not in label_types:
        raise Exception("undefined label_type")
    if adj_type not in adj_types:
        raise Exception("undefined adj_type")
    # construct model
    edge_weight = initial_adjacency_matrix(adj_type)
    model = SymSimGCNNet(n_channels, learn_edge, edge_weight, n_bands, [z_dim],
                         n_classes[label_type], K, dropout, domain_adaptation)
    last_epoch = 0
    if ckpt_load is not None:
        ckpt = torch.load(ckpt_load)
        # BUG FIX: epoch and state_dict were subscripted from the *path string*
        # (ckpt_load["epoch"]) instead of the loaded checkpoint dict.
        last_epoch = ckpt["epoch"]
        if last_epoch >= n_epochs:
            raise Exception("loaded model have trained >= n_epochs")
        state_dict = ckpt["state_dict"]
        model.load_state_dict(state_dict)
    # use multiple GPU
    model = DataParallel(model, device_ids=device_ids).to(device)
    logger.info(model)
    # prepare dataloader
    logger.info("tr_dataset: {}".format(tr_dataset))
    logger.info("te_dataset: {}".format(te_dataset))
    logger.info("training start from epoch {}".format(last_epoch))
    tr_loader = DataListLoader(tr_dataset, batch_size, True)
    # prepare optimizer: edge weights and biases are exempt from L2 decay
    param_list1 = []
    param_list2 = []
    for name, param in model.named_parameters():
        if name in ['module.edge_weight', 'module.conv1.lin.bias', 'module.fc.bias']:
            param_list1.append(param)
        else:
            param_list2.append(param)
    optimizer = torch.optim.Adam([
        {'params': param_list1, 'weight_decay': 0},
        {'params': param_list2, 'weight_decay': lambda2}
    ], lr=lr)
    # iterate over all epochs
    eval_acc_list = []
    macro_f1_list = []
    for ep in range(last_epoch + 1, n_epochs + 1):
        model.train()
        loss_all = 0
        # gradient-reversal strength ramps smoothly from 0 towards 1 over training
        reverse_scale = 2 / (1 + math.exp(-10 * ep / n_epochs)) - 1
        if domain_adaptation == 'RevGrad':
            model.module.alpha = reverse_scale
        # iterate over all graphs
        for tr_data_list in tr_loader:
            # output shape (len(tr_data_list), 5 or 1)
            output, domain_output = model(tr_data_list)
            # classification loss; y shape (len(tr_data_list), )
            y = torch.cat([data.y for data in tr_data_list]).to(output.device)
            if label_type == "hard":
                loss = F.cross_entropy(output, y)
            elif label_type == "soft":
                loss = - distribution_label(y) * F.log_softmax(output, dim=1)
                loss = torch.mean(torch.sum(loss, dim=1))
            else:
                # regression on labels re-centred around 0
                loss = F.mse_loss(output, y - 2)
            # l1 regularization loss
            if learn_edge:
                loss += lambda1 * torch.sum(torch.abs(model.module.edge_weight))
            # domain adaptation loss: source graphs labelled 0, target graphs 1
            if domain_adaptation:
                # tr_data.x: [num_graph * n_channels, feature_dim]
                n_nodes = domain_output.size(0)
                # BUG FIX: cross_entropy targets must be int64; torch.zeros/ones
                # default to float32 and raised at runtime.
                loss += lambda_dat * F.cross_entropy(
                    domain_output, torch.zeros(n_nodes, dtype=torch.long).cuda())
                # BUG FIX: torch.randint takes its size as a tuple, not an int
                te_indices = torch.randint(0, len(te_dataset), (len(tr_data_list),))
                te_data = te_dataset[te_indices]
                _, te_domain_output = model(te_data)
                # NOTE(review): target length reuses n_nodes from the source batch;
                # assumes te_domain_output has the same node count — TODO confirm.
                loss += lambda_dat * F.cross_entropy(
                    te_domain_output, torch.ones(n_nodes, dtype=torch.long).cuda())
            loss_all += loss.item() * len(tr_data_list)
            # optimize the model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # evaluate the model
        accuracy, macro_f1_score = evaluate_RGNN(model, te_dataset, label_type)
        eval_acc_list.append(accuracy)
        macro_f1_list.append(macro_f1_score)
        train_acc, _ = evaluate_RGNN(model, tr_dataset, label_type)
        logger.info('epoch: {:4d}; loss: {:9.5f}; train acc: {:9.5f}; eval acc: {:9.5f}; '
                    'macro f1: {:9.5f};'
                    .format(ep, loss_all / len(tr_dataset), train_acc, accuracy, macro_f1_score))
    # save model checkpoint
    logger.info(list(model.parameters()))
    logger.info(format_list(model.module.edge_weight.detach().cpu().numpy().flatten()))
    if ckpt_save_name is not None:
        checkpoint = {"epoch": n_epochs, "state_dict": model.state_dict()}
        torch.save(checkpoint, ckpt_dir + '/' + ckpt_save_name)
    return eval_acc_list, macro_f1_list
def _collect_reconstructions(model, loader):
    """Run *model* over *loader* and return (input_features, reconstructions).

    Handles both DataListLoader batches (lists of graphs, for DataParallel)
    and plain DataLoader batches.
    """
    input_fts = []
    reco_fts = []
    for t in loader:
        model.eval()
        if isinstance(t, list):
            # DataParallel moves list batches to devices itself
            for d in t:
                input_fts.append(d.x)
        else:
            input_fts.append(t.x)
            t.to(device)
        reco_out = model(t)
        if isinstance(reco_out, tuple):
            reco_out = reco_out[0]
        reco_fts.append(reco_out.cpu().detach())
    return torch.cat(input_fts), torch.cat(reco_fts)


def main(args):
    """Train a graph autoencoder end-to-end: data split, training with early
    stopping, loss curves, and input-vs-reconstruction plots.

    :param args: parsed CLI namespace (batch_size, mod_name, output_dir,
        input_dir, box_num, num_data, loss, emd_model_name, model, lat_dim,
        lr, patience).
    """
    batch_size = args.batch_size
    model_fname = args.mod_name
    if multi_gpu and batch_size < torch.cuda.device_count():
        exit('Batch size too small')

    # make a folder for the graphs of this model
    Path(args.output_dir).mkdir(exist_ok=True)
    save_dir = osp.join(args.output_dir, model_fname)
    Path(save_dir).mkdir(exist_ok=True)

    # get dataset and split
    gdata = GraphDataset(root=args.input_dir, bb=args.box_num)
    # merge data from separate files into one contiguous array
    bag = []
    for g in gdata:
        bag += g
    random.Random(0).shuffle(bag)  # fixed seed -> reproducible split
    bag = bag[:args.num_data]
    # temporary patch to use px, py, pz
    for d in bag:
        d.x = d.x[:, :3]

    # 80:10:10 split datasets
    fulllen = len(bag)
    train_len = int(0.8 * fulllen)
    tv_len = int(0.10 * fulllen)
    train_dataset = bag[:train_len]
    valid_dataset = bag[train_len:train_len + tv_len]
    test_dataset = bag[train_len + tv_len:]
    train_samples = len(train_dataset)
    valid_samples = len(valid_dataset)
    test_samples = len(test_dataset)

    if multi_gpu:
        train_loader = DataListLoader(train_dataset, batch_size=batch_size, pin_memory=True, shuffle=True)
        valid_loader = DataListLoader(valid_dataset, batch_size=batch_size, pin_memory=True, shuffle=False)
        test_loader = DataListLoader(test_dataset, batch_size=batch_size, pin_memory=True, shuffle=False)
    else:
        train_loader = DataLoader(train_dataset, batch_size=batch_size, pin_memory=True, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size, pin_memory=True, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, pin_memory=True, shuffle=False)

    # specify loss function
    loss_ftn_obj = LossFunction(args.loss, emd_modname=args.emd_model_name, device=device)

    # create model
    input_dim = 3
    big_dim = 32
    hidden_dim = args.lat_dim
    lr = args.lr
    patience = args.patience
    if args.model == 'MetaLayerGAE':
        model = models.GNNAutoEncoder()
    elif args.model[-3:] == 'EMD':
        model = getattr(models, args.model)(input_dim=input_dim, big_dim=big_dim, hidden_dim=hidden_dim,
                                            emd_modname=args.emd_model_name)
    else:
        model = getattr(models, args.model)(input_dim=input_dim, big_dim=big_dim, hidden_dim=hidden_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=4)
    valid_losses = []
    train_losses = []
    start_epoch = 0
    n_epochs = 200

    # load in model (best-effort resume; any failure falls back to a new model)
    modpath = osp.join(save_dir, model_fname + '.best.pth')
    try:
        model.load_state_dict(torch.load(modpath))
        train_losses, valid_losses, start_epoch = torch.load(osp.join(save_dir, 'losses.pt'))
        print('Loaded model')
        best_valid_loss = test(model, valid_loader, valid_samples, batch_size, loss_ftn_obj)
        print(f'Saved model valid loss: {best_valid_loss}')
    except Exception:
        # BUG FIX: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt; the fallback behavior is unchanged.
        print('Creating new model')
        best_valid_loss = 9999999
    if multi_gpu:
        model = DataParallel(model)
    model.to(torch.device(device))

    # Training loop with early stopping on validation loss
    stale_epochs = 0
    loss = best_valid_loss
    for epoch in range(start_epoch, n_epochs):
        if multi_gpu:
            loss = train_parallel(model, optimizer, train_loader, train_samples, batch_size, loss_ftn_obj)
            valid_loss = test_parallel(model, valid_loader, valid_samples, batch_size, loss_ftn_obj)
        else:
            loss = train(model, optimizer, train_loader, train_samples, batch_size, loss_ftn_obj)
            valid_loss = test(model, valid_loader, valid_samples, batch_size, loss_ftn_obj)
        scheduler.step(valid_loss)
        train_losses.append(loss)
        valid_losses.append(valid_loss)
        print('Epoch: {:02d}, Training Loss: {:.4f}'.format(epoch, loss))
        print(' Validation Loss: {:.4f}'.format(valid_loss))

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            print('New best model saved to:', modpath)
            if multi_gpu:
                torch.save(model.module.state_dict(), modpath)
            else:
                torch.save(model.state_dict(), modpath)
            torch.save((train_losses, valid_losses, epoch + 1), osp.join(save_dir, 'losses.pt'))
            stale_epochs = 0
        else:
            stale_epochs += 1
            print(f'Stale epoch: {stale_epochs}\nBest: {best_valid_loss}\nCurr: {valid_loss}')
        if stale_epochs >= patience:
            print('Early stopping after %i stale epochs' % patience)
            break

    # model training done
    train_epochs = list(range(epoch + 1))
    early_stop_epoch = epoch - stale_epochs
    loss_curves(train_epochs, early_stop_epoch, train_losses, valid_losses, save_dir)

    # compare input and reconstructions
    # NOTE(review): when multi_gpu, model is DataParallel-wrapped here but the
    # checkpoint was saved from model.module — key prefixes may mismatch; confirm.
    model.load_state_dict(torch.load(modpath))
    input_fts, reco_fts = _collect_reconstructions(model, valid_loader)
    plot_reco_difference(input_fts, reco_fts, model_fname,
                         osp.join(save_dir, 'reconstruction_post_train', 'valid'))
    input_fts, reco_fts = _collect_reconstructions(model, test_loader)
    plot_reco_difference(input_fts, reco_fts, model_fname,
                         osp.join(save_dir, 'reconstruction_post_train', 'test'))
    print('Completed')
def GCN(dataset, params, Epochs, MonteSize, width, lr, savepath):
    """Run Monte-Carlo repetitions of training a graph/CNN model on *dataset*.

    :param dataset: one of 'Cora', 'ENZYMES', 'MUTAG', 'MNIST', 'CIFAR10'.
    :param params: ``params[0]`` is the batch size, ``params[1]`` a model
        hyper-parameter (used in checkpoint names and the CIFAR10 model).
    :param Epochs: number of training epochs per Monte-Carlo run.
    :param MonteSize: number of Monte-Carlo repetitions.
    :param width: model width passed to the network constructors.
    :param lr: SGD learning rate.
    :param savepath: prefix for the saved convergence arrays.
    """
    Batch_size = int(params[0])
    for Monte_iter in range(MonteSize):
        # Data
        best_loss = float('inf')  # best test loss
        start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        TrainConvergence = []
        TestConvergence = []

        # model selection per dataset
        root = '/data/GraphData/' + dataset
        if dataset == 'Cora':
            model_name = "GCN3"
            datasetroot = Planetoid(root=root, name=dataset).shuffle()
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)
            model_to_save = './checkpoint/{}-{}-param_{}_{}-Mon_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], Monte_iter)
            if resume and os.path.exists(model_to_save):
                [net, TrainConvergence, TestConvergence, start_epoch] = ResumeModel(model_to_save)
                if start_epoch >= Epochs - 1:
                    continue
            else:
                net = Net(datasetroot, width)
        elif dataset == 'ENZYMES' or dataset == 'MUTAG':
            model_name = "topk_pool_Net"
            # BUG FIX: path was missing the separator ('/data/GraphDataENZYMES')
            root = '/data/GraphData/' + dataset
            datasetroot = TUDataset(root, name=dataset)
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)
            model_to_save = './checkpoint/{}-{}-param_{}_{}-Mon_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], Monte_iter)
            if resume and os.path.exists(model_to_save):
                [net, TrainConvergence, TestConvergence, start_epoch] = ResumeModel(model_to_save)
                if start_epoch >= Epochs - 1:
                    continue
            else:
                net = topk_pool_Net(datasetroot, width)
        elif dataset == 'MNIST':
            datasetroot = MNISTSuperpixels(root='/data/GraphData/' + dataset,
                                           transform=T.Cartesian()).shuffle()
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)
            model_name = 'SPlineNet'
            model_to_save = './checkpoint/{}-{}-param_{}_{}-Mon_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], Monte_iter)
            if resume and os.path.exists(model_to_save):
                [net, TrainConvergence, TestConvergence, start_epoch] = ResumeModel(model_to_save)
                if start_epoch >= Epochs - 1:
                    continue
            else:
                # net=Net(datasetroot,width)
                net = SPlineNet(datasetroot, width)
        elif dataset == 'CIFAR10':
            # BUG FIX: model_name / model_to_save were never defined on this
            # branch, so the resume check and checkpoint save raised NameError.
            # NOTE(review): this branch also defines no trainloader/testloader —
            # presumably supplied elsewhere; verify before relying on it.
            model_name = 'Resnet20_CIFAR10'
            model_to_save = './checkpoint/{}-{}-param_{}_{}-Mon_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], Monte_iter)
            if resume and os.path.exists(model_to_save):
                [net, TrainConvergence, TestConvergence, start_epoch] = ResumeModel(model_to_save)
                if start_epoch >= Epochs - 1:
                    continue
            else:
                net = getattr(CIFAR10_resnet, 'Resnet20_CIFAR10')(params[1])
        else:
            raise Exception("The dataset is:{}, it isn't existed.".format(dataset))

        print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
        torch.cuda.is_available()
        net = DataParallel(net)
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        net = net.to(device)
        # cudnn.benchmark = True
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)

        for epoch in range(start_epoch, start_epoch + Epochs):
            if epoch < Epochs:
                logging('Batch size: {},ConCoeff: {},MonteSize:{},epoch:{}'.format(
                    params[0], params[1], Monte_iter, epoch))
                TrainLoss = train(trainloader, net, optimizer, criterion)
                TrainConvergence.append(statistics.mean(TrainLoss))
                TestConvergence.append(statistics.mean(test(testloader, net, criterion)))
            else:
                break

            # checkpoint whenever the test loss improves
            if TestConvergence[epoch] < best_loss:
                logging('Saving..')
                state = {
                    'net': net.module,
                    'TrainConvergence': TrainConvergence,
                    'TestConvergence': TestConvergence,
                    'epoch': epoch,
                }
                if not os.path.isdir('checkpoint'):
                    os.mkdir('checkpoint')
                torch.save(state, model_to_save)
                best_loss = TestConvergence[epoch]
                if not os.path.exists('./%s' % model_name):
                    os.makedirs('./%s' % model_name)
                torch.save(net.module.state_dict(),
                           './%s/%s_%s_%s_%s_%s_pretrain.pth' % (model_name, dataset, model_name,
                                                                 params[0], params[1], Epochs))
            else:
                pass

            ## save recurrence plots
            if epoch % 20 == 0:
                save_recurrencePlots_file = "../Results/RecurrencePlots/RecurrencePlots_{}_{}_BatchSize{}_ConCoeffi{}_epoch{}.png".format(
                    dataset, model_name, params[0], params[1], epoch)
                save_recurrencePlots(net, save_recurrencePlots_file)

        FileName = "{}-{}-param_{}_{}-monte_{}".format(dataset, model_name,
                                                       params[0], params[1], Monte_iter)
        np.save(savepath + 'TrainConvergence-' + FileName, TrainConvergence)
        np.save(savepath + 'TestConvergence-' + FileName, TestConvergence)
        torch.cuda.empty_cache()
        print_nvidia_useage()

    # NOTE(review): return_output is presumably a module-level flag — confirm.
    if return_output == True:
        return TestConvergence[-1], net.module.fc.weight
    else:
        pass
def apply_dataparallel(model, cfgs):
    """Wrap *model* for multi-GPU graph batching.

    ``cfgs`` is accepted for signature compatibility but is not consulted.
    """
    wrapped = DataParallel(model)
    return wrapped
self.lin1 = torch.nn.Linear(64, 128) self.lin2 = torch.nn.Linear(128, dataset.num_classes) def forward(self, data): print('Inside Model: num graphs: {}, device: {}'.format( data.num_graphs, data.batch.device)) x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr x = F.elu(self.conv1(x, edge_index, edge_attr)) x = F.elu(self.conv2(x, edge_index, edge_attr)) x = global_mean_pool(x, data.batch) x = F.elu(self.lin1(x)) return F.log_softmax(self.lin2(x), dim=1) model = Net() print('Let\'s use', torch.cuda.device_count(), 'GPUs!') model = DataParallel(model) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') model = model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) for data_list in loader: optimizer.zero_grad() output = model(data_list) print('Outside Model: num graphs: {}'.format(output.size(0))) y = torch.cat([data.y for data in data_list]).to(output.device) loss = F.nll_loss(output, y) loss.backward() optimizer.step()
id_euc = sphg(pos, 0.6, batch=batch, max_num_neighbors=16) x9 = self.conv3(x8, pos, B, N, id_euc) x10 = self.lin3(x9.view(B, N, -1)) x = x10.max(1)[0] # [B, C] return self.fc(x) # Train and test model = Net(train_dataset.num_classes) model = model.to(device) model.load_state_dict(torch.load('weight.pth', map_location=f'cuda:{device_list[0]}'), strict=True) if cuda: model = DataParallel(model, device_ids=device_list) optimizer = torch.optim.Adam([{ 'params': model.parameters(), 'initial_lr': base_lr }], lr=base_lr, weight_decay=1e-4) # optimizer.load_state_dict(torch.load('geocnn_optimizer.pt', map_location=f'cuda:{device_list[0]}').state_dict()) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epoch, eta_min=0.00001, last_epoch=-1) criterion = cal_loss def train(epoch):
# Build the optimizer, log the parameter count, optionally wrap the model for
# multi-GPU, and resume model/optimizer state from a checkpoint if requested.
parameters = list(model.parameters())
optimizer = torch.optim.Adam(params=parameters, lr=args.lr)
# FIX: previous double generator (`for param in parameters for p in param`)
# summed numel over first-dim slices — convoluted and it fails on 0-dim
# parameters. Counting numel per parameter tensor is equivalent and robust.
total_params = sum(p.numel() for p in parameters)
print(f'Total number of parameters is {total_params}')
if args.model == 'DGCNN':
    print(f'SortPooling k is set to {model.k}')
log_file = os.path.join(args.res_dir, 'log.txt')
with open(log_file, 'a') as f:
    print(f'Total number of parameters is {total_params}', file=f)
    if args.model == 'DGCNN':
        print(f'SortPooling k is set to {model.k}', file=f)
start_epoch = 1
if args.multi_gpu:
    model = DataParallel(model)
model = model.to(device)
if args.continue_from is not None:
    # resume both model and optimizer so training continues seamlessly
    model.load_state_dict(
        torch.load(
            os.path.join(args.res_dir,
                         'model_checkpoint{}.pth'.format(args.continue_from))))
    optimizer.load_state_dict(
        torch.load(
            os.path.join(args.res_dir,
                         'optimizer_checkpoint{}.pth'.format(args.continue_from))))
    start_epoch = args.continue_from + 1
    args.epochs -= args.continue_from
def __init__(self, option, model, train_dataset=None, valid_dataset=None, test_dataset=None, weight=None, tasks_num=1):
    """Set up trainer state; supports a test-only mode when no train/valid sets.

    :param option: dict of hyper-parameters; keys used here: 'cpu', 'parallel',
        'batch_size', 'exp_path', 'focalloss', 'lr', 'weight_decay',
        'lr_scheduler_patience'.
    :param model: network to train or evaluate.
    :param train_dataset: optional training graphs; when absent (together with
        valid_dataset) only a test dataloader is built.
    :param valid_dataset: optional validation graphs.
    :param test_dataset: optional test graphs.
    :param weight: per-task class weights; defaults to ``[[1.0, 1.0]]``
        (declared as ``None`` to avoid a shared mutable default).
    :param tasks_num: number of prediction tasks.
    """
    self.option = option
    # self.tasks = ["MUV-466","MUV-548","MUV-600","MUV-644","MUV-652","MUV-689","MUV-692","MUV-712","MUV-713",
    #               "MUV-733","MUV-737","MUV-810","MUV-832","MUV-846","MUV-852","MUV-858","MUV-859"]
    if weight is None:
        # FIX: was a mutable default argument (weight=[[1.0, 1.0]])
        weight = [[1.0, 1.0]]
    self.tasks_num = tasks_num
    self.save_path = self.option['exp_path']
    self.device = torch.device("cuda:{}".format(0)
                               if torch.cuda.is_available() and not option['cpu'] else "cpu")
    self.model = DataParallel(model).to(self.device) \
        if option['parallel'] else model.to(self.device)

    # Setting the train valid and test data loader
    if train_dataset and valid_dataset:
        if self.option['parallel']:
            self.train_dataloader = DataListLoader(train_dataset,
                                                   batch_size=self.option['batch_size'], shuffle=True)
            self.valid_dataloader = DataListLoader(valid_dataset,
                                                   batch_size=self.option['batch_size'])
            if test_dataset:
                self.test_dataloader = DataListLoader(test_dataset,
                                                      batch_size=self.option['batch_size'])
        else:
            self.train_dataloader = DataLoader(train_dataset,
                                               batch_size=self.option['batch_size'], shuffle=True, num_workers=4)
            self.valid_dataloader = DataLoader(valid_dataset,
                                               batch_size=self.option['batch_size'], num_workers=4)
            if test_dataset:
                self.test_dataloader = DataLoader(test_dataset,
                                                  batch_size=self.option['batch_size'], num_workers=4)
    else:
        # test-only mode: no training/validation loaders are built
        self.test_dataset = test_dataset
        if self.option['parallel']:
            self.test_dataloader = DataListLoader(test_dataset,
                                                  batch_size=self.option['batch_size'], num_workers=0)
        else:
            self.test_dataloader = DataLoader(test_dataset,
                                              batch_size=self.option['batch_size'], num_workers=4)

    # Loss: one criterion per task
    if not option['focalloss']:
        self.criterion = [
            torch.nn.CrossEntropyLoss(torch.Tensor(w).to(self.device), reduction='mean')
            for w in weight
        ]
    else:
        self.log('Using FocalLoss')
        self.criterion = [FocalLoss(alpha=1 / w[0]) for w in weight]  # alpha 0.965

    # Setting the Adam optimizer with hyper-param
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.option['lr'],
                                      weight_decay=option['weight_decay'])
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer, mode='min', factor=0.7,
        patience=self.option['lr_scheduler_patience'], min_lr=1e-6)

    self.start = time.time()
    self.records = {
        'best_epoch': None,
        'val_auc': [],
        'best_val_auc': 0.,
        'best_trn_auc': 0.,
        'best_test_auc': 0.
    }
    self.log(msgs=['\t{}:{}\n'.format(k, v) for k, v in self.option.items()], show=False)
    if train_dataset:
        # BUG FIX: test_dataset may be None here; len(None) raised TypeError
        self.log('train set num:{} valid set num:{} test set num: {}'.format(
            len(train_dataset), len(valid_dataset),
            len(test_dataset) if test_dataset is not None else 0))
    self.log("total parameters:" + str(sum([p.nelement() for p in self.model.parameters()])))
    self.log(msgs=str(model).split('\n'), show=False)
def train_cross_validation(model_cls, dataset, num_clusters, dropout=0.0, lr=1e-4, weight_decay=1e-2, num_epochs=200, n_splits=10, use_gpu=True, dp=False, ddp=True, comment='', tb_service_loc='192.168.192.57:6006', batch_size=1, num_workers=0, pin_memory=False, cuda_device=None, fold_no=None, saved_model_path=None, device_ids=None, patience=50, seed=None, save_model=True, c_reg=0, base_log_dir='runs', base_model_save_dir='saved_models'):
    """Train one model per stratified CV fold with TensorBoard logging and early stopping.

    :param c_reg:
    :param save_model: bool
    :param seed:
    :param patience: for early stopping
    :param device_ids: for ddp
    :param saved_model_path:
    :param fold_no:
    :param ddp: DDP
    :param cuda_device:
    :param pin_memory: DataLoader args
        https://devblogs.nvidia.com/how-optimize-data-transfers-cuda-cc/
    :param num_workers: DataLoader args
    :param model_cls: pytorch Module cls
    :param dataset: pytorch Dataset cls
    :param dropout:
    :param lr:
    :param weight_decay:
    :param num_epochs:
    :param n_splits: number of kFolds
    :param use_gpu: bool
    :param dp: bool
    :param comment: comment in the logs, to filter runs in tensorboard
    :param tb_service_loc: tensorboard service location
    :param batch_size: Dataset args not DataLoader
    :return:
    """
    # snapshot all arguments (plus the resolved seed) for logging to TensorBoard
    saved_args = locals()
    seed = int(time.time() % 1e4 * 1e5) if seed is None else seed
    saved_args['random_seed'] = seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    if use_gpu:
        torch.cuda.manual_seed_all(seed)
    if ddp and not torch.distributed.is_initialized():  # initialize ddp
        # single-process "world" on a dynamically chosen local port
        dist.init_process_group('nccl',
                                init_method='tcp://localhost:{}'.format(find_open_port()),
                                world_size=1,
                                rank=0)
    model_name = model_cls.__name__
    # resolve the primary device: explicit cuda_device wins, then device_ids[0]
    if not cuda_device:
        if device_ids and (ddp or dp):
            device = device_ids[0]
        else:
            device = torch.device('cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
    else:
        device = cuda_device
    device_count = torch.cuda.device_count() if dp else 1
    device_count = len(device_ids) if (device_ids is not None and (dp or ddp)) else device_count
    if device_count > 1:
        print("Let's use", device_count, "GPUs!")
    # batch_size = batch_size * device_count
    log_dir_base = get_model_log_dir(comment, model_name)
    if tb_service_loc is not None:
        print("TensorBoard available at http://{1}/#scalars®exInput={0}".format(
            log_dir_base, tb_service_loc))
    else:
        print("Please set up TensorBoard")
    criterion = nn.CrossEntropyLoss()

    # get test set: first stratified split is held out entirely from CV
    folds = StratifiedKFold(n_splits=n_splits, shuffle=False)
    train_val_idx, test_idx = list(
        folds.split(np.zeros(len(dataset)), dataset.data.y.numpy()))[0]
    test_dataset = dataset.__indexing__(test_idx)
    train_val_dataset = dataset.__indexing__(train_val_idx)

    print("Training {0} {1} models for cross validation...".format(n_splits, model_name))
    # folds, fold = KFold(n_splits=n_splits, shuffle=False, random_state=seed), 0
    folds = StratifiedKFold(n_splits=n_splits, shuffle=False)
    iter = folds.split(np.zeros(len(train_val_dataset)),
                       train_val_dataset.data.y.numpy())
    fold = 0
    for train_idx, val_idx in tqdm_notebook(iter, desc='CV', leave=False):
        fold += 1
        # optionally restrict the run to a single fold
        if fold_no is not None:
            if fold != fold_no:
                continue
        writer = SummaryWriter(log_dir=osp.join(base_log_dir, log_dir_base + str(fold)))
        model_save_dir = osp.join(base_model_save_dir, log_dir_base + str(fold))

        print("creating dataloader tor fold {}".format(fold))
        model = model_cls(writer,
                          num_clusters=num_clusters,
                          in_dim=dataset.data.x.shape[1],
                          out_dim=int(dataset.data.y.max() + 1),
                          dropout=dropout)

        # My Batch: regroup graphs so each DataLoader item is a pre-batched chunk
        train_dataset = train_val_dataset.__indexing__(train_idx)
        val_dataset = train_val_dataset.__indexing__(val_idx)
        train_dataset = dataset_gather(
            train_dataset, seed=0, n_repeat=1,
            n_splits=int(len(train_dataset) / batch_size) + 1)
        val_dataset = dataset_gather(
            val_dataset, seed=0, n_repeat=1,
            n_splits=int(len(val_dataset) / batch_size) + 1)
        # identity collate: the model consumes raw lists of Data objects
        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=device_count,
                                      collate_fn=lambda data_list: data_list,
                                      num_workers=num_workers,
                                      pin_memory=pin_memory)
        val_dataloader = DataLoader(val_dataset,
                                    shuffle=False,
                                    batch_size=device_count,
                                    collate_fn=lambda data_list: data_list,
                                    num_workers=num_workers,
                                    pin_memory=pin_memory)

        # if fold == 1 or fold_no is not None:
        print(model)
        writer.add_text('model_summary', model.__repr__())
        writer.add_text('training_args', str(saved_args))

        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=lr,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=weight_decay,
                                     amsgrad=False)
        # optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
        if ddp:
            model = model.cuda() if device_ids is None else model.to(device_ids[0])
            model = nn.parallel.DistributedDataParallel(model, device_ids=device_ids)
        elif dp and use_gpu:
            model = model.cuda() if device_ids is None else model.to(device_ids[0])
            model = DataParallel(model, device_ids=device_ids)
        elif use_gpu:
            model = model.to(device)

        if saved_model_path is not None:
            model.load_state_dict(torch.load(saved_model_path))

        # best_map tracks best validation accuracy; best_score tracks -nll for patience
        best_map, patience_counter, best_score = 0.0, 0, -np.inf
        for epoch in tqdm_notebook(range(1, num_epochs + 1), desc='Epoch', leave=False):
            for phase in ['train', 'validation']:
                if phase == 'train':
                    model.train()
                    dataloader = train_dataloader
                else:
                    model.eval()
                    dataloader = val_dataloader

                # Logging accumulators for this epoch/phase
                running_total_loss = 0.0
                running_corrects = 0
                running_reg_loss = 0.0
                running_nll_loss = 0.0
                epoch_yhat_0, epoch_yhat_1 = torch.tensor([]), torch.tensor([])
                epoch_label, epoch_predicted = torch.tensor([]), torch.tensor([])

                for data_list in tqdm_notebook(dataloader, desc=phase, leave=False):
                    # TODO: check devices
                    if dp:
                        data_list = to_cuda(data_list, (device_ids[0]
                                                        if device_ids is not None else 'cuda'))
                    # model returns predictions plus a regularization term
                    y_hat, reg = model(data_list)
                    # y_hat = y_hat.reshape(batch_size, -1)
                    # gather labels from every graph in the (possibly nested) batch
                    y = torch.tensor([], dtype=dataset.data.y.dtype, device=device)
                    for data in data_list:
                        y = torch.cat([y, data.y.view(-1).to(device)])
                    loss = criterion(y_hat, y)
                    reg_loss = -reg
                    total_loss = (loss + reg_loss * c_reg).sum()
                    if phase == 'train':
                        # print(torch.autograd.grad(y_hat.sum(), model.saved_x, retain_graph=True))
                        optimizer.zero_grad()
                        total_loss.backward(retain_graph=True)
                        nn.utils.clip_grad_norm_(model.parameters(), 2.0)
                        optimizer.step()
                    _, predicted = torch.max(y_hat, 1)
                    label = y
                    running_nll_loss += loss.item()
                    running_total_loss += total_loss.item()
                    running_reg_loss += reg.sum().item()
                    running_corrects += (predicted == label).sum().item()
                    # keep per-class logits and labels for epoch-level metrics
                    epoch_yhat_0 = torch.cat([epoch_yhat_0, y_hat[:, 0].detach().view(-1).cpu()])
                    epoch_yhat_1 = torch.cat([epoch_yhat_1, y_hat[:, 1].detach().view(-1).cpu()])
                    epoch_label = torch.cat([epoch_label, label.detach().cpu().float()])
                    epoch_predicted = torch.cat([epoch_predicted, predicted.detach().cpu().float()])

                # precision = sklearn.metrics.precision_score(epoch_label, epoch_predicted, average='micro')
                # recall = sklearn.metrics.recall_score(epoch_label, epoch_predicted, average='micro')
                # f1_score = sklearn.metrics.f1_score(epoch_label, epoch_predicted, average='micro')
                accuracy = sklearn.metrics.accuracy_score(epoch_label, epoch_predicted)
                epoch_total_loss = running_total_loss / dataloader.__len__()
                epoch_nll_loss = running_nll_loss / dataloader.__len__()
                epoch_reg_loss = running_reg_loss / dataloader.dataset.__len__()
                writer.add_scalars('nll_loss',
                                   {'{}_nll_loss'.format(phase): epoch_nll_loss}, epoch)
                writer.add_scalars('accuracy',
                                   {'{}_accuracy'.format(phase): accuracy}, epoch)
                # writer.add_scalars('{}_APRF'.format(phase),
                #                    {
                #                        'accuracy': accuracy,
                #                        'precision': precision,
                #                        'recall': recall,
                #                        'f1_score': f1_score
                #                    },
                #                    epoch)
                if epoch_reg_loss != 0:
                    writer.add_scalars('reg_loss'.format(phase),
                                       {'{}_reg_loss'.format(phase): epoch_reg_loss}, epoch)
                # writer.add_histogram('hist/{}_yhat_0'.format(phase),
                #                      epoch_yhat_0,
                #                      epoch)
                # writer.add_histogram('hist/{}_yhat_1'.format(phase),
                #                      epoch_yhat_1,
                #                      epoch)

                # Save Model & Early Stopping (validation phase only)
                if phase == 'validation':
                    model_save_path = model_save_dir + '-{}-{}-{:.3f}-{:.3f}'.format(
                        model_name, epoch, accuracy, epoch_nll_loss)
                    if accuracy > best_map:
                        best_map = accuracy
                        model_save_path = model_save_path + '-best'
                    # patience is driven by validation nll (higher score = lower nll)
                    score = -epoch_nll_loss
                    if score > best_score:
                        patience_counter = 0
                        best_score = score
                    else:
                        patience_counter += 1
                    # skip 10 epoch
                    # best_score = best_score if epoch > 10 else -np.inf
                    if save_model:
                        # tag the checkpoint file name by accuracy bracket
                        for th, pfix in zip([0.8, 0.75, 0.7, 0.5, 0.0],
                                            ['-perfect', '-great', '-good', '-bad', '-miss']):
                            if accuracy >= th:
                                model_save_path += pfix
                                break
                        if epoch > 10:
                            torch.save(model.state_dict(), model_save_path)
                    writer.add_scalars('best_val_accuracy',
                                       {'{}_accuracy'.format(phase): best_map}, epoch)
                    writer.add_scalars('best_nll_loss',
                                       {'{}_nll_loss'.format(phase): -best_score}, epoch)
                    if patience_counter >= patience:
                        print("Stopped at epoch {}".format(epoch))
                        return
    print("Done !")
def train_cummunity_detection(model_cls, dataset, dropout=0.0, lr=1e-3,
                              weight_decay=1e-2, num_epochs=200, n_splits=10,
                              use_gpu=True, dp=False, ddp=False, comment='',
                              tb_service_loc='192.168.192.57:6006', batch_size=1,
                              num_workers=0, pin_memory=False, cuda_device=None,
                              ddp_port='23456', fold_no=None, device_ids=None,
                              patience=20, seed=None, save_model=False,
                              supervised=False):
    """Cross-validated training loop for community detection models.

    :param save_model: bool (currently unused in this loop — TODO confirm)
    :param seed: random seed; derived from wall clock when None
    :param patience: for early stopping (currently unused here — TODO confirm)
    :param device_ids: for ddp/dp
    :param fold_no: when set, train only this fold (1-based)
    :param ddp_port: port for the single-node DDP process group
    :param ddp: use DistributedDataParallel
    :param cuda_device: explicit device override
    :param pin_memory: DataLoader arg
        https://devblogs.nvidia.com/how-optimize-data-transfers-cuda-cc/
    :param num_workers: DataLoader arg
    :param model_cls: pytorch Module cls
    :param dataset: pytorch Dataset instance
    :param dropout: passed to model_cls
    :param lr: Adam learning rate
    :param weight_decay: Adam weight decay
    :param num_epochs: epochs per fold
    :param n_splits: number of kFolds
    :param use_gpu: bool
    :param dp: bool, DataParallel
    :param comment: comment in the logs, to filter runs in tensorboard
    :param tb_service_loc: tensorboard service location
    :param batch_size: Dataset gather size, not DataLoader batch size
    :return: None
    """
    saved_args = locals()
    # Seed everything for reproducibility; record the effective seed.
    seed = int(time.time() % 1e4 * 1e5) if seed is None else seed
    saved_args['random_seed'] = seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    if use_gpu:
        torch.cuda.manual_seed_all(seed)
    if ddp and not torch.distributed.is_initialized():  # initialize ddp
        dist.init_process_group('nccl',
                                init_method='tcp://localhost:{}'.format(ddp_port),
                                world_size=1,
                                rank=0)
    model_name = model_cls.__name__

    # Device selection: explicit override > first ddp/dp device id > auto.
    if not cuda_device:
        if device_ids and (ddp or dp):
            device = device_ids[0]
        else:
            device = torch.device(
                'cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
    else:
        device = cuda_device

    device_count = torch.cuda.device_count() if dp else 1
    device_count = len(device_ids) if (device_ids is not None
                                       and (dp or ddp)) else device_count
    if device_count > 1:
        print("Let's use", device_count, "GPUs!")
        # batch_size = batch_size * device_count

    log_dir_base = get_model_log_dir(comment, model_name)
    if tb_service_loc is not None:
        print("TensorBoard available at http://{1}/#scalars®exInput={0}".format(
            log_dir_base, tb_service_loc))
    else:
        print("Please set up TensorBoard")

    print("Training {0} {1} models for cross validation...".format(
        n_splits, model_name))
    # FIX: random_state must not be passed when shuffle=False — modern
    # scikit-learn raises ValueError for that combination, and it had no
    # effect anyway since the folds are not shuffled.
    folds, fold = KFold(n_splits=n_splits, shuffle=False), 0
    print(dataset.__len__())
    for train_idx, test_idx in tqdm_notebook(
            folds.split(list(range(dataset.__len__())),
                        list(range(dataset.__len__()))),
            desc='models', leave=False):
        fold += 1
        if fold_no is not None:
            if fold != fold_no:
                continue
        writer = SummaryWriter(log_dir=osp.join('runs', log_dir_base + str(fold)))
        model_save_dir = osp.join('saved_models', log_dir_base + str(fold))

        print("creating dataloader tor fold {}".format(fold))
        model = model_cls(writer, dropout=dropout)

        # My Batch: re-gather the training graphs so each collated item
        # holds roughly `batch_size` graphs; the DataLoader batch then
        # spans `device_count` such items (one per device).
        train_dataset = dataset.__indexing__(train_idx)
        test_dataset = dataset.__indexing__(test_idx)
        train_dataset = dataset_gather(
            train_dataset, n_repeat=1,
            n_splits=int(len(train_dataset) / batch_size))

        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=device_count,
                                      collate_fn=lambda data_list: data_list,
                                      num_workers=num_workers,
                                      pin_memory=pin_memory)
        test_dataloader = DataLoader(test_dataset,
                                     shuffle=False,
                                     batch_size=device_count,
                                     collate_fn=lambda data_list: data_list,
                                     num_workers=num_workers,
                                     pin_memory=pin_memory)

        print(model)
        writer.add_text('model_summary', model.__repr__())
        writer.add_text('training_args', str(saved_args))

        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=lr, betas=(0.9, 0.999), eps=1e-08,
                                     weight_decay=weight_decay, amsgrad=False)
        # optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)

        if ddp:
            model = model.cuda() if device_ids is None else model.to(device_ids[0])
            model = nn.parallel.DistributedDataParallel(model,
                                                        device_ids=device_ids)
        elif dp and use_gpu:
            model = model.cuda() if device_ids is None else model.to(device_ids[0])
            model = DataParallel(model, device_ids=device_ids)
        elif use_gpu:
            model = model.to(device)

        for epoch in tqdm_notebook(range(1, num_epochs + 1),
                                   desc='Epoch', leave=False):
            for phase in ['train', 'validation']:
                if phase == 'train':
                    model.train()
                    dataloader = train_dataloader
                else:
                    model.eval()
                    dataloader = test_dataloader

                # Logging accumulators for this epoch/phase.
                running_total_loss = 0.0
                running_reg_loss = 0.0
                running_overlap = 0.0

                for data_list in tqdm_notebook(dataloader, desc=phase,
                                               leave=False):
                    # TODO: check devices
                    if dp:
                        data_list = to_cuda(data_list,
                                            (device_ids[0] if device_ids
                                             is not None else 'cuda'))
                    y_hat, reg = model(data_list)
                    y = torch.tensor([], dtype=dataset.data.y.dtype,
                                     device=device)
                    for data in data_list:
                        y = torch.cat([y, data.y.view(-1).to(device)])

                    if supervised:
                        # Labels are only defined up to a permutation of
                        # community ids, hence the specialized loss.
                        loss = permutation_invariant_loss(y_hat, y)
                        # criterion = nn.NLLLoss()
                        # loss = criterion(y_hat, y)
                    else:
                        # Unsupervised: maximize the regularizer.
                        loss = -reg
                    total_loss = loss

                    if phase == 'train':
                        # print(torch.autograd.grad(y_hat.sum(), model.saved_x, retain_graph=True))
                        optimizer.zero_grad()
                        total_loss.backward(retain_graph=True)
                        nn.utils.clip_grad_norm_(model.parameters(), 2.0)
                        optimizer.step()

                    _, predicted = torch.max(y_hat, 1)
                    label = y
                    if supervised:
                        overlap_score = normalized_overlap(
                            label.int().cpu().numpy(),
                            predicted.int().cpu().numpy(), 0.25)
                        # overlap_score = overlap(label.int().cpu().numpy(), predicted.int().cpu().numpy())
                        running_overlap += overlap_score
                        # FIX: this print references overlap_score, which
                        # only exists in the supervised branch — it must
                        # stay inside it or unsupervised runs NameError.
                        print(reg, overlap_score, loss)

                    running_total_loss += total_loss.item()
                    running_reg_loss += reg.sum().item()

                epoch_total_loss = running_total_loss / dataloader.__len__()
                epoch_reg_loss = running_reg_loss / dataloader.dataset.__len__()
                if supervised:
                    epoch_overlap = running_overlap / dataloader.__len__()
                    # NOTE: tag was "'overlap'.format(phase)" — a no-op
                    # .format on a placeholder-free string; simplified.
                    writer.add_scalars(
                        'overlap',
                        {'{}_overlap'.format(phase): epoch_overlap}, epoch)
                writer.add_scalars(
                    'reg_loss',
                    {'{}_reg_loss'.format(phase): epoch_reg_loss}, epoch)
    print("Done !")
def TrainingNet(dataset, modelName, params, num_pre_epochs, num_epochs, NumCutoff,
                optimizerName, LinkPredictionMethod, MonteSize, savepath):
    """Monte-Carlo train/resize/retrain loop for graph networks.

    For each Monte-Carlo iteration: build the dataset/loader, pre-train the
    chosen model, shrink its layer widths via RetainNetworkSize, then train
    the resized model and persist convergence curves and test accuracies.

    :param dataset: dataset name key ('Cora', 'PPI', 'MNIST', ...)
    :param modelName: model key understood by ChooseModel
    :param params: packed hyper-parameters —
        [0] batch size, [1:3] width-contraction args, [3] lr,
        [4] VectorPairs, [5] StartTopoCoeffi, [6] WeightCorrectionCoeffi,
        [7] interval
    :param MonteSize: number of Monte-Carlo repetitions
    :param savepath: root directory for saved convergence arrays
    """
    Batch_size = params[0]
    VectorPairs = params[4]
    StartTopoCoeffi = params[5]
    WeightCorrectionCoeffi = params[6]
    interval = params[7]
    root = '/git/data/GraphData/' + dataset
    TestAccs = []

    for Monte_iter in range(MonteSize):
        # Data
        NewNetworkSizeAdjust = []
        WeightsDynamicsEvolution = []
        trainValRatio = [0.2, 0.4]

        # model / dataset selection per dataset name
        if dataset == 'Cora' or dataset == 'Citeseer' or dataset == 'Pubmed':
            datasetroot = Planetoid(root=root, name=dataset,
                                    transform=T.NormalizeFeatures()).shuffle()
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size,
                                         shuffle=True)
            # train_mask, val_mask, test_mask = DataSampler(trainValRatio, datasetroot.data.num_nodes)
            # DataMask = {}
            # DataMask['train_mask'] = train_mask
            # DataMask['val_mask'] = val_mask
            # DataMask['test_mask'] = test_mask
            # trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            num_features = datasetroot.num_features
            num_classes = datasetroot.num_classes
            criterion = nn.CrossEntropyLoss()
        elif dataset == "CoraFull":
            datasetroot = CoraFull(root=root,
                                   transform=T.NormalizeFeatures()).shuffle()
            # train_mask, val_mask, test_mask = DataSampler(trainValRatio, datasetroot.data.num_nodes)
            # DataMask = {}
            # DataMask['train_mask'] = train_mask
            # DataMask['val_mask'] = val_mask
            # DataMask['test_mask'] = test_mask
            criterion = nn.CrossEntropyLoss()
            num_features = datasetroot.num_features
            num_classes = datasetroot.num_classes
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size,
                                         shuffle=False)
        elif dataset == 'ENZYMES' or dataset == 'MUTAG':
            # NOTE(review): this branch never sets `criterion`; a later
            # `criterion.to(device)` would NameError/reuse a stale one —
            # confirm intended loss for TU datasets.
            datasetroot = TUDataset(root, name=dataset, use_node_attr=True)
            trainloader = DataLoader(datasetroot, batch_size=Batch_size,
                                     shuffle=True)
            num_features = datasetroot.num_features
            num_classes = datasetroot.num_classes
        elif dataset == "PPI":
            train_dataset = PPI(root, split='train')
            val_dataset = PPI(root, split='val')
            test_dataset = PPI(root, split='test')
            trainloader = DataListLoader(train_dataset, batch_size=Batch_size,
                                         shuffle=True)
            valloader = DataListLoader(val_dataset, batch_size=100,
                                       shuffle=False)
            testloader = DataListLoader(test_dataset, batch_size=100,
                                        shuffle=False)
            num_classes = train_dataset.num_classes
            num_features = train_dataset.num_features
            criterion = torch.nn.BCEWithLogitsLoss()
        elif dataset == "Reddit":
            datasetroot = Reddit(root)
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=2,
                                        shuffle=False)
        elif dataset == "Amazon":
            datasetroot = Amazon(root, "Photo", transform=None,
                                 pre_transform=None)
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100,
                                        shuffle=False)
        elif dataset == 'MNIST':
            datasetroot = MNISTSuperpixels(root=root, transform=T.Cartesian())
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100,
                                        shuffle=False)
        elif dataset == 'CIFAR10':
            pass
        else:
            raise Exception("Input wrong datatset!!")

        # Build the (pre-training) model with contracted layer widths.
        width = ContractionLayerCoefficients(num_features, *params[1:3])
        net = ChooseModel(modelName, num_features, num_classes, width)
        FileName = "{}-{}-param_{}_{}_{}_{}-monte_{}".format(
            dataset, modelName, interval, WeightCorrectionCoeffi,
            StartTopoCoeffi, VectorPairs, Monte_iter)
        print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        criterion = criterion.to(device)
        net = DataParallel(net)
        net = net.to(device)
        optimizer = getattr(optim, optimizerName)(net.parameters(),
                                                  lr=params[3], momentum=0.9,
                                                  weight_decay=5e-4)
        model_to_save = './checkpoint/{}-{}-param_{}_{}_{}_{}-ckpt.pth'.format(
            dataset, modelName, params[0], params[1], params[5], params[4])
        # NOTE(review): `resume` is not defined in this function — presumably
        # a module-level flag; confirm it exists at call time.
        if resume == "True" and os.path.exists(model_to_save):
            [net, optimizer, TrainConvergence, TestConvergence,
             Acc] = ResumeModel(net, optimizer, model_to_save)
            start_epoch = len(TrainConvergence)
        else:
            start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        # cudnn.benchmark = True

        # NOTE(review): `logging` is called as a function here — presumably a
        # project helper shadowing the stdlib module; confirm.
        logging('dataset:{}, Batch size: {}, Number of layers:{} ConCoeff: {}, LR:{}, MonteSize:{}'.format(
            dataset, params[0], params[1], params[2], params[3], Monte_iter))
        mark = "{}{}Convergence/DiagElement-{}".format(savepath, dataset, FileName)
        markweights = "{}{}Convergence/WeightChanges-{}".format(savepath, dataset, FileName)

        # Phase 1: pre-train the full-width model.
        PreTrainConvergence, PreTestConvergence, PreAcc = TrainPart(
            start_epoch, num_pre_epochs, num_classes, trainloader, net,
            optimizer, criterion, NumCutoff, LinkPredictionMethod, VectorPairs,
            WeightCorrectionCoeffi, StartTopoCoeffi, mark, markweights,
            model_to_save, False)
        print('dataset: {}, model name:{}, epoch:{},Pre-train error:{}; Pre-test error:{}; test acc:{}'.format(
            dataset, modelName, num_pre_epochs, PreTrainConvergence[-1],
            PreTestConvergence[-1], PreAcc))

        # Phase 2: shrink layer widths and rebuild the model.
        NewNetworksize = RetainNetworkSize(net, params[2])
        OptimizedNet = ChooseModel(modelName, num_features, num_classes,
                                   NewNetworksize[0:-1])
        NewNetworksize.insert(0, num_features)
        NewNetworkSizeAdjust.append(NewNetworksize[0:-1])
        print(NewNetworkSizeAdjust)
        # OptimizedNet.apply(init_weights)

        OptimizedNet = DataParallel(OptimizedNet)
        OptimizedNet = OptimizedNet.to(device)
        cudnn.benchmark = True

        # Begin training of the resized model with a fresh optimizer.
        if optimizerName == "SGD":
            optimizerNew = getattr(optim, optimizerName)(
                OptimizedNet.parameters(), lr=params[3], momentum=0.9,
                weight_decay=5e-4)
        elif optimizerName == "Adam":
            optimizerNew = getattr(optim, optimizerName)(
                OptimizedNet.parameters(), lr=params[3], betas=(0.9, 0.999),
                eps=1e-08, weight_decay=5e-4, amsgrad=False)

        TrainConvergence, TestConvergence, TestAcc = TrainPart(
            start_epoch, num_epochs, datasetroot.num_classes, trainloader,
            OptimizedNet, optimizerNew, criterion, NumCutoff,
            LinkPredictionMethod, VectorPairs, WeightCorrectionCoeffi,
            StartTopoCoeffi, mark, markweights, model_to_save, True)
        np.save("{}/{}Convergence/AlgebraicConectivityTrainConvergence-{}".format(
            savepath, dataset, FileName), TrainConvergence)
        np.save("{}/{}Convergence/AlgebraicConectivityTestConvergence-{}".format(
            savepath, dataset, FileName), TestConvergence)
        # np.save("{}/{}Convergence/NewNetworkSizeAdjust-{}".format(savepath, dataset, FileName), NewNetworkSizeAdjust)
        # torch.cuda.empty_cache()

        # FIX: the summary format string had 6 placeholders for 7 args, so
        # every label after "network size" showed the wrong value and
        # TestAcc was silently dropped; added the missing epoch slot.
        print('dataset: {}, model name:{}, resized network size: {}, epoch:{}, the train error: {},test error: {}, test acc:{}\n'.format(
            dataset, modelName, NewNetworksize[0:-1], num_epochs,
            TrainConvergence[-1], TestConvergence[-1], TestAcc))
        # FIX: previously `np.save(..., TestAccs.append(TestAcc))` saved None
        # (list.append returns None) and then appended a second time; append
        # once, then persist the accumulated list.
        TestAccs.append(TestAcc)
        np.save("{}/{}Convergence/AlgebraicConectivityMeanTestAccs-{}".format(
            savepath, dataset, FileName), TestAccs)
        print_nvidia_useage()
y = torch.cat([data.y for data in data]).to(output.device) loss = F.nll_loss(output, y) epoch_test_loss += loss.detach().item() pred = output.max(dim=1)[1] correct += pred.eq(y).sum().item() return epoch_test_loss / len(test_dataset), correct / len(test_dataset) t0 = time.time() num_features = train_dataset.num_features n_classes = train_dataset.num_classes #model = GCN_PYG(num_features, 96, n_classes, 0) model = GAT_PYG(num_features, n_classes, 32, 8, 0.5) print('Let\'s use', torch.cuda.device_count(), 'GPUs!') model = DataParallel(model, [1]) device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu') model = model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay = 0) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=25, verbose=True) dur = [] for epoch in range(1, 201): t1 = time.time() train_loss, train_acc, optimizer= train_pyg('gcn', model, train_loader, device, optimizer) dur.append(time.time() - t1)
def test_model(model_path, dataset_param, model_param, test_dataset_param,
               batch_size=32, layer_num=None,
               metric_name_list=['relative_loss', 'mse_loss', 'accuracy',
                                 'post_relative_loss', 'post_mse_loss',
                                 'post_accuracy', 'layer_num', ]):
    """Evaluate a saved checkpoint on a test dataset and dump raw metrics.

    Loads the model from `model_path`, rebuilds the test dataset from
    `test_dataset_param`, runs `evaluate`, and writes one
    `raw_<metric>[_<layer_num>].csv` per metric (rows: labels, metric
    values) under a performance directory mirroring the log layout.
    Evaluation is skipped entirely if every output CSV already exists.

    :param model_path: path to the checkpoint file (dict with
        'model_state_dict')
    :param layer_num: when given, overrides the GNN depth on the loaded
        model and suffixes the output filenames
    NOTE(review): metric_name_list uses a mutable default; it is only
    iterated here, so this is safe, but a tuple default would be cleaner.
    """
    print(model_path)
    print(dataset_param, model_param, test_dataset_param)
    model_filename = osp.basename(model_path)
    new_log_dir = osp.dirname(model_path)
    log_dir_base = osp.basename(new_log_dir)
    log_dir = osp.dirname(osp.dirname(new_log_dir))

    # FIX: replaced "if not exists: mkdir" with makedirs(exist_ok=True) —
    # race-free and tolerant of pre-existing directories.
    performance_dir = osp.join(log_dir, 'performance')
    os.makedirs(performance_dir, exist_ok=True)
    performance_dir = dataset_param2path(performance_dir, test_dataset_param)
    new_performance_dir = osp.join(performance_dir, log_dir_base)
    os.makedirs(new_performance_dir, exist_ok=True)

    parallel_flag = False
    if torch.cuda.is_available():
        device = torch.device('cuda')
        # parallel_flag = torch.cuda.device_count() > 1
    else:
        device = torch.device('cpu')

    cur_model_path = os.path.join(new_log_dir, model_filename)
    cur_performance_dir = osp.join(new_performance_dir, model_filename)
    os.makedirs(cur_performance_dir, exist_ok=True)

    # Only evaluate if at least one expected output CSV is missing.
    if not all([osp.exists(osp.join(
            cur_performance_dir,
            ('raw_%s' % metric_name) +
            ('' if layer_num is None else '_' + str(layer_num)) + '.csv'))
            for metric_name in metric_name_list]):
        with torch.no_grad():
            test_dataset = param2dataset(test_dataset_param, train_flag=False)
            data_loader_fn = DataListLoader if parallel_flag else DataLoader
            test_data_loader = data_loader_fn(test_dataset, batch_size)
            net = param2model(test_dataset, model_param)
            net = net.to(device)
            checkpoint = torch.load(cur_model_path, map_location=device)
            net.load_state_dict(checkpoint['model_state_dict'])
            if layer_num is not None:
                # Single-module models expose `gnn_module`; fall back to a
                # per-module list when that attribute is absent.
                try:
                    net.gnn_module.layer_num = layer_num
                except AttributeError:
                    for i in range(len(net.gnn_module_list)):
                        net.gnn_module_list[i].layer_num = layer_num
            if parallel_flag:
                net = DataParallel(net)
            test_metric_list, y_list = evaluate(
                net, test_data_loader, device, parallel_flag=parallel_flag,
                # post_processing_flag=False,
                metric_name_list=metric_name_list)
            for metric_name, metric_list in test_metric_list.items():
                record = np.array([y_list, metric_list])
                np.savetxt(osp.join(
                    cur_performance_dir,
                    ('raw_%s' % metric_name) +
                    ('' if layer_num is None else '_' + str(layer_num)) +
                    '.csv'), record)
            del test_dataset, test_data_loader
# gamma: original image ratio for sigma, gamma in tqdm( product( (0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 1, 2, 5, 10), (0, 0.1, 0.3, 0.5, 0.75, 0.9, 0.99), ) ): print( colorama.Fore.MAGENTA + "Testing config - sigma: %.3E, gamma: %.3E" % (sigma, gamma) ) model = NaiveBilateralFilter(fin=6, sigma=sigma, gamma=gamma, k=32) if parallel: model = DataParallel(model, device_ids=gpu_ids, output_device=gpu_id).to( device ) else: model = model.to(device) model.eval() total_mse, total_psnr, orig_psnr = evaluate(model, train_loader, 0) records[sigma][gamma] = (total_mse, total_psnr) if total_psnr > max_psnr: best_sigma, best_gamma = sigma, gamma max_psnr, min_mse = total_psnr, total_mse print( colorama.Fore.GREEN + "Max PSNR: %.3f, min MSE: %.3f, ORIG-MSE: %.3f@ sigma: %.3f, gamma: %.3f"
def test_data_parallel():
    """Scattering four graphs over duplicated device ids yields three chunks."""
    parallel_module = DataParallel(None)
    node_counts = (2, 3, 10, 4)
    graphs = [Data(x=torch.randn(num_nodes, 1)) for num_nodes in node_counts]
    chunks = parallel_module.scatter(graphs, device_ids=[0, 1, 0, 1])
    assert len(chunks) == 3
DataLoader(datasets[2][test_size:test_size + train_size], batch_size=graphsPerBatch) ] #print('Can we use GPU? ',torch.cuda.is_available()) # Select which GPUs we can see #os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3" device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #print(device.current_device()) myGCN = multiViewGCN(2, 4, device) # Use multiple GPUs if we can if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") myGCN = DataParallel(myGCN) optimizer = torch.optim.Adam(myGCN.parameters(), lr=0.0005) #, weight_decay=5e-4) nEpochs = 20 def train(): myGCN.train() loss_all = 0 loss_func = torch.nn.CrossEntropyLoss() for data0, data1, data2 in zip(train_loader[0], train_loader[1], train_loader[2]): data0 = data0.to(device)
3)).view(-1, 3) x1 = self.conv1(pos, batch) x2 = self.conv2(x1, batch) x3 = self.conv3(x2, batch) x4 = self.lin1(x3) x5 = global_max_pool(x4, batch) x6 = x5.repeat([1, 2048]).view(-1, 1024) x7 = torch.cat([x2, x3, x6], dim=1) out = self.mlp(x7) return F.log_softmax(out, dim=1) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = Net(train_dataset.num_classes, k=30) model = DataParallel(model).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.001) scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.8) def train(): model.train() total_loss = correct_nodes = total_nodes = 0 for i, data_list in enumerate(train_loader): optimizer.zero_grad() out = model(data_list) y = torch.cat([data.y for data in data_list]).to(device) loss = F.nll_loss(out, y) loss.backward() optimizer.step()
def train_cross_validation(model_cls, dataset, dropout=0.0, lr=1e-3,
                           weight_decay=1e-2, num_epochs=200, n_splits=10,
                           use_gpu=True, dp=False, ddp=False, comment='',
                           tb_service_loc='192.168.192.57:6007', batch_size=1,
                           num_workers=0, pin_memory=False, cuda_device=None,
                           tb_dir='runs', model_save_dir='saved_models',
                           res_save_dir='res', fold_no=None,
                           saved_model_path=None, device_ids=None, patience=20,
                           seed=None, fold_seed=None, save_model=False,
                           is_reg=True, live_loss=True, domain_cls=True,
                           final_cls=True):
    """Multi-site cross-validated training with optional domain classification.

    Trains `model_cls` across folds produced by `multi_site_cv_split`,
    logging losses/accuracy to TensorBoard, early-stopping on validation
    NLL, and optionally checkpointing models by accuracy bracket.

    :type fold_seed: int
    :param live_loss: bool, draw live loss plots via PlotLosses
    :param is_reg: bool, add the model's regularization term to the loss
    :param save_model: bool
    :param seed: random seed; derived from wall clock when None
    :param patience: for early stopping (on validation NLL)
    :param device_ids: for dp
    :param saved_model_path: warm-start state_dict path
    :param fold_no: int, when set train only this fold
    :param ddp_port: str
    :param ddp: DDP (accepted but not used in this body — TODO confirm)
    :param cuda_device: list of int
    :param pin_memory: bool, DataLoader args
    :param num_workers: int, DataLoader args
    :param model_cls: pytorch Module cls
    :param dataset: instance
    :param dropout: float (accepted but not forwarded here — TODO confirm)
    :param lr: float
    :param weight_decay:
    :param num_epochs:
    :param n_splits: number of kFolds
    :param use_gpu: bool
    :param dp: bool
    :param comment: comment in the logs, to filter runs in tensorboard
    :param tb_service_loc: tensorboard service location
    :param batch_size: Dataset args not DataLoader
    :return:
    """
    saved_args = locals()
    # Seed all RNGs so runs are reproducible; keep the effective seed in the
    # logged args.
    seed = int(time.time() % 1e4 * 1e5) if seed is None else seed
    saved_args['random_seed'] = seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    if use_gpu:
        torch.cuda.manual_seed_all(seed)
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False
    model_name = model_cls.__name__

    # Device selection: explicit override > first dp device id > auto.
    if not cuda_device:
        if device_ids and dp:
            device = device_ids[0]
        else:
            device = torch.device(
                'cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
    else:
        device = cuda_device

    device_count = torch.cuda.device_count() if dp else 1
    device_count = len(device_ids) if (device_ids is not None
                                      and dp) else device_count
    # Scale the effective batch size by the number of data-parallel devices.
    batch_size = batch_size * device_count

    # TensorBoard
    log_dir_base = get_model_log_dir(comment, model_name)
    if tb_service_loc is not None:
        print("TensorBoard available at http://{1}/#scalars®exInput={0}".format(
            log_dir_base, tb_service_loc))
    else:
        print("Please set up TensorBoard")

    # model
    criterion = nn.NLLLoss()

    print("Training {0} {1} models for cross validation...".format(
        n_splits, model_name))
    # 1
    # folds, fold = KFold(n_splits=n_splits, shuffle=False, random_state=seed), 0
    # 2
    # folds = GroupKFold(n_splits=n_splits)
    # iter = folds.split(np.zeros(len(dataset)), groups=dataset.data.site_id)
    # 4
    # folds = StratifiedKFold(n_splits=n_splits, random_state=fold_seed, shuffle=True if fold_seed else False)
    # iter = folds.split(np.zeros(len(dataset)), dataset.data.y.numpy(), groups=dataset.data.subject_id)
    # 5
    fold = 0
    # NOTE: `iter` shadows the builtin within this function.
    iter = multi_site_cv_split(dataset.data.y, dataset.data.site_id,
                               dataset.data.subject_id, n_splits,
                               random_state=fold_seed,
                               shuffle=True if fold_seed else False)

    for train_idx, val_idx in tqdm_notebook(iter, desc='CV', leave=False):
        fold += 1
        liveloss = PlotLosses() if live_loss else None

        # for a specific fold
        if fold_no is not None:
            if fold != fold_no:
                continue

        writer = SummaryWriter(log_dir=osp.join('runs',
                                                log_dir_base + str(fold)))
        # NOTE: rebinds the `model_save_dir` parameter to a per-fold path.
        model_save_dir = osp.join('saved_models', log_dir_base + str(fold))

        print("creating dataloader tor fold {}".format(fold))
        # Normalize using train-fold statistics, then split.
        train_dataset, val_dataset = norm_train_val(dataset, train_idx,
                                                    val_idx)

        model = model_cls(writer)

        # Identity collate: the model itself consumes the raw data list.
        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=batch_size,
                                      collate_fn=lambda data_list: data_list,
                                      num_workers=num_workers,
                                      pin_memory=pin_memory)
        val_dataloader = DataLoader(val_dataset,
                                    shuffle=False,
                                    batch_size=batch_size,
                                    collate_fn=lambda data_list: data_list,
                                    num_workers=num_workers,
                                    pin_memory=pin_memory)

        if fold == 1 or fold_no is not None:
            print(model)

        writer.add_text('model_summary', model.__repr__())
        writer.add_text('training_args', str(saved_args))

        optimizer = torch.optim.AdamW(model.parameters(),
                                      lr=lr, betas=(0.9, 0.999), eps=1e-08,
                                      weight_decay=weight_decay,
                                      amsgrad=False)
        # scheduler_reduce = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)
        # NOTE: scheduler.step is commented out below, so the scheduler is
        # only read for learning-rate logging.
        scheduler = GradualWarmupScheduler(optimizer,
                                           multiplier=10,
                                           total_epoch=5)
        # scheduler = scheduler_reduce
        # optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)

        if dp and use_gpu:
            model = model.cuda() if device_ids is None else model.to(
                device_ids[0])
            model = DataParallel(model, device_ids=device_ids)
        elif use_gpu:
            model = model.to(device)

        if saved_model_path is not None:
            model.load_state_dict(torch.load(saved_model_path))

        # best_map tracks best accuracy; best_score the lowest validation NLL.
        best_map, patience_counter, best_score = 0.0, 0, np.inf
        for epoch in tqdm_notebook(range(1, num_epochs + 1),
                                   desc='Epoch', leave=False):
            logs = {}
            # scheduler.step(epoch=epoch, metrics=best_score)

            for phase in ['train', 'validation']:

                if phase == 'train':
                    model.train()
                    dataloader = train_dataloader
                else:
                    model.eval()
                    dataloader = val_dataloader

                # Logging
                running_total_loss = 0.0
                running_corrects = 0
                running_reg_loss = 0.0
                running_nll_loss = 0.0
                epoch_yhat_0, epoch_yhat_1 = torch.tensor([]), torch.tensor([])
                epoch_label, epoch_predicted = torch.tensor([]), torch.tensor(
                    [])
                logging_hist = True if phase == 'train' else False  # once per epoch

                for data_list in tqdm_notebook(dataloader, desc=phase,
                                               leave=False):

                    # TODO: check devices
                    if dp:
                        data_list = to_cuda(data_list, (device_ids[0]
                                                        if device_ids
                                                        is not None else 'cuda'))

                    # Model returns task logits, site (domain) logits, and a
                    # regularization term.
                    y_hat, domain_yhat, reg = model(data_list)

                    y = torch.tensor([], dtype=dataset.data.y.dtype,
                                     device=device)
                    domain_y = torch.tensor([],
                                            dtype=dataset.data.site_id.dtype,
                                            device=device)
                    for data in data_list:
                        y = torch.cat([y, data.y.view(-1).to(device)])
                        domain_y = torch.cat(
                            [domain_y, data.site_id.view(-1).to(device)])

                    loss = criterion(y_hat, y)
                    domain_loss = criterion(domain_yhat, domain_y)
                    # domain_loss = -1e-7 * domain_loss
                    # print(domain_loss.item())

                    # The three branches below are ordered so that when both
                    # flags are set, the combined-loss branch wins.
                    if domain_cls:
                        total_loss = domain_loss
                        _, predicted = torch.max(domain_yhat, 1)
                        label = domain_y
                    if final_cls:
                        total_loss = loss
                        _, predicted = torch.max(y_hat, 1)
                        label = y
                    if domain_cls and final_cls:
                        total_loss = (loss + domain_loss).sum()
                        _, predicted = torch.max(y_hat, 1)
                        label = y

                    if is_reg:
                        total_loss += reg.sum()

                    if phase == 'train':
                        # print(torch.autograd.grad(y_hat.sum(), model.saved_x, retain_graph=True))
                        optimizer.zero_grad()
                        total_loss.backward()
                        nn.utils.clip_grad_norm_(model.parameters(), 2.0)
                        optimizer.step()

                    running_nll_loss += loss.item()
                    running_total_loss += total_loss.item()
                    running_reg_loss += reg.sum().item()
                    running_corrects += (predicted == label).sum().item()

                    # Accumulate per-epoch tensors on CPU for metrics.
                    epoch_yhat_0 = torch.cat(
                        [epoch_yhat_0, y_hat[:, 0].detach().view(-1).cpu()])
                    epoch_yhat_1 = torch.cat(
                        [epoch_yhat_1, y_hat[:, 1].detach().view(-1).cpu()])
                    epoch_label = torch.cat(
                        [epoch_label, label.detach().float().view(-1).cpu()])
                    epoch_predicted = torch.cat([
                        epoch_predicted,
                        predicted.detach().float().view(-1).cpu()
                    ])

                # precision = sklearn.metrics.precision_score(epoch_label, epoch_predicted, average='micro')
                # recall = sklearn.metrics.recall_score(epoch_label, epoch_predicted, average='micro')
                # f1_score = sklearn.metrics.f1_score(epoch_label, epoch_predicted, average='micro')
                accuracy = sklearn.metrics.accuracy_score(
                    epoch_label, epoch_predicted)
                epoch_total_loss = running_total_loss / dataloader.__len__()
                epoch_nll_loss = running_nll_loss / dataloader.__len__()
                epoch_reg_loss = running_reg_loss / dataloader.__len__()
                # print('epoch {} {}_nll_loss: {}'.format(epoch, phase, epoch_nll_loss))

                writer.add_scalars(
                    'nll_loss', {'{}_nll_loss'.format(phase): epoch_nll_loss},
                    epoch)
                writer.add_scalars('accuracy',
                                   {'{}_accuracy'.format(phase): accuracy},
                                   epoch)
                # writer.add_scalars('{}_APRF'.format(phase),
                #                    {
                #                        'accuracy': accuracy,
                #                        'precision': precision,
                #                        'recall': recall,
                #                        'f1_score': f1_score
                #                    },
                #                    epoch)
                if epoch_reg_loss != 0:
                    # NOTE: 'reg_loss'.format(phase) is a no-op format on a
                    # placeholder-free string; the tag is just 'reg_loss'.
                    writer.add_scalars(
                        'reg_loss'.format(phase),
                        {'{}_reg_loss'.format(phase): epoch_reg_loss}, epoch)
                    # print(epoch_reg_loss)
                # writer.add_histogram('hist/{}_yhat_0'.format(phase),
                #                      epoch_yhat_0,
                #                      epoch)
                # writer.add_histogram('hist/{}_yhat_1'.format(phase),
                #                      epoch_yhat_1,
                #                      epoch)

                # Save Model & Early Stopping
                if phase == 'validation':
                    model_save_path = model_save_dir + '-{}-{}-{:.3f}-{:.3f}'.format(
                        model_name, epoch, accuracy, epoch_nll_loss)
                    # best score
                    if accuracy > best_map:
                        best_map = accuracy
                        model_save_path = model_save_path + '-best'

                    # Early-stopping score: lower validation NLL is better.
                    score = epoch_nll_loss
                    if score < best_score:
                        patience_counter = 0
                        best_score = score
                    else:
                        patience_counter += 1

                    # skip first 10 epoch
                    # best_score = best_score if epoch > 10 else -np.inf

                    if save_model:
                        # Suffix the checkpoint by accuracy bracket.
                        for th, pfix in zip(
                                [0.8, 0.75, 0.7, 0.5, 0.0],
                                ['-perfect', '-great', '-good', '-bad', '-miss']):
                            if accuracy >= th:
                                model_save_path += pfix
                                break
                        torch.save(model.state_dict(), model_save_path)

                    writer.add_scalars('best_val_accuracy',
                                       {'{}_accuracy'.format(phase): best_map},
                                       epoch)
                    writer.add_scalars(
                        'best_nll_loss',
                        {'{}_nll_loss'.format(phase): best_score}, epoch)
                    writer.add_scalars('learning_rate', {
                        'learning_rate':
                        scheduler.optimizer.param_groups[0]['lr']
                    }, epoch)

                    if patience_counter >= patience:
                        print("Stopped at epoch {}".format(epoch))
                        # NOTE: returning here abandons the remaining folds.
                        return

                if live_loss:
                    prefix = ''
                    if phase == 'validation':
                        prefix = 'val_'
                    logs[prefix + 'log loss'] = epoch_nll_loss
                    logs[prefix + 'accuracy'] = accuracy
            if live_loss:
                liveloss.update(logs)
                liveloss.draw()
    print("Done !")
def __init__(self, option, model, train_dataset, valid_dataset, test_dataset=None):
    """Set up the trainer: device, (optionally parallel) model, loaders,
    L1 criterion, Adam optimizer with plateau LR decay, and the log/ckpt/
    record save paths keyed by a random 4-character run id.

    :param option: dict with keys 'cuda_devices', 'parallel', 'train_batch',
        'lr', 'lr_scheduler_patience', 'task'
    :param model: pytorch Module instance
    :param test_dataset: optional; when omitted no test loader is built
    """
    self.option = option
    self.device = torch.device("cuda:{}".format(option['cuda_devices'][0]) \
        if torch.cuda.is_available() else "cpu")
    self.model = DataParallel(model, device_ids=self.option['cuda_devices']).to(self.device) \
        if option['parallel'] else model.to(self.device)

    # Setting the train valid and test data loader: list loaders feed
    # DataParallel, plain loaders feed a single device.
    if self.option['parallel']:
        self.train_dataloader = DataListLoader(train_dataset, \
            batch_size=self.option['train_batch'])
        self.valid_dataloader = DataListLoader(valid_dataset, batch_size=64)
        if test_dataset:
            self.test_dataloader = DataListLoader(test_dataset, batch_size=64)
    else:
        self.train_dataloader = DataLoader(train_dataset, \
            batch_size=self.option['train_batch'])
        self.valid_dataloader = DataLoader(valid_dataset, batch_size=64)
        if test_dataset:
            self.test_dataloader = DataLoader(test_dataset, batch_size=64)

    # Setting the Adam optimizer with hyper-param
    self.criterion = torch.nn.L1Loss()
    self.optimizer = torch.optim.Adam(self.model.parameters(),
                                      lr=self.option['lr'])
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer, mode='min', factor=0.7,
        patience=self.option['lr_scheduler_patience'], min_lr=0.0000001)

    # other
    self.start = time.time()
    self.save_id = ''.join(
        random.sample('zyxwvutsrqponmlkjihgfedcba1234567890', 4))
    self.abs_file_dir = os.path.dirname(os.path.abspath(__file__))
    self.ckpt_save_dir = os.path.join(
        self.abs_file_dir, 'ckpt',
        'ckpts_task{}_{}'.format(self.option['task'], self.save_id))
    self.log_save_path = os.path.join(
        self.abs_file_dir, 'log',
        'log_task{}_{}.txt'.format(self.option['task'], self.save_id))
    self.record_save_path = os.path.join(
        self.abs_file_dir, 'record',
        'record_task{}_{}.csv'.format(self.option['task'], self.save_id))
    # FIX: was os.system('mkdir -p log record {ckpt}') — that shelled out
    # and created 'log'/'record' relative to the CWD, while the log and
    # record files above are written under abs_file_dir. Create the actual
    # parent directories of every save path, portably and race-free.
    for directory in (os.path.dirname(self.log_save_path),
                      os.path.dirname(self.record_save_path),
                      self.ckpt_save_dir):
        os.makedirs(directory, exist_ok=True)
    self.records = {
        'trn_record': [],
        'val_record': [],
        'val_losses': [],
        'best_ckpt': None
    }
    self.log(
        msgs=['\t{}:{}\n'.format(k, v) for k, v in self.option.items()])
    self.log('save id: {}'.format(self.save_id))
    # FIX: len(test_dataset) raised TypeError whenever the optional
    # test_dataset was left as None; report 0 in that case.
    self.log('train set num:{} valid set num:{} test set num: {}'.format(
        len(train_dataset), len(valid_dataset),
        0 if test_dataset is None else len(test_dataset)))
    self.log("Total Parameters:" +
             str(sum([p.nelement() for p in self.model.parameters()])))
def model_training(data_list_train, data_list_test, epochs, acc_epoch, acc_epoch2, save_model_epochs, validation_epoch, batchsize, logfilename, load_checkpoint= None):
    """Train completeNet on graph batches and periodically validate/checkpoint.

    Args:
        data_list_train / data_list_test: lists of graph samples fed to
            DataListLoader (DataParallel-style batching).
        epochs: last epoch number; training runs epoch 1..epochs inclusive
            (or resumes from the checkpoint's epoch + 1).
        acc_epoch: training accuracy (Hungarian cleanup) is computed only on
            epochs divisible by this value.
        acc_epoch2: same, for validation accuracy.
        save_model_epochs: a checkpoint is written every this many epochs.
        validation_epoch: validation runs every this many epochs.
        batchsize: batch size for both loaders.
        logfilename: file name under ./logfiles/ for the text log.
        load_checkpoint: optional checkpoint path to resume from.

    NOTE(review): the collapsed source left some loop nesting ambiguous;
    indentation below is a faithful reconstruction -- verify against VCS.
    """
    #logging
    logging.basicConfig(level=logging.DEBUG, filename='./logfiles/'+logfilename, filemode="w+", format="%(message)s")
    trainloader = DataListLoader(data_list_train, batch_size=batchsize, shuffle=True)
    testloader = DataListLoader(data_list_test, batch_size=batchsize, shuffle=True)
    device = torch.device('cuda')  # assumes a CUDA device is present -- TODO confirm
    complete_net = completeNet()
    complete_net = DataParallel(complete_net)
    complete_net = complete_net.to(device)
    #train parameters
    # NOTE(review): class weights for weighted_binary_cross_entropy are
    # hard-coded; presumably [positive, negative] -- confirm with that helper.
    weights = [10, 1]
    optimizer = torch.optim.Adam(complete_net.parameters(), lr=0.001, weight_decay=0.001)
    #resume training
    initial_epoch=1
    if load_checkpoint!=None:
        # Restore model/optimizer state and continue after the saved epoch.
        checkpoint = torch.load(load_checkpoint)
        complete_net.load_state_dict(checkpoint['model_state_dict'], strict=False)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        initial_epoch = checkpoint['epoch']+1
        loss = checkpoint['loss']
    complete_net.train()
    for epoch in range(initial_epoch, epochs+1):
        # Per-epoch counters: totals/corrects overall and per class (ones/zeros).
        epoch_total=0
        epoch_total_ones= 0
        epoch_total_zeros= 0
        epoch_correct=0
        epoch_correct_ones= 0
        epoch_correct_zeros= 0
        running_loss= 0
        batches_num=0
        for batch in trainloader:
            batch_total=0
            batch_total_ones= 0
            batch_total_zeros= 0
            batch_correct= 0
            batch_correct_ones= 0
            batch_correct_zeros= 0
            batches_num+=1
            # Forward-Backpropagation
            output, output2, ground_truth, ground_truth2, det_num, tracklet_num= complete_net(batch)
            optimizer.zero_grad()
            loss = weighted_binary_cross_entropy(output, ground_truth, weights)
            loss.backward()
            optimizer.step()
            ##Accuracy
            if epoch%acc_epoch==0 and epoch!=0:
                # Hungarian method, clean up
                # Assignment-cleaned predictions compared against ground_truth2.
                cleaned_output= hungarian(output2, ground_truth2, det_num, tracklet_num)
                batch_total += cleaned_output.size(0)
                ones= torch.tensor([1 for x in cleaned_output]).to(device)
                zeros = torch.tensor([0 for x in cleaned_output]).to(device)
                batch_total_ones += (cleaned_output == ones).sum().item()
                batch_total_zeros += (cleaned_output == zeros).sum().item()
                batch_correct += (cleaned_output == ground_truth2).sum().item()
                temp1 = (cleaned_output == ground_truth2)
                temp2 = (cleaned_output == ones)
                batch_correct_ones += (temp1 & temp2).sum().item()
                temp3 = (cleaned_output == zeros)
                batch_correct_zeros += (temp1 & temp3).sum().item()
                # Fold this batch's stats into the epoch totals.
                epoch_total += batch_total
                epoch_total_ones += batch_total_ones
                epoch_total_zeros += batch_total_zeros
                epoch_correct += batch_correct
                epoch_correct_ones += batch_correct_ones
                epoch_correct_zeros += batch_correct_zeros
            # NaN guard (x != x is true only for NaN): abort the batch loop.
            if loss.item()!=loss.item():
                print("Error")
                break
            if batch_total_ones != 0 and batch_total_zeros != 0 and epoch%acc_epoch==0 and epoch!=0:
                print('Epoch: [%d] | Batch: [%d] | Training_Loss: %.3f | Total_Accuracy: %.3f | Ones_Accuracy: %.3f | Zeros_Accuracy: %.3f |' % (epoch, batches_num, loss.item(), 100 * batch_correct / batch_total, 100 * batch_correct_ones / batch_total_ones, 100 * batch_correct_zeros / batch_total_zeros))
                logging.info('Epoch: [%d] | Batch: [%d] | Training_Loss: %.3f | Total_Accuracy: %.3f | Ones_Accuracy: %.3f | Zeros_Accuracy: %.3f |' % (epoch, batches_num, loss.item(), 100 * batch_correct / batch_total, 100 * batch_correct_ones / batch_total_ones, 100 * batch_correct_zeros / batch_total_zeros))
            else:
                print('Epoch: [%d] | Batch: [%d] | Training_Loss: %.3f |' % (epoch, batches_num, loss.item()))
                logging.info('Epoch: [%d] | Batch: [%d] | Training_Loss: %.3f |' % (epoch, batches_num, loss.item()))
            running_loss += loss.item()
        # Second NaN guard: if the batch loop was aborted on NaN, also stop
        # the epoch loop (the inner `break` only exits the batch loop).
        if loss.item()!=loss.item():
            print("Error")
            break
        if epoch_total_ones!=0 and epoch_total_zeros!=0 and epoch%acc_epoch==0 and epoch!=0:
            print('Epoch: [%d] | Training_Loss: %.3f | Total_Accuracy: %.3f | Ones_Accuracy: %.3f | Zeros_Accuracy: %.3f |' % (epoch, running_loss / batches_num, 100 * epoch_correct / epoch_total, 100 * epoch_correct_ones / epoch_total_ones, 100 * epoch_correct_zeros / epoch_total_zeros))
            logging.info('Epoch: [%d] | Training_Loss: %.3f | Total_Accuracy: %.3f | Ones_Accuracy: %.3f | Zeros_Accuracy: %.3f |' % (epoch, running_loss / batches_num, 100 * epoch_correct / epoch_total, 100 * epoch_correct_ones / epoch_total_ones, 100 * epoch_correct_zeros / epoch_total_zeros))
        else:
            print('Epoch: [%d] | Training_Loss: %.3f |' % (epoch, running_loss / batches_num))
            logging.info('Epoch: [%d] | Training_Loss: %.3f |' % (epoch, running_loss / batches_num))
        # save model
        if epoch%save_model_epochs==0 and epoch!=0:
            torch.save({
                'epoch': epoch,
                'model_state_dict': complete_net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': running_loss,
            }, './models/epoch_'+str(epoch)+'.pth')
        #validation
        # NOTE(review): the net stays in train() mode here (no eval() call),
        # so dropout/batch-norm behave as in training -- confirm intended.
        if epoch%validation_epoch==0 and epoch!=0:
            with torch.no_grad():
                # Reuse the same counters for the validation pass.
                epoch_total=0
                epoch_total_ones= 0
                epoch_total_zeros= 0
                epoch_correct=0
                epoch_correct_ones= 0
                epoch_correct_zeros= 0
                running_loss= 0
                batches_num=0
                for batch in testloader:
                    batch_total=0
                    batch_total_ones= 0
                    batch_total_zeros= 0
                    batch_correct= 0
                    batch_correct_ones= 0
                    batch_correct_zeros= 0
                    batches_num+=1
                    output, output2, ground_truth, ground_truth2, det_num, tracklet_num = complete_net(batch)
                    loss = weighted_binary_cross_entropy(output, ground_truth, weights)
                    running_loss += loss.item()
                    ##Accuracy
                    if epoch%acc_epoch2==0 and epoch!=0:
                        # Hungarian method, clean up
                        cleaned_output= hungarian(output2, ground_truth2, det_num, tracklet_num)
                        batch_total += cleaned_output.size(0)
                        ones= torch.tensor([1 for x in cleaned_output]).to(device)
                        zeros = torch.tensor([0 for x in cleaned_output]).to(device)
                        batch_total_ones += (cleaned_output == ones).sum().item()
                        batch_total_zeros += (cleaned_output == zeros).sum().item()
                        batch_correct += (cleaned_output == ground_truth2).sum().item()
                        temp1 = (cleaned_output == ground_truth2)
                        temp2 = (cleaned_output == ones)
                        batch_correct_ones += (temp1 & temp2).sum().item()
                        temp3 = (cleaned_output == zeros)
                        batch_correct_zeros += (temp1 & temp3).sum().item()
                        epoch_total += batch_total
                        epoch_total_ones += batch_total_ones
                        epoch_total_zeros += batch_total_zeros
                        epoch_correct += batch_correct
                        epoch_correct_ones += batch_correct_ones
                        epoch_correct_zeros += batch_correct_zeros
                if epoch_total_ones!=0 and epoch_total_zeros!=0 and epoch%acc_epoch2==0 and epoch!=0:
                    print('Epoch: [%d] | Validation_Loss: %.3f | Total_Accuracy: %.3f | Ones_Accuracy: %.3f | Zeros_Accuracy: %.3f |' % (epoch, running_loss / batches_num, 100 * epoch_correct / epoch_total, 100 * epoch_correct_ones / epoch_total_ones, 100 * epoch_correct_zeros / epoch_total_zeros))
                    logging.info('Epoch: [%d] | Validation_Loss: %.3f | Total_Accuracy: %.3f | Ones_Accuracy: %.3f | Zeros_Accuracy: %.3f |' % (epoch, running_loss / batches_num, 100 * epoch_correct / epoch_total, 100 * epoch_correct_ones / epoch_total_ones, 100 * epoch_correct_zeros / epoch_total_zeros))
                else:
                    print('Epoch: [%d] | Validation_Loss: %.3f |' % (epoch, running_loss / batches_num))
                    logging.info('Epoch: [%d] | Validation_Loss: %.3f |' % (epoch, running_loss / batches_num))
def GCN(dataset, params, num_pre_epochs, num_epochs, MonteSize, PruningTimes, width, lr, savepath):
    """Monte-Carlo train/prune loop for graph networks on several datasets.

    Args:
        dataset: one of 'Cora', 'ENZYMES', 'MUTAG', 'MNIST', 'CIFAR10'.
        params: [batch_size, num_layers, ConCoeff, CutoffCoeff] -- presumably;
            only params[0..3] are read here. TODO confirm meanings.
        num_pre_epochs: pre-training epochs before pruning.
        num_epochs: training epochs per pruning iteration.
        MonteSize: number of Monte-Carlo repetitions.
        PruningTimes: number of prune/retrain cycles.
        width: initial layer widths for the unpruned network.
        lr: SGD learning rate.
        savepath: directory prefix for the saved convergence arrays.

    NOTE(review): `resume` and `return_output` are not parameters or locals --
    they must be module-level globals. Also `resume` is compared to the bool
    True in the Cora/ENZYMES branches but to the string "True" in the
    MNIST/CIFAR10 branches; these cannot all be intended.
    """
    Batch_size = int(params[0])
    for Monte_iter in range(MonteSize):
        # Data
        best_loss = float('inf')  # best test loss
        start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        TrainConvergence = []
        TestConvergence = []

        # model
        root = '/git/data/GraphData/' + dataset  # hard-coded data root
        if dataset == 'Cora':
            model_name = "PruningGCN"
            datasetroot = Planetoid(root=root, name=dataset).shuffle()
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)
            model_to_save = './checkpoint/{}-{}-param_{}_{}_{}_{}-Monte_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], params[2], params[3], Monte_iter)
            if Monte_iter == 0:
                if resume == True and os.path.exists(model_to_save):
                    [OptimizedNet, NewNetworksize, TrainConvergence,
                     TestConvergence, start_epoch] = ResumeModel(model_to_save)
                    if start_epoch >= num_epochs - 1:
                        continue  # this Monte iteration already finished
                else:
                    net = Net(datasetroot, width)
                    #net.apply(weight_reset)
        elif dataset == 'ENZYMES' or dataset == 'MUTAG':
            model_name = "topk_pool_Net"
            datasetroot = TUDataset(root, name=dataset)
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)
            model_to_save = './checkpoint/{}-{}-param_{}_{}_{}_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], params[2], params[3])
            if Monte_iter == 0:
                if resume == True and os.path.exists(model_to_save):
                    [OptimizedNet, NewNetworksize, TrainConvergence,
                     TestConvergence, start_epoch] = ResumeModel(model_to_save)
                    if start_epoch >= num_epochs - 1:
                        continue
                else:
                    net = topk_pool_Net(datasetroot, width)
        elif dataset == 'MNIST':
            datasetroot = MNISTSuperpixels(root=root, transform=T.Cartesian()).shuffle()
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)
            model_name = 'SPlineNet'
            model_to_save = './checkpoint/{}-{}-param_{}_{}_{}_{}-Mon_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], params[2], params[3], Monte_iter)
            if resume == "True" and os.path.exists(model_to_save):
                [net, TrainConvergence, TestConvergence, start_epoch] = ResumeModel(model_to_save)
                if start_epoch >= num_epochs - 1:
                    continue
            else:
                #net=Net(datasetroot,width)
                net = SPlineNet(datasetroot, width)
        elif dataset == 'CIFAR10':
            # NOTE(review): in this branch `model_to_save` is read before it is
            # ever assigned (no loader/model_name set either) -- this path
            # raises NameError/UnboundLocalError as written.
            if resume == "True" and os.path.exists(model_to_save):
                [net, TrainConvergence, TestConvergence, start_epoch] = ResumeModel(model_to_save)
                if start_epoch >= num_epochs - 1:
                    continue
            else:
                net = getattr(CIFAR10_resnet, 'Resnet20_CIFAR10')(params[1])
        else:
            raise Exception(
                "The dataset is:{}, it isn't existed.".format(dataset))

        # Pre-train the full-width net once (first Monte iter, fresh start),
        # then measure which widths to retain before pruning.
        # NOTE(review): `net` is unbound here if the resume branch above was
        # taken without `continue`-ing -- verify resume semantics.
        if Monte_iter == 0 and start_epoch == 0:
            print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
            net = DataParallel(net)
            device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu')
            net = net.to(device)
            #cudnn.benchmark = True
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
            # NOTE(review): `logging` is called like a function here -- this
            # must be a project helper shadowing the stdlib logging module.
            logging(
                'Batch size: {}, Number of layers:{} ConCoeff: {}, CutoffCoffi:{}, MonteSize:{}'
                .format(params[0], params[1], params[2], params[3], Monte_iter))
            for epoch in range(num_pre_epochs):
                PreTrainLoss = train(trainloader, net, optimizer, criterion)
                print('\nEpoch: {}, Average pre-tain loss: {:.4f} \n'.format(
                    epoch, PreTrainLoss[0]))
            NewNetworksize = RetainNetworkSize(net, params[2])
            del net
            #NewNetworksize=width

        # Prune/retrain cycles: rebuild a smaller net from the retained sizes,
        # train it, and checkpoint whenever test loss improves.
        for pruningIter in range(PruningTimes):
            if pruningIter > 0:
                [OptimizedNet, NewNetworksize, TrainConvergence,
                 TestConvergence, start_epoch] = ResumeModel(model_to_save)
            elif dataset == 'Cora' and start_epoch == 0:
                OptimizedNet = Net(datasetroot, NewNetworksize[0:-1])
            elif dataset == 'ENZYMES' and start_epoch == 0:
                NewNetworkSizeAdjust = NewNetworksize[0:-1]
                NewNetworkSizeAdjust[0] = width[0] - 1
                OptimizedNet = topk_pool_Net(datasetroot, NewNetworkSizeAdjust)
            # NOTE(review): for MNIST/CIFAR10 (or start_epoch != 0) none of the
            # branches above assign OptimizedNet on the first pruning iteration.
            OptimizedNet = DataParallel(OptimizedNet)
            device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu')
            OptimizedNet = OptimizedNet.to(device)
            cudnn.benchmark = True
            criterionNew = nn.CrossEntropyLoss()
            optimizerNew = optim.SGD(OptimizedNet.parameters(), lr=lr,
                                     momentum=0.9, weight_decay=5e-4)
            criterion = nn.CrossEntropyLoss()
            for epoch in range(start_epoch, num_epochs):
                TrainLoss = train(trainloader, OptimizedNet, optimizerNew, criterionNew)
                print('\n Epoch: {}, Average tain loss: {:.4f} \n'.format(
                    epoch, TrainLoss[0]))
                TrainConvergence.append(statistics.mean(TrainLoss))
                NewNetworksize = RetainNetworkSize(OptimizedNet, params[2])
                TestConvergence.append(
                    statistics.mean(test(testloader, OptimizedNet, criterion)))

                # save model
                # NOTE(review): TestConvergence[epoch] assumes the list index
                # matches the epoch number; after a resume (start_epoch > 0)
                # the fresh list makes this index off -- verify.
                if TestConvergence[epoch] < best_loss:
                    logging('Saving..')
                    state = {
                        'net': OptimizedNet.module,
                        'TrainConvergence': TrainConvergence,
                        'TestConvergence': TestConvergence,
                        'epoch': num_epochs,
                        'NewNetworksize': NewNetworksize[0:-1],
                    }
                    if not os.path.isdir('checkpoint'):
                        os.mkdir('checkpoint')
                    torch.save(state, model_to_save)
                    best_loss = TestConvergence[epoch]

                ## save recurrence plots
                """if epoch%20==0:
                    save_recurrencePlots_file="../Results/RecurrencePlots/RecurrencePlots_{}_{}_BatchSize{} \_ConCoeffi{}_epoch{}.png".format(dataset, model_name,params[0],params[1],epoch)
                    save_recurrencePlots(net,save_recurrencePlots_file)"""
            FileName = "{}-{}-param_{}_{}_{}_{}-monte_{}".format(
                dataset, model_name, params[0], params[1], params[2], params[3], Monte_iter)
            np.save(savepath + 'TrainConvergence-' + FileName, TrainConvergence)
            #np.save(savepath+'TestConvergence-'+FileName,TestConvergence)
            #torch.cuda.empty_cache()
            print_nvidia_useage()

    # NOTE(review): `net` was `del`-ed above, so `net.module.fc.weight` here
    # raises if return_output is true; `return_output` is a global.
    if return_output == True:
        return TestConvergence[-1], net.module.fc.weight
    else:
        pass