def train(self):
  ### create data loader
  train_dataset = eval(self.dataset_conf.loader_name)(self.config, self.graphs_train, tag='train')
  train_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=self.train_conf.shuffle,
      num_workers=self.train_conf.num_workers,
      collate_fn=train_dataset.collate_fn,
      drop_last=False)

  # create models
  model = eval(self.model_conf.name)(self.config)
  if self.use_gpu:
    model = DataParallel(model, device_ids=self.gpus).to(self.device)

  # create optimizer
  params = filter(lambda p: p.requires_grad, model.parameters())
  if self.train_conf.optimizer == 'SGD':
    optimizer = optim.SGD(
        params,
        lr=self.train_conf.lr,
        momentum=self.train_conf.momentum,
        weight_decay=self.train_conf.wd)
  elif self.train_conf.optimizer == 'Adam':
    optimizer = optim.Adam(params, lr=self.train_conf.lr, weight_decay=self.train_conf.wd)
  else:
    raise ValueError("Non-supported optimizer!")

  early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
  lr_scheduler = optim.lr_scheduler.MultiStepLR(
      optimizer,
      milestones=self.train_conf.lr_decay_epoch,
      gamma=self.train_conf.lr_decay)

  # reset gradient
  optimizer.zero_grad()

  # resume training
  resume_epoch = 0
  if self.train_conf.is_resume:
    model_file = os.path.join(self.train_conf.resume_dir, self.train_conf.resume_model)
    load_model(
        model.module if self.use_gpu else model,
        model_file,
        self.device,
        optimizer=optimizer,
        scheduler=lr_scheduler)
    resume_epoch = self.train_conf.resume_epoch

  # Training Loop
  iter_count = 0
  results = defaultdict(list)
  for epoch in range(resume_epoch, self.train_conf.max_epoch):
    model.train()
    lr_scheduler.step()
    train_iterator = iter(train_loader)

    for inner_iter in range(len(train_loader) // self.num_gpus):
      optimizer.zero_grad()

      # draw one mini-batch per GPU; tensors are moved to their device below
      batch_data = []
      if self.use_gpu:
        for _ in self.gpus:
          data = next(train_iterator)
          batch_data.append(data)
          iter_count += 1

      avg_train_loss = .0
      for ff in range(self.dataset_conf.num_fwd_pass):
        batch_fwd = []

        if self.use_gpu:
          for dd, gpu_id in enumerate(self.gpus):
            data = {}
            data['adj'] = batch_data[dd][ff]['adj'].pin_memory().to(gpu_id, non_blocking=True)
            data['edges'] = batch_data[dd][ff]['edges'].pin_memory().to(gpu_id, non_blocking=True)
            data['node_idx_gnn'] = batch_data[dd][ff]['node_idx_gnn'].pin_memory().to(gpu_id, non_blocking=True)
            data['node_idx_feat'] = batch_data[dd][ff]['node_idx_feat'].pin_memory().to(gpu_id, non_blocking=True)
            data['label'] = batch_data[dd][ff]['label'].pin_memory().to(gpu_id, non_blocking=True)
            data['att_idx'] = batch_data[dd][ff]['att_idx'].pin_memory().to(gpu_id, non_blocking=True)
            data['subgraph_idx'] = batch_data[dd][ff]['subgraph_idx'].pin_memory().to(gpu_id, non_blocking=True)
            batch_fwd.append((data,))

        if batch_fwd:
          train_loss = model(*batch_fwd).mean()
          avg_train_loss += train_loss

          # assign gradient
          train_loss.backward()

      # clip_grad_norm_(model.parameters(), 5.0e-0)
      optimizer.step()
      avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

      # reduce
      train_loss = float(avg_train_loss.data.cpu().numpy())

      self.writer.add_scalar('train_loss', train_loss, iter_count)
      results['train_loss'] += [train_loss]
      results['train_step'] += [iter_count]

      if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
        logger.info("NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(epoch + 1, iter_count, train_loss))

    # snapshot model
    if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
      logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
      snapshot(
          model.module if self.use_gpu else model,
          optimizer,
          self.config,
          epoch + 1,
          scheduler=lr_scheduler)

  pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
  self.writer.close()
  return 1
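# The snapshot(...) / load_model(...) calls above rely on checkpoint helpers that are
# not shown in this excerpt. A minimal sketch of what they are assumed to do, based
# only on how they are called here (the repo's actual utilities may differ):
import os
import torch


def snapshot(model, optimizer, config, epoch, scheduler=None, tag=None):
  # bundle model / optimizer / (optional) scheduler state and write it to save_dir
  state = {
      'model': model.state_dict(),
      'optimizer': optimizer.state_dict(),
      'epoch': epoch,
  }
  if scheduler is not None:
    state['scheduler'] = scheduler.state_dict()
  name = 'model_snapshot_{}.pth'.format(tag if tag is not None else '{:07d}'.format(epoch))
  torch.save(state, os.path.join(config.save_dir, name))


def load_model(model, file_name, device=None, optimizer=None, scheduler=None):
  # restore the states written by snapshot(), mapping tensors onto the given device
  state = torch.load(file_name, map_location=device)
  model.load_state_dict(state['model'])
  if optimizer is not None and 'optimizer' in state:
    optimizer.load_state_dict(state['optimizer'])
  if scheduler is not None and 'scheduler' in state:
    scheduler.load_state_dict(state['scheduler'])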
def train(self):
  # create data loader
  train_dataset = eval(self.dataset_conf.loader_name)(self.config, split='train')
  dev_dataset = eval(self.dataset_conf.loader_name)(self.config, split='dev')
  train_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=self.train_conf.shuffle,
      num_workers=self.train_conf.num_workers,
      collate_fn=train_dataset.collate_fn,
      drop_last=False)
  subset_indices = range(self.subsample_size)
  train_loader_sub = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=False,
      num_workers=self.train_conf.num_workers,
      collate_fn=train_dataset.collate_fn,
      drop_last=False,
      sampler=SubsetRandomSampler(subset_indices))
  dev_loader_sub = torch.utils.data.DataLoader(
      dev_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=False,
      num_workers=self.train_conf.num_workers,
      collate_fn=dev_dataset.collate_fn,
      drop_last=False,
      sampler=SubsetRandomSampler(subset_indices))

  # create models
  model = eval(self.model_conf.name)(self.model_conf)
  if self.use_gpu:
    model = nn.DataParallel(model, device_ids=self.gpus).cuda()

  # create optimizer
  params = filter(lambda p: p.requires_grad, model.parameters())
  if self.train_conf.optimizer == 'SGD':
    optimizer = optim.SGD(
        params,
        lr=self.train_conf.lr,
        momentum=self.train_conf.momentum,
        weight_decay=self.train_conf.wd)
  elif self.train_conf.optimizer == 'Adam':
    optimizer = optim.Adam(params, lr=self.train_conf.lr, weight_decay=self.train_conf.wd)
  else:
    raise ValueError("Non-supported optimizer!")

  early_stop = EarlyStopper([0.0], win_size=10, is_decrease=False)
  lr_scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=self.warmup_setps)

  # reset gradient
  optimizer.zero_grad()

  # resume training or start from a pretrained model
  if self.train_conf.is_resume:
    model_snapshot = torch.load(self.train_conf.resume_model, map_location=self.device)
    if self.train_conf.pretrain:
      # pretrained weights may not cover every parameter, so load non-strictly
      model.load_state_dict(model_snapshot["model"], strict=False)
    else:
      model.load_state_dict(model_snapshot["model"], strict=True)
    model.to(self.device)

  # Training Loop
  num_train = len(train_dataset)
  iter_count = 0
  best_val_loss = np.inf
  best_val_loss_test = np.inf
  best_win_pct_val = 0
  best_win_pct_val_test = 0
  results = defaultdict(list)

  for epoch in range(self.train_conf.max_epoch):
    # ------------------------------- validation -------------------------------
    if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:
      # calculate validation loss
      model.eval()
      with torch.no_grad():
        result_dataset_val = self.cal_dataset_loss(model, dev_loader_sub)

      if self.is_val:
        logger.info(
            "-----------------Avg. Validation Loss = {:.4f}, "
            "NMLL = {:.4f}, NMLL_opt = {:.4f}, Win_pct = {:.2f}%, "
            "NMLL_test = {:.4f}, NMLL_test_opt = {:.4f}, "
            "Win_pct_test = {:.2f}%--------------------".format(
                result_dataset_val['loss'], result_dataset_val['nmll'],
                result_dataset_val['nmll_opt_sm'],
                result_dataset_val['win_pct_ai'] * 100,
                result_dataset_val['nmll_test'],
                result_dataset_val['nmll_opt_sm_test'],
                result_dataset_val['win_pct_ai_test'] * 100))
        self.writer.add_scalar('nmll_opt_val', result_dataset_val['nmll_opt_sm'], iter_count)
        self.writer.add_scalar('nmll_opt_test_val', result_dataset_val['nmll_opt_sm_test'], iter_count)
        self.writer.add_scalar('win_pct_ai_val', result_dataset_val['win_pct_ai'], iter_count)
        self.writer.add_scalar('win_pct_ai_test_val', result_dataset_val['win_pct_ai_test'], iter_count)
      else:
        logger.info(
            "-----------------Avg. Validation Loss = {:.4f}, "
            "NMLL = {:.4f}, NMLL_orig = {:.4f}, Win_pct = {:.2f}%, "
            "NMLL_test = {:.4f}, NMLL_test_orig = {:.4f}, "
            "Win_pct_test = {:.2f}%--------------------".format(
                result_dataset_val['loss'], result_dataset_val['nmll'],
                result_dataset_val['nmll_orig'],
                result_dataset_val['win_pct'] * 100,
                result_dataset_val['nmll_test'],
                result_dataset_val['nmll_test_orig'],
                result_dataset_val['win_pct_test'] * 100))
        self.writer.add_scalar('val_loss', result_dataset_val['loss'], iter_count)
        self.writer.add_scalar('nmll_loss_val', result_dataset_val['nmll'], iter_count)
        self.writer.add_scalar('nmll_loss_orig_val', result_dataset_val['nmll_orig'], iter_count)
        self.writer.add_scalar('nmll_loss_test_val', result_dataset_val['nmll_test'], iter_count)
        self.writer.add_scalar('nmll_loss_test_orig_val', result_dataset_val['nmll_test_orig'], iter_count)
        self.writer.add_scalar('win_pct_val', result_dataset_val['win_pct'], iter_count)
        self.writer.add_scalar('win_pct_val_test', result_dataset_val['win_pct_test'], iter_count)
        results['val_loss'] += [result_dataset_val['loss']]
        results['nmll_loss_val'] += [result_dataset_val['nmll']]
        results['nmll_loss_orig_val'] += [result_dataset_val['nmll_orig']]
        results['nmll_loss_test_val'] += [result_dataset_val['nmll_test']]
        results['nmll_loss_test_orig_val'] += [result_dataset_val['nmll_test_orig']]
        results['win_pct_val'] += [result_dataset_val['win_pct']]
        results['win_pct_val_test'] += [result_dataset_val['win_pct_test']]

      # save best model
      if result_dataset_val['loss'] < best_val_loss:
        best_val_loss = result_dataset_val['loss']
        best_val_loss_test = result_dataset_val['nmll_test']
        if self.is_val:
          best_win_pct_val = result_dataset_val['win_pct_ai']
          best_win_pct_val_test = result_dataset_val['win_pct_ai_test']
        else:
          best_win_pct_val = result_dataset_val['win_pct']
          best_win_pct_val_test = result_dataset_val['win_pct_test']
        snapshot(
            model.module if self.use_gpu else model,
            optimizer,
            self.config,
            epoch + 1,
            tag='best')

      logger.info("Current Best Validation Loss = {:.4f}".format(best_val_loss))

      # check early stop
      if early_stop.tick([result_dataset_val['loss']]):
        snapshot(
            model.module if self.use_gpu else model,
            optimizer,
            self.config,
            epoch + 1,
            tag='last')
        self.writer.close()
        break

    # -------------------------------- training --------------------------------
    model.train()
    for data in train_loader:
      optimizer.zero_grad()
      if self.use_gpu:
        (data['max_node_size'], data['X_data_tr'], data['X_data_val'], data['X_data_test'],
         data['F_tr'], data['F_val'], data['F_test'], data['N_val'], data['kernel_mask_val'],
         data['diagonal_mask_val'], data['N_test'], data['kernel_mask_test'],
         data['diagonal_mask_test'], data['node_mask_tr'], data['dim_mask'], data['nmll'],
         data['dim_size']) = data_to_gpu(
             data['max_node_size'], data['X_data_tr'], data['X_data_val'], data['X_data_test'],
             data['F_tr'], data['F_val'], data['F_test'], data['N_val'], data['kernel_mask_val'],
             data['diagonal_mask_val'], data['N_test'], data['kernel_mask_test'],
             data['diagonal_mask_test'], data['node_mask_tr'], data['dim_mask'], data['nmll'],
             data['dim_size'])

      if self.model_conf.name == 'GpSMDoubleAtt':
        mu, var, weights, nmll, nmll_test = model(
            data['X_data_tr'], data['X_data_val'], data['F_tr'], data['F_val'],
            data['node_mask_tr'], data['dim_mask'], data['kernel_mask_val'],
            data['diagonal_mask_val'], data['N_val'],
            device=self.device,
            eval_mode=True,
            X_data_test=data['X_data_test'],
            F_data_test=data['F_test'],
            kernel_mask_test=data['kernel_mask_test'],
            diagonal_mask_test=data['diagonal_mask_test'],
            N_data_test=data['N_test'])
      elif self.model_conf.name == 'GpSMDoubleAttNoMu':
        var, weights, nmll, nmll_test = model(
            data['X_data_tr'], data['X_data_val'], data['F_tr'], data['F_val'],
            data['node_mask_tr'], data['dim_mask'], data['kernel_mask_val'],
            data['diagonal_mask_val'], data['N_val'],
            device=self.device,
            eval_mode=True,
            X_data_test=data['X_data_test'],
            F_data_test=data['F_test'],
            kernel_mask_test=data['kernel_mask_test'],
            diagonal_mask_test=data['diagonal_mask_test'],
            N_data_test=data['N_test'])
      else:
        raise ValueError("No model of given name!")

      # print("Outside: input size", data['X_data'].shape, "output_size", nmll.shape)
      nmll_orig = data['nmll']
      win_pct_train = torch.sum(nmll < nmll_orig + 0.01).float() / nmll.shape[0]
      data_dim_vec = data['X_data_tr'].shape[-1]
      nmll_loss_train = torch.mean(nmll)
      train_loss = nmll_loss_train

      # calculate gradient
      train_loss.backward()
      nmll_loss_orig = torch.mean(nmll_orig)

      # calculate gradient norm
      grad_norm = 0
      for p in model.parameters():
        if p.requires_grad:
          param_norm = p.grad.data.norm()
          grad_norm += param_norm.item() ** 2
      grad_norm = grad_norm ** (1. / 2)

      nn.utils.clip_grad_norm_(model.parameters(), 1)
      optimizer.step()

      train_loss = float(train_loss.data.cpu().numpy())
      nmll_loss_train = float(nmll_loss_train.data.cpu().numpy())
      nmll_loss_train_orig = float(nmll_loss_orig.data.cpu().numpy())
      win_pct_train = float(win_pct_train.data.cpu().numpy())

      self.writer.add_scalar('train_loss', train_loss, iter_count)
      self.writer.add_scalar('nmll_loss_train', nmll_loss_train, iter_count)
      self.writer.add_scalar('nmll_loss_train_orig', nmll_loss_train_orig, iter_count)
      self.writer.add_scalar('win_pct_train', win_pct_train, iter_count)
      self.writer.add_scalar('grad_norm', grad_norm, iter_count)
      results['nmll_loss_train'] += [nmll_loss_train]
      results['nmll_loss_train_orig'] += [nmll_loss_train_orig]
      results['train_loss'] += [train_loss]
      results['win_pct_train'] += [win_pct_train]
      results['train_step'] += [iter_count]
      results['grad_norm'] += [grad_norm]

      # display loss
      if (iter_count + 1) % self.train_conf.display_iter == 0:
        logger.info(
            "Loss @ epoch {:04d} iteration {:08d} = {:.4f}, NMLL = {:.4f}, "
            "NMLL_orig = {:.4f}, Win_pct = {:.2f}%, Grad_norm = {:.4f}, LR = {:.2e}".format(
                epoch + 1, iter_count + 1, train_loss, nmll_loss_train,
                nmll_loss_train_orig, win_pct_train * 100, grad_norm, get_lr(optimizer)))

      iter_count += 1

    # snapshot model
    if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
      logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
      snapshot(model.module if self.use_gpu else model, optimizer, self.config, epoch + 1)

    lr_scheduler.step()

  # look at predictions, for debugging purposes
  model.eval()
  with torch.no_grad():
    results_sample_tr = self.cal_sample_result(model, train_loader_sub)
    results_sample_dev = self.cal_sample_result(model, dev_loader_sub)
    result_dataset_tr = self.cal_dataset_loss(model, train_loader_sub)
    result_dataset_dev = self.cal_dataset_loss(model, dev_loader_sub)

  train_loss = result_dataset_tr['loss']
  results['best_val_loss'] = best_val_loss
  results['win_count_tr'] = results_sample_tr['win_pct']
  results['win_count_dev'] = results_sample_dev['win_pct']
  results['nmll_loss_sample_tr'] = results_sample_tr['nmll_loss_sample']
  results['nmll_loss_sample_dev'] = results_sample_dev['nmll_loss_sample']
  pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
  self.writer.close()

  logger.info(
      "Best Validation Loss = {:.4f}, "
      "Best Win_pct_val = {:.2f}%, "
      "Best Val Loss on Test = {:.4f}, "
      "Best Win_pct_val_test = {:.2f}%, "
      "Final Training NMLL = {:.4f}, "
      "Training NMLL original = {:.4f}, "
      "Win_pct_train = {:.2f}%, "
      "Final Dev NMLL = {:.4f}, "
      "Dev NMLL original = {:.4f}, "
      "Win_pct_dev = {:.2f}%, "
      "Final Dev Test NMLL = {:.4f}, "
      "Dev Test NMLL original = {:.4f}, "
      "Win_pct_test_dev = {:.2f}%.".format(
          best_val_loss, best_win_pct_val * 100, best_val_loss_test,
          best_win_pct_val_test * 100, result_dataset_tr['nmll'],
          result_dataset_tr['nmll_orig'], result_dataset_tr['win_pct'] * 100,
          result_dataset_dev['nmll'], result_dataset_dev['nmll_orig'],
          result_dataset_dev['win_pct'] * 100, result_dataset_dev['nmll_test'],
          result_dataset_dev['nmll_test_orig'], result_dataset_dev['win_pct_test'] * 100))

  avg_nmll_tr = np.mean(results_sample_tr['nmll_sample_compare'], 0)
  logger.info('% of GPs with higher marginal likelihood = {:.2f}%'.format(
      results_sample_tr['win_pct'] * 100))
  logger.info('Average NMLL on training samples: true = {}, learned = {}'.format(
      avg_nmll_tr[1], avg_nmll_tr[0]))
  avg_nmll_dev = np.mean(results_sample_dev['nmll_sample_compare'], 0)
  logger.info('% of GPs with higher marginal likelihood = {:.2f}%'.format(
      results_sample_dev['win_pct'] * 100))
  logger.info('Average NMLL on testing samples: true = {}, learned = {}'.format(
      avg_nmll_dev[1], avg_nmll_dev[0]))

  snapshot(
      model.module if self.use_gpu else model,
      optimizer,
      self.config,
      self.train_conf.max_epoch + 1,
      tag='final')
  return None
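# data_to_gpu(...) and get_lr(...) above are small helpers that this excerpt does not
# define. Minimal sketches consistent with how they are called (assumptions; the
# repo's own implementations may differ, e.g. by filtering non-tensor arguments):
import torch


def data_to_gpu(*input_data):
  # move every argument (assumed to be a torch.Tensor) onto the default CUDA
  # device and hand them back in the same order
  return tuple(dd.cuda() for dd in input_data)


def get_lr(optimizer):
  # all parameter groups share a single learning rate in these runners,
  # so reporting the first group is enough for logging
  return optimizer.param_groups[0]['lr']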
def train(self):
  # create data loader
  train_dataset = eval(self.dataset_conf.loader_name)(self.config, split='train')
  dev_dataset = eval(self.dataset_conf.loader_name)(self.config, split='dev')
  train_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=self.train_conf.shuffle,
      num_workers=self.train_conf.num_workers,
      collate_fn=train_dataset.collate_fn,
      drop_last=False)
  dev_loader = torch.utils.data.DataLoader(
      dev_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=False,
      num_workers=self.train_conf.num_workers,
      collate_fn=dev_dataset.collate_fn,
      drop_last=False)

  # create models
  model = eval(self.model_conf.name)(self.config)
  if self.use_gpu:
    model = nn.DataParallel(model, device_ids=self.gpus).cuda()

  # create optimizer
  params = filter(lambda p: p.requires_grad, model.parameters())
  if self.train_conf.optimizer == 'SGD':
    optimizer = optim.SGD(
        params,
        lr=self.train_conf.lr,
        momentum=self.train_conf.momentum,
        weight_decay=self.train_conf.wd)
  elif self.train_conf.optimizer == 'Adam':
    optimizer = optim.Adam(params, lr=self.train_conf.lr, weight_decay=self.train_conf.wd)
  else:
    raise ValueError("Non-supported optimizer!")

  early_stop = EarlyStopper([0.0], win_size=10, is_decrease=False)
  lr_scheduler = optim.lr_scheduler.MultiStepLR(
      optimizer,
      milestones=self.train_conf.lr_decay_steps,
      gamma=self.train_conf.lr_decay)

  # reset gradient
  optimizer.zero_grad()

  # resume training
  if self.train_conf.is_resume:
    load_model(model, self.train_conf.resume_model, optimizer=optimizer)

  # Training Loop
  iter_count = 0
  best_val_loss = np.inf
  results = defaultdict(list)
  for epoch in range(self.train_conf.max_epoch):
    # validation
    if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:
      model.eval()
      val_loss = []

      for data in tqdm(dev_loader):
        if self.use_gpu:
          data['node_feat'], data['node_mask'], data['label'] = data_to_gpu(
              data['node_feat'], data['node_mask'], data['label'])

          if self.model_conf.name == 'LanczosNet':
            data['L'], data['D'], data['V'] = data_to_gpu(data['L'], data['D'], data['V'])
          elif self.model_conf.name == 'GraphSAGE':
            data['nn_idx'], data['nonempty_mask'] = data_to_gpu(data['nn_idx'], data['nonempty_mask'])
          elif self.model_conf.name == 'GPNN':
            data['L'], data['L_cluster'], data['L_cut'] = data_to_gpu(
                data['L'], data['L_cluster'], data['L_cut'])
          else:
            data['L'] = data_to_gpu(data['L'])[0]

        with torch.no_grad():
          if self.model_conf.name == 'AdaLanczosNet':
            pred, _ = model(data['node_feat'], data['L'], label=data['label'], mask=data['node_mask'])
          elif self.model_conf.name == 'LanczosNet':
            pred, _ = model(data['node_feat'], data['L'], data['D'], data['V'],
                            label=data['label'], mask=data['node_mask'])
          elif self.model_conf.name == 'GraphSAGE':
            pred, _ = model(data['node_feat'], data['nn_idx'], data['nonempty_mask'],
                            label=data['label'], mask=data['node_mask'])
          elif self.model_conf.name == 'GPNN':
            pred, _ = model(data['node_feat'], data['L'], data['L_cluster'], data['L_cut'],
                            label=data['label'], mask=data['node_mask'])
          else:
            pred, _ = model(data['node_feat'], data['L'], label=data['label'], mask=data['node_mask'])

        curr_loss = (pred - data['label']).abs().cpu().numpy() * self.const_factor
        val_loss += [curr_loss]

      val_loss = float(np.mean(np.concatenate(val_loss)))
      logger.info("Avg. Validation MAE = {}".format(val_loss))
      self.writer.add_scalar('val_loss', val_loss, iter_count)
      results['val_loss'] += [val_loss]

      # save best model
      if val_loss < best_val_loss:
        best_val_loss = val_loss
        snapshot(
            model.module if self.use_gpu else model,
            optimizer,
            self.config,
            epoch + 1,
            tag='best')

      logger.info("Current Best Validation MAE = {}".format(best_val_loss))

      # check early stop
      if early_stop.tick([val_loss]):
        snapshot(
            model.module if self.use_gpu else model,
            optimizer,
            self.config,
            epoch + 1,
            tag='last')
        self.writer.close()
        break

    # training
    model.train()
    lr_scheduler.step()
    for data in train_loader:
      optimizer.zero_grad()

      if self.use_gpu:
        data['node_feat'], data['node_mask'], data['label'] = data_to_gpu(
            data['node_feat'], data['node_mask'], data['label'])

        if self.model_conf.name == 'LanczosNet':
          data['L'], data['D'], data['V'] = data_to_gpu(data['L'], data['D'], data['V'])
        elif self.model_conf.name == 'GraphSAGE':
          data['nn_idx'], data['nonempty_mask'] = data_to_gpu(data['nn_idx'], data['nonempty_mask'])
        elif self.model_conf.name == 'GPNN':
          data['L'], data['L_cluster'], data['L_cut'] = data_to_gpu(
              data['L'], data['L_cluster'], data['L_cut'])
        else:
          data['L'] = data_to_gpu(data['L'])[0]

      if self.model_conf.name == 'AdaLanczosNet':
        _, train_loss = model(data['node_feat'], data['L'], label=data['label'], mask=data['node_mask'])
      elif self.model_conf.name == 'LanczosNet':
        _, train_loss = model(data['node_feat'], data['L'], data['D'], data['V'],
                              label=data['label'], mask=data['node_mask'])
      elif self.model_conf.name == 'GraphSAGE':
        _, train_loss = model(data['node_feat'], data['nn_idx'], data['nonempty_mask'],
                              label=data['label'], mask=data['node_mask'])
      elif self.model_conf.name == 'GPNN':
        _, train_loss = model(data['node_feat'], data['L'], data['L_cluster'], data['L_cut'],
                              label=data['label'], mask=data['node_mask'])
      else:
        _, train_loss = model(data['node_feat'], data['L'], label=data['label'], mask=data['node_mask'])

      # assign gradient
      train_loss.backward()
      optimizer.step()
      train_loss = float(train_loss.data.cpu().numpy())
      self.writer.add_scalar('train_loss', train_loss, iter_count)
      results['train_loss'] += [train_loss]
      results['train_step'] += [iter_count]

      # display loss
      if (iter_count + 1) % self.train_conf.display_iter == 0:
        logger.info("Loss @ epoch {:04d} iteration {:08d} = {}".format(
            epoch + 1, iter_count + 1, train_loss))

      iter_count += 1

    # snapshot model
    if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
      logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
      snapshot(model.module if self.use_gpu else model, optimizer, self.config, epoch + 1)

  results['best_val_loss'] += [best_val_loss]
  pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
  self.writer.close()
  logger.info("Best Validation MAE = {}".format(best_val_loss))

  return best_val_loss
def train(self):
  torch.autograd.set_detect_anomaly(True)

  ### create data loader
  train_dataset = eval(self.dataset_conf.loader_name)(self.config, self.graphs_train, tag='train')
  train_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=self.train_conf.shuffle,  # true for grid
      num_workers=self.train_conf.num_workers,
      collate_fn=train_dataset.collate_fn,
      drop_last=False)

  # create models
  model = eval(self.model_conf.name)(self.config)
  criterion = nn.BCEWithLogitsLoss()
  if self.use_gpu:
    model = DataParallel(model, device_ids=self.gpus).to(self.device)
    criterion = criterion.cuda()
  model.train()

  # create optimizer
  params = filter(lambda p: p.requires_grad, model.parameters())
  if self.train_conf.optimizer == 'SGD':
    optimizer = optim.SGD(
        params,
        lr=self.train_conf.lr,
        momentum=self.train_conf.momentum,
        weight_decay=self.train_conf.wd)
  elif self.train_conf.optimizer == 'Adam':
    optimizer = optim.Adam(params, lr=self.train_conf.lr, weight_decay=self.train_conf.wd)
  else:
    raise ValueError("Non-supported optimizer!")

  # TODO: not used?
  early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
  lr_scheduler = optim.lr_scheduler.MultiStepLR(
      optimizer,
      milestones=self.train_conf.lr_decay_epoch,
      gamma=self.train_conf.lr_decay)

  # reset gradient
  optimizer.zero_grad()

  best_acc = 0.

  # resume training
  # TODO: record resume_epoch to the saved file
  resume_epoch = 0
  if self.train_conf.is_resume:
    model_file = os.path.join(self.train_conf.resume_dir, self.train_conf.resume_model)
    load_model(
        model.module if self.use_gpu else model,
        model_file,
        self.device,
        optimizer=optimizer,
        scheduler=lr_scheduler)
    resume_epoch = self.train_conf.resume_epoch

  # Training Loop
  iter_count = 0
  results = defaultdict(list)
  for epoch in range(resume_epoch, self.train_conf.max_epoch):
    model.train()
    train_iterator = iter(train_loader)

    avg_acc_whole_epoch = 0.
    cnt = 0.
    for inner_iter in range(len(train_loader) // self.num_gpus):
      optimizer.zero_grad()

      batch_data = []
      if self.use_gpu:
        for _ in self.gpus:
          data = next(train_iterator)
          batch_data.append(data)
          iter_count += 1

      avg_train_loss = .0
      avg_acc = 0.
      for ff in range(self.dataset_conf.num_fwd_pass):
        batch_fwd = []

        if self.use_gpu:
          for dd, gpu_id in enumerate(self.gpus):
            data = {}
            data['adj'] = batch_data[dd][ff]['adj'].pin_memory().to(gpu_id, non_blocking=True)
            data['edges'] = batch_data[dd][ff]['edges'].pin_memory().to(gpu_id, non_blocking=True)
            # data['node_idx_gnn'] = batch_data[dd][ff]['node_idx_gnn'].pin_memory().to(gpu_id, non_blocking=True)
            # data['node_idx_feat'] = batch_data[dd][ff]['node_idx_feat'].pin_memory().to(gpu_id, non_blocking=True)
            # data['label'] = batch_data[dd][ff]['label'].pin_memory().to(gpu_id, non_blocking=True)
            # data['att_idx'] = batch_data[dd][ff]['att_idx'].pin_memory().to(gpu_id, non_blocking=True)
            data['subgraph_idx'] = batch_data[dd][ff]['subgraph_idx'].pin_memory().to(gpu_id, non_blocking=True)
            data['complete_graph_label'] = batch_data[dd][ff]['complete_graph_label'].pin_memory().to(gpu_id, non_blocking=True)
            batch_fwd.append((data,))

        pred = model(*batch_fwd)
        label = data['complete_graph_label'][:, None]
        train_loss = criterion(pred, label).mean()
        train_loss.backward()

        pred = (torch.sigmoid(pred) > 0.5).type_as(label)
        avg_acc += (pred.eq(label)).float().mean().item()
        avg_train_loss += train_loss.item()

      # assign gradient
      # clip_grad_norm_(model.parameters(), 5.0e-0)
      optimizer.step()
      lr_scheduler.step()

      avg_train_loss /= self.dataset_conf.num_fwd_pass  # num_fwd_pass always 1
      avg_acc /= self.dataset_conf.num_fwd_pass
      # weight the per-batch accuracy by batch size so the epoch average is a true mean
      batch_size = len(data['complete_graph_label'])
      avg_acc_whole_epoch += avg_acc * batch_size
      cnt += batch_size

      # reduce
      self.writer.add_scalar('train_loss', avg_train_loss, iter_count)
      self.writer.add_scalar('train_acc', avg_acc, iter_count)
      results['train_loss'] += [avg_train_loss]
      results['train_acc'] += [avg_acc]
      results['train_step'] += [iter_count]

      # if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
      #   logger.info("NLL Loss @ epoch {:04d} iteration {:08d} = {}\tAcc = {}".format(epoch + 1, iter_count, train_loss, avg_acc))

    avg_acc_whole_epoch /= cnt
    is_new_best = avg_acc_whole_epoch > best_acc
    if is_new_best:
      logger.info('!!! New best')
      best_acc = avg_acc_whole_epoch
    logger.info("Avg acc = {} @ epoch {:04d}".format(avg_acc_whole_epoch, epoch + 1))

    # snapshot model
    if (epoch + 1) % self.train_conf.snapshot_epoch == 0 or is_new_best:
      logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
      snapshot(
          model.module if self.use_gpu else model,
          optimizer,
          self.config,
          epoch + 1,
          scheduler=lr_scheduler)

  pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
  self.writer.close()
  return 1
def train(self):
  ### create data loader
  train_dataset = eval(self.dataset_conf.loader_name)(self.config, self.graphs_train, tag='train')
  train_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=self.train_conf.shuffle,
      num_workers=self.train_conf.num_workers,
      collate_fn=train_dataset.collate_fn,
      drop_last=False)

  # create models
  model = eval(self.model_conf.name)(self.config)
  print('number of parameters : {}'.format(sum([np.prod(x.shape) for x in model.parameters()])))
  if self.use_gpu:
    model = DataParallel(model, device_ids=self.gpus).to(self.device)

  # create optimizer
  params = filter(lambda p: p.requires_grad, model.parameters())
  if self.train_conf.optimizer == 'SGD':
    optimizer = optim.SGD(
        params,
        lr=self.train_conf.lr,
        momentum=self.train_conf.momentum,
        weight_decay=self.train_conf.wd)
  elif self.train_conf.optimizer == 'Adam':
    optimizer = optim.Adam(params, lr=self.train_conf.lr, weight_decay=self.train_conf.wd)
  else:
    raise ValueError("Non-supported optimizer!")

  early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
  lr_scheduler = optim.lr_scheduler.MultiStepLR(
      optimizer,
      milestones=self.train_conf.lr_decay_epoch,
      gamma=self.train_conf.lr_decay)

  # reset gradient
  optimizer.zero_grad()

  # resume training
  resume_epoch = 0
  if self.train_conf.is_resume:
    model_file = os.path.join(self.train_conf.resume_dir, self.train_conf.resume_model)
    load_model(
        model.module if self.use_gpu else model,
        model_file,
        self.device,
        optimizer=optimizer,
        scheduler=lr_scheduler)
    resume_epoch = self.train_conf.resume_epoch

  # Training Loop
  iter_count = 0
  results = defaultdict(list)
  for epoch in range(resume_epoch, self.train_conf.max_epoch):
    has_sampled = False
    model.train()
    # lr_scheduler.step()
    train_iterator = iter(train_loader)

    for inner_iter in range(len(train_loader) // self.num_gpus):
      optimizer.zero_grad()

      batch_data = []
      if self.use_gpu:
        for _ in self.gpus:
          data = next(train_iterator)
          batch_data.append(data)
          iter_count += 1

      avg_train_loss = .0
      for ff in range(self.dataset_conf.num_fwd_pass):
        batch_fwd = []

        if self.use_gpu:
          for dd, gpu_id in enumerate(self.gpus):
            data = {}
            data['adj'] = batch_data[dd][ff]['adj'].pin_memory().to(gpu_id, non_blocking=True)
            data['edges'] = batch_data[dd][ff]['edges'].pin_memory().to(gpu_id, non_blocking=True)
            data['node_idx_gnn'] = batch_data[dd][ff]['node_idx_gnn'].pin_memory().to(gpu_id, non_blocking=True)
            data['node_idx_feat'] = batch_data[dd][ff]['node_idx_feat'].pin_memory().to(gpu_id, non_blocking=True)
            data['label'] = batch_data[dd][ff]['label'].pin_memory().to(gpu_id, non_blocking=True)
            data['att_idx'] = batch_data[dd][ff]['att_idx'].pin_memory().to(gpu_id, non_blocking=True)
            data['subgraph_idx'] = batch_data[dd][ff]['subgraph_idx'].pin_memory().to(gpu_id, non_blocking=True)
            batch_fwd.append((data,))

        if batch_fwd:
          train_loss = model(*batch_fwd).mean()
          avg_train_loss += train_loss

          # assign gradient
          train_loss.backward()

      # clip_grad_norm_(model.parameters(), 5.0e-0)
      optimizer.step()
      avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

      # reduce
      train_loss = float(avg_train_loss.data.cpu().numpy())

      self.writer.add_scalar('train_loss', train_loss, iter_count)
      results['train_loss'] += [train_loss]
      results['train_step'] += [iter_count]

      if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
        logger.info("NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(epoch + 1, iter_count, train_loss))

    # snapshot model
    if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
      logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
      snapshot(
          model.module if self.use_gpu else model,
          optimizer,
          self.config,
          epoch + 1,
          scheduler=lr_scheduler)

    if (epoch + 1) % 20 == 0 and not has_sampled:
      has_sampled = True
      print('saving graphs')
      model.eval()
      graphs_gen = [get_graph(aa.cpu().data.numpy()) for aa in model.module._sampling(10)]
      model.train()

      # keep only the largest connected component of each sampled graph for visualization
      vis_graphs = []
      for gg in graphs_gen:
        CGs = [gg.subgraph(c) for c in nx.connected_components(gg)]
        CGs = sorted(CGs, key=lambda x: x.number_of_nodes(), reverse=True)
        vis_graphs += [CGs[0]]

      total = len(vis_graphs)  # min(3, len(vis_graphs))
      draw_graph_list(
          vis_graphs[:total],
          2,
          int(total // 2),
          fname='sample/gran_%d.png' % epoch,
          layout='spring')

  pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
  self.writer.close()
  return 1
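# get_graph(adj) above converts a sampled adjacency matrix into a networkx graph.
# A minimal sketch under the assumption that padded rows/columns are all zero
# (the repo's own helper may handle padding and edge thresholds differently):
import networkx as nx
import numpy as np


def get_graph(adj):
  # drop zero padding, then build an undirected graph from the dense adjacency matrix
  keep = ~np.all(adj == 0, axis=1)
  return nx.from_numpy_array(adj[keep][:, keep])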
def train(self):
  ### create data loader
  train_dataset = eval(self.dataset_conf.loader_name)(self.config, self.graphs_train, tag='train')
  train_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=self.train_conf.shuffle,
      num_workers=self.train_conf.num_workers,
      collate_fn=train_dataset.collate_fn,
      drop_last=False)

  # create models
  # model = eval(self.model_conf.name)(self.config)
  from model.transformer import make_model
  model = make_model(
      max_node=self.config.model.max_num_nodes,
      d_out=20, N=7, d_model=64, d_ff=64, dropout=0.4)  # d_out, N, d_model, d_ff, h
      # d_out=20, N=15, d_model=16, d_ff=16, dropout=0.2)  # d_out, N, d_model, d_ff, h
      # d_out=20, N=3, d_model=64, d_ff=64, dropout=0.1)  # d_out, N, d_model, d_ff, h
  if self.use_gpu:
    model = DataParallel(model, device_ids=self.gpus).to(self.device)

  # create optimizer
  params = filter(lambda p: p.requires_grad, model.parameters())
  if self.train_conf.optimizer == 'SGD':
    optimizer = optim.SGD(
        params,
        lr=self.train_conf.lr,
        momentum=self.train_conf.momentum,
        weight_decay=self.train_conf.wd)
  elif self.train_conf.optimizer == 'Adam':
    optimizer = optim.Adam(params, lr=self.train_conf.lr, weight_decay=self.train_conf.wd)
  else:
    raise ValueError("Non-supported optimizer!")

  early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
  lr_scheduler = optim.lr_scheduler.MultiStepLR(
      optimizer,
      milestones=self.train_conf.lr_decay_epoch,
      gamma=self.train_conf.lr_decay)

  # reset gradient
  optimizer.zero_grad()

  # resume training
  resume_epoch = 0
  if self.train_conf.is_resume:
    model_file = os.path.join(self.train_conf.resume_dir, self.train_conf.resume_model)
    load_model(
        model.module if self.use_gpu else model,
        model_file,
        self.device,
        optimizer=optimizer,
        scheduler=lr_scheduler)
    resume_epoch = self.train_conf.resume_epoch

  # Training Loop
  iter_count = 0
  results = defaultdict(list)
  for epoch in range(resume_epoch, self.train_conf.max_epoch):
    model.train()
    lr_scheduler.step()
    train_iterator = iter(train_loader)

    for inner_iter in range(len(train_loader) // self.num_gpus):
      optimizer.zero_grad()

      batch_data = []
      if self.use_gpu:
        for _ in self.gpus:
          data = next(train_iterator)
          batch_data += [data]
          iter_count += 1

      avg_train_loss = .0
      for ff in range(self.dataset_conf.num_fwd_pass):
        batch_fwd = []

        if self.use_gpu:
          for dd, gpu_id in enumerate(self.gpus):
            data = batch_data[dd]
            adj, lens = data['adj'], data['lens']
            # this is only for grid
            # adj = adj[:, :, :100, :100]
            # lens = [min(99, x) for x in lens]
            adj = adj.to('cuda:%d' % gpu_id)
            # build masks
            node_feat, attn_mask, lens = preprocess(adj, lens)
            batch_fwd.append((node_feat, attn_mask.clone(), lens))

        if batch_fwd:
          node_feat, attn_mask, lens = batch_fwd[0]
          log_theta, log_alpha = model(*batch_fwd)
          train_loss = model.module.mix_bern_loss(log_theta, log_alpha, adj, lens)
          avg_train_loss += train_loss

          # assign gradient
          train_loss.backward()

      # clip_grad_norm_(model.parameters(), 5.0e-0)
      optimizer.step()
      avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

      # reduce
      train_loss = float(avg_train_loss.data.cpu().numpy())

      self.writer.add_scalar('train_loss', train_loss, iter_count)
      results['train_loss'] += [train_loss]
      results['train_step'] += [iter_count]

      if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
        logger.info("NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(epoch + 1, iter_count, train_loss))

      if epoch % 50 == 0 and inner_iter == 0:
        model.eval()
        print('saving graphs')
        # visualize one training graph alongside 19 model samples
        graphs_gen = [get_graph(adj[0].cpu().data.numpy())] + [
            get_graph(aa.cpu().data.numpy())
            for aa in model.module.sample(19, max_node=self.config.model.max_num_nodes)
        ]
        model.train()

        vis_graphs = []
        for gg in graphs_gen:
          CGs = [gg.subgraph(c) for c in nx.connected_components(gg)]
          CGs = sorted(CGs, key=lambda x: x.number_of_nodes(), reverse=True)
          try:
            vis_graphs += [CGs[0]]
          except Exception:
            pass

        try:
          total = len(vis_graphs)  # min(3, len(vis_graphs))
          draw_graph_list(
              vis_graphs[:total],
              4,
              int(total // 4),
              fname='sample/trans_sl:%d_%d.png' % (int(model.module.self_loop), epoch),
              layout='spring')
        except Exception:
          print('sample saving failed')

    # snapshot model
    if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
      logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
      snapshot(
          model.module if self.use_gpu else model,
          optimizer,
          self.config,
          epoch + 1,
          scheduler=lr_scheduler)

  pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
  self.writer.close()
  return 1
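# draw_graph_list(...) above renders a list of networkx graphs on a row x col grid
# and writes the figure to fname. A minimal sketch of such a utility, written as an
# assumption (the repo's plotting code may style nodes and layouts differently):
import matplotlib
matplotlib.use('Agg')  # render off-screen while training
import matplotlib.pyplot as plt
import networkx as nx


def draw_graph_list(G_list, row, col, fname='sample.png', layout='spring'):
  plt.figure(figsize=(col * 2, row * 2))
  for ii, G in enumerate(G_list[:row * col]):
    plt.subplot(row, col, ii + 1)
    plt.axis('off')
    pos = nx.spring_layout(G) if layout == 'spring' else nx.spectral_layout(G)
    nx.draw_networkx(G, pos, with_labels=False, node_size=10, width=0.5)
  plt.tight_layout()
  plt.savefig(fname, dpi=150)
  plt.close()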