def train(self):
    logger.debug('starting training')
    # the dataset provides start/end tokens
    train_dataset = eval(self.dataset_conf.loader_name)(
        self.config, self.graphs_train, tag='train')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.batch_size,
        shuffle=self.train_conf.shuffle,
        num_workers=self.train_conf.num_workers,
        drop_last=False)

    model = eval(self.model_conf.model_name)(
        self.config, train_dataset.n_letters, train_dataset.seq_len)

    # move to GPU and parallelize
    if self.use_gpu:
        model = data_parallel.DataParallel(model, device_ids=self.gpus).to(self.device)

    model_params = filter(lambda p: p.requires_grad, model.parameters())

    # set up optimizer
    if self.train_conf.optimizer == 'SGD':
        optimizer = optim.SGD(
            model_params,
            lr=self.train_conf.lr,
            momentum=self.train_conf.momentum,
            weight_decay=self.train_conf.wd)
    elif self.train_conf.optimizer == 'Adam':
        optimizer = optim.Adam(
            model_params,
            lr=self.train_conf.lr,
            weight_decay=self.train_conf.wd)
    else:
        raise ValueError("Non-supported optimizer!")

    # lr_scheduler = optim.lr_scheduler.MultiStepLR(
    #     optimizer,
    #     milestones=self.train_conf.lr_decay_epoch,
    #     gamma=self.train_conf.lr_decay)

    # criterion = nn.NLLLoss()
    criterion = nn.CrossEntropyLoss()

    # resume training
    resume_epoch = 0
    if self.train_conf.is_resume:
        resume_epoch = self.train_conf.resume_epoch
        model_file = os.path.join(self.train_conf.resume_dir, self.train_conf.resume_model)
        obj = load_model(
            model.module if self.use_gpu else model,
            model_file,
            self.device,
            optimizer=optimizer)
        if self.use_gpu:
            model.module = obj['model']
        else:
            model = obj['model']
        optimizer = obj['optimizer']
        scheduler = obj['scheduler']

    results = defaultdict(list)
    for epoch in range(resume_epoch, self.train_conf.max_epoch):
        model.train()
        train_iterator = iter(train_loader)
        if epoch == 0:
            logger.debug("Length of train loader: {}".format(len(train_loader)))

        avg_train_loss = .0
        iter_count = 0
        for _, (inp, target, ext) in enumerate(train_iterator):
            model.module.zero_grad()
            optimizer.zero_grad()
            iter_count += 1
            loss = .0

            input_tensor = inp.pin_memory().to(0, non_blocking=True)
            target_tensor = target.pin_memory().to(0, non_blocking=True)
            ext_tensor = ext.pin_memory().to(0, non_blocking=True)
            hidden = torch.cat([
                model.module.initHidden().pin_memory().to(0, non_blocking=True)
                for _ in range(input_tensor.size(0))
            ], dim=1)

            output, hidden = model(ext_tensor, input_tensor, hidden)

            # accumulate the loss over the batch dimension
            for batch in range(output.size(0)):
                loss += criterion(output[batch], target_tensor[batch])

            avg_train_loss += float(loss.item()) / output.size(0)
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()

            if iter_count % self.train_conf.display_iter == 0 and iter_count > 1:
                avg_train_loss /= self.train_conf.display_iter
                results['train_loss'] += [avg_train_loss]
                results['train_step'] += [iter_count]
                logger.info("Loss @ epoch {:04d} iteration {:08d} = {}".format(
                    epoch + 1, iter_count, avg_train_loss))
                avg_train_loss = .0  # reset the running average for the next window

                # decode a single random sample for inspection
                choice = random.choice(range(output.size(0)))
                file_type = self.file_ext[ext_tensor[choice].squeeze().detach().item()]
                target_char = self.tochar(target_tensor[choice])
                predict_char = self.tochar(torch.argmax(output[choice], dim=1))
                logger.info("Epoch {} Iter {} | Sample Start ----------------------".format(epoch, iter_count))
                logger.info("File Type: {}".format(file_type))
                logger.info("Predict: {}".format(''.join(predict_char)))
                logger.info("Target : {}".format(''.join(target_char)))
                logger.info("--------------------------------------------------------")

        # snapshot model
        if epoch % self.train_conf.snapshot_epoch == 0:
            logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
            snapshot(model.module, optimizer, self.config, epoch + 1)

    pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
    return 1
def train(self):
    # create data loader
    train_dataset = Citation(
        self.dataset_conf.path,
        feat_dim_pca=self.model_conf.feat_dim,
        dataset_name=self.dataset_conf.name,
        split='train',
        train_ratio=self.dataset_conf.train_ratio,
        use_rand_split=self.dataset_conf.rand_split,
        seed=self.config.seed)
    val_dataset = Citation(
        self.dataset_conf.path,
        feat_dim_pca=self.model_conf.feat_dim,
        dataset_name=self.dataset_conf.name,
        split='val',
        train_ratio=self.dataset_conf.train_ratio,
        use_rand_split=self.dataset_conf.rand_split,
        seed=self.config.seed)
    train_loader = DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=self.train_conf.shuffle,
        num_workers=self.train_conf.num_workers,
        drop_last=False)
    val_loader = DataLoader(
        val_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=False,
        num_workers=self.train_conf.num_workers,
        drop_last=False)

    # create model
    model = eval(self.model_conf.name)(self.config)

    # create optimizer
    params = model.parameters()
    if self.train_conf.optimizer == 'SGD':
        optimizer = optim.SGD(
            params,
            lr=self.train_conf.lr,
            momentum=self.train_conf.momentum,
            weight_decay=self.train_conf.wd)
    elif self.train_conf.optimizer == 'Adam':
        optimizer = optim.Adam(
            params,
            lr=self.train_conf.lr,
            weight_decay=self.train_conf.wd)
    else:
        raise ValueError("Non-supported optimizer!")

    lr_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=self.train_conf.lr_decay_steps,
        gamma=self.train_conf.lr_decay)

    # reset gradient
    optimizer.zero_grad()

    # resume training
    if self.train_conf.is_resume:
        load_model(model, self.train_conf.resume_model, optimizer=optimizer)

    if self.use_gpu:
        model = nn.DataParallel(model, device_ids=self.gpus).cuda()

    # Training Loop
    iter_count = 0
    best_val_acc = .0
    results = defaultdict(list)
    for epoch in range(self.train_conf.max_epoch):
        # validation
        if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:
            model.eval()
            val_loss = []
            total, correct = .0, .0

            for node_feat, node_label, edge, mask in val_loader:
                if self.use_gpu:
                    node_feat, node_label, edge, mask = (
                        node_feat.cuda(), node_label.cuda(), edge.cuda(), mask.cuda())

                node_feat, node_label, edge, mask = (
                    node_feat.float(), node_label.long(), edge.long(), mask.byte())
                node_logit, node_label, _, curr_loss, _ = model(
                    edge, node_feat, target=node_label, mask=mask)
                val_loss += [float(curr_loss.data.cpu().numpy())]
                _, predicted = torch.max(node_logit.data, 1)
                total += node_label.size(0)
                correct += predicted.eq(node_label.data).cpu().numpy().sum()

            val_loss = float(np.mean(val_loss))
            val_acc = 100.0 * correct / total

            # save best model
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                snapshot(model, optimizer, self.config, epoch + 1, tag='best')

            logger.info("Avg. Validation Loss = {}".format(val_loss))
            logger.info("Validation Accuracy = {}".format(val_acc))
            logger.info("Current Best Validation Accuracy = {}".format(best_val_acc))
            results['val_loss'] += [val_loss]
            results['val_acc'] += [val_acc]
            model.train()

        # training
        lr_scheduler.step()
        for node_feat, node_label, edge, mask in train_loader:
            if self.use_gpu:
                node_feat, node_label, edge, mask = (
                    node_feat.cuda(), node_label.cuda(), edge.cuda(), mask.cuda())

            node_feat, node_label, edge, mask = (
                node_feat.float(), node_label.long(), edge.long(), mask.byte())
            # optimizer.zero_grad()
            node_logit, _, diff_norm, train_loss, grad_w = model(
                edge, node_feat, target=node_label, mask=mask)

            # assign the gradients returned by the model to the parameters
            for pp, ww in zip(model.parameters(), grad_w):
                pp.grad = ww
            optimizer.step()

            train_loss = float(train_loss.data.cpu().numpy())
            results['train_loss'] += [train_loss]
            results['train_step'] += [iter_count]

            # display loss
            if (iter_count + 1) % self.train_conf.display_iter == 0:
                logger.info("Loss @ epoch {:04d} iteration {:08d} = {}".format(
                    epoch + 1, iter_count + 1, train_loss))
                tmp_key = 'diff_norm_{}'.format(iter_count + 1)
                results[tmp_key] = diff_norm.data.cpu().numpy().tolist()

            iter_count += 1

        # snapshot model
        if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
            logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
            snapshot(model.module if self.use_gpu else model, optimizer, self.config, epoch + 1)

    results['best_val_acc'] += [best_val_acc]
    pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
    return best_val_acc
def train(self):
    ### create data loader
    train_dataset = eval(self.dataset_conf.loader_name)(
        self.config, self.graphs_train, tag='train')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=self.train_conf.shuffle,
        num_workers=self.train_conf.num_workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=False)

    # create models
    model = eval(self.model_conf.name)(self.config)
    if self.use_gpu:
        model = DataParallel(model, device_ids=self.gpus).to(self.device)

    # create optimizer
    params = filter(lambda p: p.requires_grad, model.parameters())
    if self.train_conf.optimizer == 'SGD':
        optimizer = optim.SGD(
            params,
            lr=self.train_conf.lr,
            momentum=self.train_conf.momentum,
            weight_decay=self.train_conf.wd)
    elif self.train_conf.optimizer == 'Adam':
        optimizer = optim.Adam(
            params,
            lr=self.train_conf.lr,
            weight_decay=self.train_conf.wd)
    else:
        raise ValueError("Non-supported optimizer!")

    early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
    lr_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=self.train_conf.lr_decay_epoch,
        gamma=self.train_conf.lr_decay)

    # reset gradient
    optimizer.zero_grad()

    # resume training
    resume_epoch = 0
    if self.train_conf.is_resume:
        model_file = os.path.join(self.train_conf.resume_dir, self.train_conf.resume_model)
        load_model(
            model.module if self.use_gpu else model,
            model_file,
            self.device,
            optimizer=optimizer,
            scheduler=lr_scheduler)
        resume_epoch = self.train_conf.resume_epoch

    # Training Loop
    iter_count = 0
    results = defaultdict(list)
    for epoch in range(resume_epoch, self.train_conf.max_epoch):
        model.train()
        lr_scheduler.step()
        train_iterator = iter(train_loader)

        for inner_iter in range(len(train_loader) // self.num_gpus):
            optimizer.zero_grad()

            batch_data = []
            if self.use_gpu:
                for _ in self.gpus:
                    data = next(train_iterator)
                    batch_data.append(data)

            iter_count += 1
            avg_train_loss = .0
            for ff in range(self.dataset_conf.num_fwd_pass):
                batch_fwd = []

                if self.use_gpu:
                    for dd, gpu_id in enumerate(self.gpus):
                        data = {}
                        data['adj'] = batch_data[dd][ff]['adj'].pin_memory().to(gpu_id, non_blocking=True)
                        data['edges'] = batch_data[dd][ff]['edges'].pin_memory().to(gpu_id, non_blocking=True)
                        data['node_idx_gnn'] = batch_data[dd][ff]['node_idx_gnn'].pin_memory().to(gpu_id, non_blocking=True)
                        data['node_idx_feat'] = batch_data[dd][ff]['node_idx_feat'].pin_memory().to(gpu_id, non_blocking=True)
                        data['label'] = batch_data[dd][ff]['label'].pin_memory().to(gpu_id, non_blocking=True)
                        data['att_idx'] = batch_data[dd][ff]['att_idx'].pin_memory().to(gpu_id, non_blocking=True)
                        data['subgraph_idx'] = batch_data[dd][ff]['subgraph_idx'].pin_memory().to(gpu_id, non_blocking=True)
                        batch_fwd.append((data,))

                if batch_fwd:
                    train_loss = model(*batch_fwd).mean()
                    avg_train_loss += train_loss

                    # accumulate gradient over forward passes
                    train_loss.backward()

            # clip_grad_norm_(model.parameters(), 5.0e-0)
            optimizer.step()
            avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

            # reduce
            train_loss = float(avg_train_loss.data.cpu().numpy())
            self.writer.add_scalar('train_loss', train_loss, iter_count)
            results['train_loss'] += [train_loss]
            results['train_step'] += [iter_count]

            if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                logger.info("NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(
                    epoch + 1, iter_count, train_loss))

        # snapshot model
        if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
            logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
            snapshot(
                model.module if self.use_gpu else model,
                optimizer,
                self.config,
                epoch + 1,
                scheduler=lr_scheduler)

    pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
    self.writer.close()
    return 1
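
# A minimal sketch of the pinned-memory transfer idiom used above:
# .pin_memory() places the CPU tensor in page-locked memory, which lets
# .to(device, non_blocking=True) overlap the host-to-device copy with
# other work on the GPU. Guarded so it also runs on CPU-only machines;
# tensor sizes are arbitrary.
import torch

x = torch.randn(1024, 1024)
if torch.cuda.is_available():
    x = x.pin_memory().to('cuda:0', non_blocking=True)
    torch.cuda.synchronize()  # wait for the async copy before consuming x
print(x.device)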
def train(self):
    # create data loader
    train_dataset = BinaryMNIST(
        self.dataset_conf.path,
        num_imgs=self.dataset_conf.num_imgs,
        train=True,
        transform=transforms.ToTensor(),
        download=True)
    val_dataset = BinaryMNIST(
        self.dataset_conf.path,
        num_imgs=self.dataset_conf.num_imgs,
        train=False,
        transform=transforms.ToTensor(),
        download=True)
    train_loader = DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=self.train_conf.shuffle,
        num_workers=self.train_conf.num_workers,
        drop_last=False)
    val_loader = DataLoader(
        val_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=False,
        num_workers=self.train_conf.num_workers,
        drop_last=False)

    # create model
    model = eval(self.model_conf.name)(self.config)

    # create optimizer
    params = model.parameters()
    if self.train_conf.optimizer == 'SGD':
        optimizer = optim.SGD(
            params,
            lr=self.train_conf.lr,
            momentum=self.train_conf.momentum,
            weight_decay=self.train_conf.wd)
    elif self.train_conf.optimizer == 'Adam':
        optimizer = optim.Adam(
            params,
            lr=self.train_conf.lr,
            weight_decay=self.train_conf.wd)
    else:
        raise ValueError("Non-supported optimizer!")

    lr_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=self.train_conf.lr_decay_steps,
        gamma=self.train_conf.lr_decay)

    # reset gradient
    optimizer.zero_grad()

    # resume training
    if self.train_conf.is_resume:
        load_model(model, self.train_conf.resume_model, optimizer=optimizer)

    if self.use_gpu:
        model = nn.DataParallel(model, device_ids=self.gpus).cuda()

    # Training Loop
    iter_count = 0
    results = defaultdict(list)
    for epoch in range(self.train_conf.max_epoch):
        # validation
        if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:
            model.eval()
            val_loss = []
            val_counter = 0

            for imgs, labels in val_loader:
                if self.use_gpu:
                    imgs, labels = imgs.cuda(), labels.cuda()
                imgs, labels = imgs.float(), labels.float()

                # corrupt the inputs and check recovery from memory
                imgs_corrupt = self.rand_corrupt(
                    imgs, corrupt_level=self.dataset_conf.corrupt_level)
                curr_loss, imgs_memory, _, _ = model(imgs_corrupt)

                img_recover = imgs_memory[-self.model_conf.input_dim:]
                img_recover_show = img_recover.clone().detach()
                img_recover_show.requires_grad = False
                img_recover_show[img_recover_show >= 0.5] = 1.0
                img_recover_show[img_recover_show < 0.5] = 0.0

                val_loss += [float(curr_loss.data.cpu().numpy())]
                val_counter += 1

            val_loss = float(np.mean(val_loss))
            logger.info("Avg. Validation Loss (log10) = {}".format(np.log10(val_loss)))
            results['val_loss'] += [val_loss]
            model.train()

        # training
        lr_scheduler.step()
        for imgs, labels in train_loader:
            if self.use_gpu:
                imgs, labels = imgs.cuda(), labels.cuda()
            imgs, labels = imgs.float(), labels.float()

            optimizer.zero_grad()
            train_loss, imgs_memory, diff_norm, grad = model(imgs)

            # assign the gradients returned by the model to the parameters
            for pp, ww in zip(model.parameters(), grad):
                pp.grad = ww
            optimizer.step()

            train_loss = float(train_loss.data.cpu().numpy())
            results['train_loss'] += [train_loss]
            results['train_step'] += [iter_count]

            # display loss
            if iter_count % self.train_conf.display_iter == 0:
                logger.info("Loss (log10) @ epoch {:04d} iteration {:08d} = {}".format(
                    epoch + 1, iter_count + 1, np.log10(train_loss)))
                tmp_key = 'diff_norm_{}'.format(iter_count + 1)
                results[tmp_key] = diff_norm

            iter_count += 1

        # snapshot model
        if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
            logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
            snapshot(model.module if self.use_gpu else model, optimizer, self.config, epoch + 1)

    pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
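
# self.rand_corrupt is defined elsewhere in the runner; below is a
# hypothetical stand-in (an assumption, not the original implementation)
# that replaces each binary pixel with a random 0/1 value with probability
# corrupt_level -- one common way to corrupt inputs for a denoising /
# associative-memory validation pass like the one above.
import torch

def rand_corrupt(imgs, corrupt_level=0.1):
    """imgs: float tensor in [0, 1]; returns a corrupted copy."""
    mask = torch.rand_like(imgs) < corrupt_level   # positions to corrupt
    noise = torch.rand_like(imgs).round()          # random 0/1 pixels
    return torch.where(mask, noise, imgs)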
def train(self):
    # create data loader
    train_dataset = eval(self.dataset_conf.loader_name)(self.config, split='train')
    dev_dataset = eval(self.dataset_conf.loader_name)(self.config, split='dev')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=self.train_conf.shuffle,
        num_workers=self.train_conf.num_workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=False)
    subset_indices = range(self.subsample_size)
    train_loader_sub = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=False,
        num_workers=self.train_conf.num_workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=False,
        sampler=SubsetRandomSampler(subset_indices))
    dev_loader_sub = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=False,
        num_workers=self.train_conf.num_workers,
        collate_fn=dev_dataset.collate_fn,
        drop_last=False,
        sampler=SubsetRandomSampler(subset_indices))

    # create models
    model = eval(self.model_conf.name)(self.model_conf)
    if self.use_gpu:
        model = nn.DataParallel(model, device_ids=self.gpus).cuda()

    # create optimizer
    params = filter(lambda p: p.requires_grad, model.parameters())
    if self.train_conf.optimizer == 'SGD':
        optimizer = optim.SGD(
            params,
            lr=self.train_conf.lr,
            momentum=self.train_conf.momentum,
            weight_decay=self.train_conf.wd)
    elif self.train_conf.optimizer == 'Adam':
        optimizer = optim.Adam(
            params,
            lr=self.train_conf.lr,
            weight_decay=self.train_conf.wd)
    else:
        raise ValueError("Non-supported optimizer!")

    early_stop = EarlyStopper([0.0], win_size=10, is_decrease=False)
    lr_scheduler = get_constant_schedule_with_warmup(
        optimizer, num_warmup_steps=self.warmup_setps)

    # reset gradient
    optimizer.zero_grad()

    # resume training or start from a pretrained model
    if self.train_conf.is_resume:
        model_snapshot = torch.load(self.train_conf.resume_model, map_location=self.device)
        # a pretrained checkpoint may only partially match the model,
        # so loading is non-strict in that case
        model.load_state_dict(model_snapshot["model"], strict=not self.train_conf.pretrain)
        model.to(self.device)

    # Training Loop
    num_train = len(train_dataset)
    iter_count = 0
    best_val_loss = np.inf
    best_val_loss_test = np.inf
    best_win_pct_val = 0
    best_win_pct_val_test = 0
    results = defaultdict(list)
    for epoch in range(self.train_conf.max_epoch):
        # ----------------------------- validation -----------------------------
        if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:
            # calculate validation loss
            model.eval()
            with torch.no_grad():
                result_dataset_val = self.cal_dataset_loss(model, dev_loader_sub)

            if self.is_val:
                logger.info(
                    "-----------------Avg. Validation Loss = {:.4f}, "
                    "NMLL = {:.4f}, NMLL_opt = {:.4f}, Win_pct = {:.2f}%, "
                    "NMLL_test = {:.4f}, NMLL_test_opt = {:.4f}, "
                    "Win_pct_test = {:.2f}%--------------------".format(
                        result_dataset_val['loss'],
                        result_dataset_val['nmll'],
                        result_dataset_val['nmll_opt_sm'],
                        result_dataset_val['win_pct_ai'] * 100,
                        result_dataset_val['nmll_test'],
                        result_dataset_val['nmll_opt_sm_test'],
                        result_dataset_val['win_pct_ai_test'] * 100))
                self.writer.add_scalar('nmll_opt_val', result_dataset_val['nmll_opt_sm'], iter_count)
                self.writer.add_scalar('nmll_opt_test_val', result_dataset_val['nmll_opt_sm_test'], iter_count)
                self.writer.add_scalar('win_pct_ai_val', result_dataset_val['win_pct_ai'], iter_count)
                self.writer.add_scalar('win_pct_ai_test_val', result_dataset_val['win_pct_ai_test'], iter_count)
            else:
                logger.info(
                    "-----------------Avg. Validation Loss = {:.4f}, "
                    "NMLL = {:.4f}, NMLL_orig = {:.4f}, Win_pct = {:.2f}%, "
                    "NMLL_test = {:.4f}, NMLL_test_orig = {:.4f}, "
                    "Win_pct_test = {:.2f}%--------------------".format(
                        result_dataset_val['loss'],
                        result_dataset_val['nmll'],
                        result_dataset_val['nmll_orig'],
                        result_dataset_val['win_pct'] * 100,
                        result_dataset_val['nmll_test'],
                        result_dataset_val['nmll_test_orig'],
                        result_dataset_val['win_pct_test'] * 100))
                self.writer.add_scalar('val_loss', result_dataset_val['loss'], iter_count)
                self.writer.add_scalar('nmll_loss_val', result_dataset_val['nmll'], iter_count)
                self.writer.add_scalar('nmll_loss_orig_val', result_dataset_val['nmll_orig'], iter_count)
                self.writer.add_scalar('nmll_loss_test_val', result_dataset_val['nmll_test'], iter_count)
                self.writer.add_scalar('nmll_loss_test_orig_val', result_dataset_val['nmll_test_orig'], iter_count)
                self.writer.add_scalar('win_pct_val', result_dataset_val['win_pct'], iter_count)
                self.writer.add_scalar('win_pct_val_test', result_dataset_val['win_pct_test'], iter_count)

                results['val_loss'] += [result_dataset_val['loss']]
                results['nmll_loss_val'] += [result_dataset_val['nmll']]
                results['nmll_loss_orig_val'] += [result_dataset_val['nmll_orig']]
                results['nmll_loss_test_val'] += [result_dataset_val['nmll_test']]
                results['nmll_loss_test_orig_val'] += [result_dataset_val['nmll_test_orig']]
                results['win_pct_val'] += [result_dataset_val['win_pct']]
                results['win_pct_val_test'] += [result_dataset_val['win_pct_test']]

            # save best model
            if result_dataset_val['loss'] < best_val_loss:
                best_val_loss = result_dataset_val['loss']
                best_val_loss_test = result_dataset_val['nmll_test']
                if self.is_val:
                    best_win_pct_val = result_dataset_val['win_pct_ai']
                    best_win_pct_val_test = result_dataset_val['win_pct_ai_test']
                else:
                    best_win_pct_val = result_dataset_val['win_pct']
                    best_win_pct_val_test = result_dataset_val['win_pct_test']
                snapshot(
                    model.module if self.use_gpu else model,
                    optimizer,
                    self.config,
                    epoch + 1,
                    tag='best')

            logger.info("Current Best Validation Loss = {:.4f}".format(best_val_loss))

            # check early stop
            if early_stop.tick([result_dataset_val['loss']]):
                snapshot(
                    model.module if self.use_gpu else model,
                    optimizer,
                    self.config,
                    epoch + 1,
                    tag='last')
                self.writer.close()
                break

        # ----------------------------- training -----------------------------
        model.train()
        for data in train_loader:
            optimizer.zero_grad()

            if self.use_gpu:
                (data['max_node_size'], data['X_data_tr'], data['X_data_val'],
                 data['X_data_test'], data['F_tr'], data['F_val'],
                 data['F_test'], data['N_val'], data['kernel_mask_val'],
                 data['diagonal_mask_val'], data['N_test'],
                 data['kernel_mask_test'], data['diagonal_mask_test'],
                 data['node_mask_tr'], data['dim_mask'], data['nmll'],
                 data['dim_size']) = data_to_gpu(
                     data['max_node_size'], data['X_data_tr'], data['X_data_val'],
                     data['X_data_test'], data['F_tr'], data['F_val'],
                     data['F_test'], data['N_val'], data['kernel_mask_val'],
                     data['diagonal_mask_val'], data['N_test'],
                     data['kernel_mask_test'], data['diagonal_mask_test'],
                     data['node_mask_tr'], data['dim_mask'], data['nmll'],
                     data['dim_size'])

            if self.model_conf.name == 'GpSMDoubleAtt':
                mu, var, weights, nmll, nmll_test = model(
                    data['X_data_tr'], data['X_data_val'], data['F_tr'],
                    data['F_val'], data['node_mask_tr'], data['dim_mask'],
                    data['kernel_mask_val'], data['diagonal_mask_val'],
                    data['N_val'],
                    device=self.device,
                    eval_mode=True,
                    X_data_test=data['X_data_test'],
                    F_data_test=data['F_test'],
                    kernel_mask_test=data['kernel_mask_test'],
                    diagonal_mask_test=data['diagonal_mask_test'],
                    N_data_test=data['N_test'])
            elif self.model_conf.name == 'GpSMDoubleAttNoMu':
                var, weights, nmll, nmll_test = model(
                    data['X_data_tr'], data['X_data_val'], data['F_tr'],
                    data['F_val'], data['node_mask_tr'], data['dim_mask'],
                    data['kernel_mask_val'], data['diagonal_mask_val'],
                    data['N_val'],
                    device=self.device,
                    eval_mode=True,
                    X_data_test=data['X_data_test'],
                    F_data_test=data['F_test'],
                    kernel_mask_test=data['kernel_mask_test'],
                    diagonal_mask_test=data['diagonal_mask_test'],
                    N_data_test=data['N_test'])
            else:
                raise ValueError("No model of given name!")

            # print("Outside: input size", data['X_data'].shape, "output_size", nmll.shape)
            nmll_orig = data['nmll']
            win_pct_train = torch.sum(nmll < nmll_orig + 0.01).float() / nmll.shape[0]
            data_dim_vec = data['X_data_tr'].shape[-1]
            nmll_loss_train = torch.mean(nmll)
            train_loss = nmll_loss_train

            # calculate gradient
            train_loss.backward()
            nmll_loss_orig = torch.mean(nmll_orig)

            # calculate gradient norm
            grad_norm = 0
            for p in model.parameters():
                if p.requires_grad:
                    param_norm = p.grad.data.norm()
                    grad_norm += param_norm.item() ** 2
            grad_norm = grad_norm ** (1. / 2)

            nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

            train_loss = float(train_loss.data.cpu().numpy())
            nmll_loss_train = float(nmll_loss_train.data.cpu().numpy())
            nmll_loss_train_orig = float(nmll_loss_orig.data.cpu().numpy())
            win_pct_train = float(win_pct_train.data.cpu().numpy())
            self.writer.add_scalar('train_loss', train_loss, iter_count)
            self.writer.add_scalar('nmll_loss_train', nmll_loss_train, iter_count)
            self.writer.add_scalar('nmll_loss_train_orig', nmll_loss_train_orig, iter_count)
            self.writer.add_scalar('win_pct_train', win_pct_train, iter_count)
            self.writer.add_scalar('grad_norm', grad_norm, iter_count)
            results['nmll_loss_train'] += [nmll_loss_train]
            results['nmll_loss_train_orig'] += [nmll_loss_train_orig]
            results['train_loss'] += [train_loss]
            results['win_pct_train'] += [win_pct_train]
            results['train_step'] += [iter_count]
            results['grad_norm'] += [grad_norm]

            # display loss
            if (iter_count + 1) % self.train_conf.display_iter == 0:
                logger.info(
                    "Loss @ epoch {:04d} iteration {:08d} = {:.4f}, NMLL = {:.4f}, "
                    "NMLL_orig = {:.4f}, Win_pct = {:.2f}%, Grad_norm = {:.4f}, "
                    "LR = {:.2e}".format(
                        epoch + 1, iter_count + 1, train_loss, nmll_loss_train,
                        nmll_loss_train_orig, win_pct_train * 100, grad_norm,
                        get_lr(optimizer)))

            iter_count += 1

        # snapshot model
        if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
            logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
            snapshot(model.module if self.use_gpu else model, optimizer, self.config, epoch + 1)

        lr_scheduler.step()

    # look at predictions, for debugging purposes
    model.eval()
    with torch.no_grad():
        results_sample_tr = self.cal_sample_result(model, train_loader_sub)
        results_sample_dev = self.cal_sample_result(model, dev_loader_sub)
        result_dataset_tr = self.cal_dataset_loss(model, train_loader_sub)
        result_dataset_dev = self.cal_dataset_loss(model, dev_loader_sub)

    train_loss = result_dataset_tr['loss']
    results['best_val_loss'] = best_val_loss
    results['win_count_tr'] = results_sample_tr['win_pct']
    results['win_count_dev'] = results_sample_dev['win_pct']
    results['nmll_loss_sample_tr'] = results_sample_tr['nmll_loss_sample']
    results['nmll_loss_sample_dev'] = results_sample_dev['nmll_loss_sample']
    pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
    self.writer.close()

    logger.info(
        "Best Validation Loss = {:.4f}, "
        "Best Win_pct_val = {:.2f}%, "
        "Best Val Loss on Test = {:.4f}, "
        "Best Win_pct_val_test = {:.2f}%, "
        "Final Training NMLL = {:.4f}, "
        "Training NMLL original = {:.4f}, "
        "Win_pct_train = {:.2f}%, "
        "Final Dev NMLL = {:.4f}, "
        "Dev NMLL original = {:.4f}, "
        "Win_pct_dev = {:.2f}%, "
        "Final Dev Test NMLL = {:.4f}, "
        "Dev Test NMLL original = {:.4f}, "
        "Win_pct_test_dev = {:.2f}%.".format(
            best_val_loss, best_win_pct_val * 100, best_val_loss_test,
            best_win_pct_val_test * 100, result_dataset_tr['nmll'],
            result_dataset_tr['nmll_orig'], result_dataset_tr['win_pct'] * 100,
            result_dataset_dev['nmll'], result_dataset_dev['nmll_orig'],
            result_dataset_dev['win_pct'] * 100, result_dataset_dev['nmll_test'],
            result_dataset_dev['nmll_test_orig'],
            result_dataset_dev['win_pct_test'] * 100))

    avg_nmll_tr = np.mean(results_sample_tr['nmll_sample_compare'], 0)
    logger.info('% of GPs with higher marginal likelihood = {:.2f}%'.format(
        results_sample_tr['win_pct'] * 100))
    logger.info('Average NMLL on training samples: true = {}, learned = {}'.format(
        avg_nmll_tr[1], avg_nmll_tr[0]))
    avg_nmll_dev = np.mean(results_sample_dev['nmll_sample_compare'], 0)
    logger.info('% of GPs with higher marginal likelihood = {:.2f}%'.format(
        results_sample_dev['win_pct'] * 100))
    logger.info('Average NMLL on testing samples: true = {}, learned = {}'.format(
        avg_nmll_dev[1], avg_nmll_dev[0]))

    snapshot(
        model.module if self.use_gpu else model,
        optimizer,
        self.config,
        self.train_conf.max_epoch + 1,
        tag='final')
    return None
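
# The manual gradient-norm loop above can be folded into the clipping
# call: torch.nn.utils.clip_grad_norm_ returns the total norm of the
# parameters' gradients (computed before clipping), so one call yields
# both the clipped gradients and the value being logged. A minimal sketch
# on a toy model, not a drop-in edit of the runner above.
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()
grad_norm = float(nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0))
print(grad_norm)  # same quantity as the hand-rolled sum of squared norms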
def train(self):
    # create data loader
    train_dataset = eval(self.dataset_conf.loader_name)(self.config, split='train')
    dev_dataset = eval(self.dataset_conf.loader_name)(self.config, split='dev')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=self.train_conf.shuffle,
        num_workers=self.train_conf.num_workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=False)
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=False,
        num_workers=self.train_conf.num_workers,
        collate_fn=dev_dataset.collate_fn,
        drop_last=False)

    # create models
    model = eval(self.model_conf.name)(self.config)
    if self.use_gpu:
        model = nn.DataParallel(model, device_ids=self.gpus).cuda()

    # create optimizer
    params = filter(lambda p: p.requires_grad, model.parameters())
    if self.train_conf.optimizer == 'SGD':
        optimizer = optim.SGD(
            params,
            lr=self.train_conf.lr,
            momentum=self.train_conf.momentum,
            weight_decay=self.train_conf.wd)
    elif self.train_conf.optimizer == 'Adam':
        optimizer = optim.Adam(
            params,
            lr=self.train_conf.lr,
            weight_decay=self.train_conf.wd)
    else:
        raise ValueError("Non-supported optimizer!")

    early_stop = EarlyStopper([0.0], win_size=10, is_decrease=False)
    lr_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=self.train_conf.lr_decay_steps,
        gamma=self.train_conf.lr_decay)

    # reset gradient
    optimizer.zero_grad()

    # resume training
    if self.train_conf.is_resume:
        load_model(model, self.train_conf.resume_model, optimizer=optimizer)

    # Training Loop
    iter_count = 0
    best_val_loss = np.inf
    results = defaultdict(list)
    for epoch in range(self.train_conf.max_epoch):
        # validation
        if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:
            model.eval()
            val_loss = []

            for data in tqdm(dev_loader):
                if self.use_gpu:
                    data['node_feat'], data['node_mask'], data['label'] = data_to_gpu(
                        data['node_feat'], data['node_mask'], data['label'])

                    if self.model_conf.name == 'LanczosNet':
                        data['L'], data['D'], data['V'] = data_to_gpu(
                            data['L'], data['D'], data['V'])
                    elif self.model_conf.name == 'GraphSAGE':
                        data['nn_idx'], data['nonempty_mask'] = data_to_gpu(
                            data['nn_idx'], data['nonempty_mask'])
                    elif self.model_conf.name == 'GPNN':
                        data['L'], data['L_cluster'], data['L_cut'] = data_to_gpu(
                            data['L'], data['L_cluster'], data['L_cut'])
                    else:
                        data['L'] = data_to_gpu(data['L'])[0]

                with torch.no_grad():
                    if self.model_conf.name == 'AdaLanczosNet':
                        pred, _ = model(
                            data['node_feat'], data['L'],
                            label=data['label'], mask=data['node_mask'])
                    elif self.model_conf.name == 'LanczosNet':
                        pred, _ = model(
                            data['node_feat'], data['L'], data['D'], data['V'],
                            label=data['label'], mask=data['node_mask'])
                    elif self.model_conf.name == 'GraphSAGE':
                        pred, _ = model(
                            data['node_feat'], data['nn_idx'], data['nonempty_mask'],
                            label=data['label'], mask=data['node_mask'])
                    elif self.model_conf.name == 'GPNN':
                        pred, _ = model(
                            data['node_feat'], data['L'], data['L_cluster'], data['L_cut'],
                            label=data['label'], mask=data['node_mask'])
                    else:
                        pred, _ = model(
                            data['node_feat'], data['L'],
                            label=data['label'], mask=data['node_mask'])

                curr_loss = (pred - data['label']).abs().cpu().numpy() * self.const_factor
                val_loss += [curr_loss]

            val_loss = float(np.mean(np.concatenate(val_loss)))
            logger.info("Avg. Validation MAE = {}".format(val_loss))
            self.writer.add_scalar('val_loss', val_loss, iter_count)
            results['val_loss'] += [val_loss]

            # save best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                snapshot(model.module if self.use_gpu else model, optimizer,
                         self.config, epoch + 1, tag='best')

            logger.info("Current Best Validation MAE = {}".format(best_val_loss))

            # check early stop
            if early_stop.tick([val_loss]):
                snapshot(model.module if self.use_gpu else model, optimizer,
                         self.config, epoch + 1, tag='last')
                self.writer.close()
                break

        # training
        model.train()
        lr_scheduler.step()
        for data in train_loader:
            optimizer.zero_grad()

            if self.use_gpu:
                data['node_feat'], data['node_mask'], data['label'] = data_to_gpu(
                    data['node_feat'], data['node_mask'], data['label'])

                if self.model_conf.name == 'LanczosNet':
                    data['L'], data['D'], data['V'] = data_to_gpu(
                        data['L'], data['D'], data['V'])
                elif self.model_conf.name == 'GraphSAGE':
                    data['nn_idx'], data['nonempty_mask'] = data_to_gpu(
                        data['nn_idx'], data['nonempty_mask'])
                elif self.model_conf.name == 'GPNN':
                    data['L'], data['L_cluster'], data['L_cut'] = data_to_gpu(
                        data['L'], data['L_cluster'], data['L_cut'])
                else:
                    data['L'] = data_to_gpu(data['L'])[0]

            if self.model_conf.name == 'AdaLanczosNet':
                _, train_loss = model(
                    data['node_feat'], data['L'],
                    label=data['label'], mask=data['node_mask'])
            elif self.model_conf.name == 'LanczosNet':
                _, train_loss = model(
                    data['node_feat'], data['L'], data['D'], data['V'],
                    label=data['label'], mask=data['node_mask'])
            elif self.model_conf.name == 'GraphSAGE':
                _, train_loss = model(
                    data['node_feat'], data['nn_idx'], data['nonempty_mask'],
                    label=data['label'], mask=data['node_mask'])
            elif self.model_conf.name == 'GPNN':
                _, train_loss = model(
                    data['node_feat'], data['L'], data['L_cluster'], data['L_cut'],
                    label=data['label'], mask=data['node_mask'])
            else:
                _, train_loss = model(
                    data['node_feat'], data['L'],
                    label=data['label'], mask=data['node_mask'])

            # assign gradient
            train_loss.backward()
            optimizer.step()

            train_loss = float(train_loss.data.cpu().numpy())
            self.writer.add_scalar('train_loss', train_loss, iter_count)
            results['train_loss'] += [train_loss]
            results['train_step'] += [iter_count]

            # display loss
            if (iter_count + 1) % self.train_conf.display_iter == 0:
                logger.info("Loss @ epoch {:04d} iteration {:08d} = {}".format(
                    epoch + 1, iter_count + 1, train_loss))

            iter_count += 1

        # snapshot model
        if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
            logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
            snapshot(model.module if self.use_gpu else model, optimizer, self.config, epoch + 1)

    results['best_val_loss'] += [best_val_loss]
    pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
    self.writer.close()

    logger.info("Best Validation MAE = {}".format(best_val_loss))
    return best_val_loss
def train(self):
    torch.autograd.set_detect_anomaly(True)

    ### create data loader
    train_dataset = eval(self.dataset_conf.loader_name)(
        self.config, self.graphs_train, tag='train')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=self.train_conf.shuffle,  # true for grid
        num_workers=self.train_conf.num_workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=False)

    # create models
    model = eval(self.model_conf.name)(self.config)
    criterion = nn.BCEWithLogitsLoss()
    if self.use_gpu:
        model = DataParallel(model, device_ids=self.gpus).to(self.device)
        criterion = criterion.cuda()
    model.train()

    # create optimizer
    params = filter(lambda p: p.requires_grad, model.parameters())
    if self.train_conf.optimizer == 'SGD':
        optimizer = optim.SGD(
            params,
            lr=self.train_conf.lr,
            momentum=self.train_conf.momentum,
            weight_decay=self.train_conf.wd)
    elif self.train_conf.optimizer == 'Adam':
        optimizer = optim.Adam(
            params,
            lr=self.train_conf.lr,
            weight_decay=self.train_conf.wd)
    else:
        raise ValueError("Non-supported optimizer!")

    # TODO: not used?
    early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
    lr_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=self.train_conf.lr_decay_epoch,
        gamma=self.train_conf.lr_decay)

    # reset gradient
    optimizer.zero_grad()
    best_acc = 0.

    # resume training
    # TODO: record resume_epoch to the saved file
    resume_epoch = 0
    if self.train_conf.is_resume:
        model_file = os.path.join(self.train_conf.resume_dir, self.train_conf.resume_model)
        load_model(
            model.module if self.use_gpu else model,
            model_file,
            self.device,
            optimizer=optimizer,
            scheduler=lr_scheduler)
        resume_epoch = self.train_conf.resume_epoch

    # Training Loop
    iter_count = 0
    results = defaultdict(list)
    for epoch in range(resume_epoch, self.train_conf.max_epoch):
        model.train()
        train_iterator = iter(train_loader)
        avg_acc_whole_epoch = 0.
        cnt = 0.

        for inner_iter in range(len(train_loader) // self.num_gpus):
            optimizer.zero_grad()

            batch_data = []
            if self.use_gpu:
                for _ in self.gpus:
                    data = next(train_iterator)
                    batch_data.append(data)

            iter_count += 1
            avg_train_loss = .0
            avg_acc = 0.
            for ff in range(self.dataset_conf.num_fwd_pass):
                batch_fwd = []

                if self.use_gpu:
                    for dd, gpu_id in enumerate(self.gpus):
                        data = {}
                        data['adj'] = batch_data[dd][ff]['adj'].pin_memory().to(gpu_id, non_blocking=True)
                        data['edges'] = batch_data[dd][ff]['edges'].pin_memory().to(gpu_id, non_blocking=True)
                        # data['node_idx_gnn'] = batch_data[dd][ff]['node_idx_gnn'].pin_memory().to(gpu_id, non_blocking=True)
                        # data['node_idx_feat'] = batch_data[dd][ff]['node_idx_feat'].pin_memory().to(gpu_id, non_blocking=True)
                        # data['label'] = batch_data[dd][ff]['label'].pin_memory().to(gpu_id, non_blocking=True)
                        # data['att_idx'] = batch_data[dd][ff]['att_idx'].pin_memory().to(gpu_id, non_blocking=True)
                        data['subgraph_idx'] = batch_data[dd][ff]['subgraph_idx'].pin_memory().to(gpu_id, non_blocking=True)
                        data['complete_graph_label'] = batch_data[dd][ff]['complete_graph_label'].pin_memory().to(gpu_id, non_blocking=True)
                        batch_fwd.append((data,))

                pred = model(*batch_fwd)
                label = data['complete_graph_label'][:, None]
                train_loss = criterion(pred, label).mean()
                train_loss.backward()

                # threshold the sigmoid output to compute accuracy
                pred = (torch.sigmoid(pred) > 0.5).type_as(label)
                avg_acc += (pred.eq(label)).float().mean().item()
                avg_train_loss += train_loss.item()

            # clip_grad_norm_(model.parameters(), 5.0e-0)
            optimizer.step()
            lr_scheduler.step()

            avg_train_loss /= self.dataset_conf.num_fwd_pass  # num_fwd_pass always 1
            avg_acc /= self.dataset_conf.num_fwd_pass
            avg_acc_whole_epoch += avg_acc
            cnt += 1.  # avg_acc is already a per-batch mean, so count iterations

            # reduce
            self.writer.add_scalar('train_loss', avg_train_loss, iter_count)
            self.writer.add_scalar('train_acc', avg_acc, iter_count)
            results['train_loss'] += [avg_train_loss]
            results['train_acc'] += [avg_acc]
            results['train_step'] += [iter_count]

            # if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
            #     logger.info("NLL Loss @ epoch {:04d} iteration {:08d} = {}\tAcc = {}".format(
            #         epoch + 1, iter_count, train_loss, avg_acc))

        avg_acc_whole_epoch /= cnt
        is_new_best = avg_acc_whole_epoch > best_acc
        if is_new_best:
            logger.info('!!! New best')
            best_acc = avg_acc_whole_epoch
        logger.info("Avg acc = {} @ epoch {:04d}".format(avg_acc_whole_epoch, epoch + 1))

        # snapshot model
        if (epoch + 1) % self.train_conf.snapshot_epoch == 0 or is_new_best:
            logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
            snapshot(
                model.module if self.use_gpu else model,
                optimizer,
                self.config,
                epoch + 1,
                scheduler=lr_scheduler)

    pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
    self.writer.close()
    return 1
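
# A minimal sketch of the prediction path used above: BCEWithLogitsLoss
# consumes raw logits (it fuses the sigmoid for numerical stability), so
# accuracy is computed by applying sigmoid and thresholding at 0.5 --
# equivalently, thresholding the logit at 0. Toy tensors only.
import torch
import torch.nn as nn

logits = torch.tensor([[2.0], [-1.0], [0.3]])
labels = torch.tensor([[1.0], [0.0], [0.0]])

loss = nn.BCEWithLogitsLoss()(logits, labels)
pred = (torch.sigmoid(logits) > 0.5).float()   # same as (logits > 0).float()
acc = pred.eq(labels).float().mean()
print(loss.item(), acc.item())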
def train(self):
    ### create data loader
    train_dataset = eval(self.dataset_conf.loader_name)(
        self.config, self.graphs_train, tag='train')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=self.train_conf.shuffle,  # true for grid
        num_workers=self.train_conf.num_workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=False)

    # create models
    # model = eval(self.model_conf.name)(self.config)
    args = self.config.model
    n_labels = self.dataset_conf.max_m + self.dataset_conf.max_n
    G = define_G(args.nz, args.ngf, args.netG, args.final_activation, args.norm_G)
    D = define_D(args.ndf, args.netD, norm=args.norm_D)

    ### define losses
    criterionGAN = GANLoss(args.gan_mode)
    rote_loss = nn.L1Loss(reduction='none')
    if args.sparsity > 0.:
        sparse_loss = nn.L1Loss()

    if self.use_gpu:
        # G = DataParallel(G).to(self.device)
        # D = DataParallel(D).to(self.device)
        G = G.cuda()
        D = D.cuda()
        criterionGAN = criterionGAN.to(self.device)
        rote_loss = rote_loss.cuda()
        if args.sparsity > 0.:
            sparse_loss = sparse_loss.cuda()

    G.train()
    D.train()

    # create optimizers
    G_params = filter(lambda p: p.requires_grad, G.parameters())
    D_params = filter(lambda p: p.requires_grad, D.parameters())
    optimizer_G = optim.Adam(G_params, lr=self.train_conf.lr, betas=(self.train_conf.beta1, 0.999))
    optimizer_D = optim.Adam(D_params, lr=self.train_conf.lr, betas=(self.train_conf.beta1, 0.999))
    fake_pool = ImagePool(args.pool_size)

    # resume training
    # TODO: record resume_epoch to the saved file
    resume_epoch = 0
    if self.train_conf.is_resume:
        model_file_G = os.path.join(self.train_conf.resume_dir, 'G_' + self.train_conf.resume_model)
        model_file_D = os.path.join(self.train_conf.resume_dir, 'D_' + self.train_conf.resume_model)
        load_model(G, model_file_G, self.device, optimizer=optimizer_G)
        load_model(D, model_file_D, self.device, optimizer=optimizer_D)
        # recover the epoch index from the checkpoint file name
        # (originally: self.train_conf.resume_epoch)
        resume_epoch = int(osp.splitext(self.train_conf.resume_model)[0].split('_')[-1])

    # Training Loop
    iter_count = 0  # iteration index throughout the whole training run
    results = defaultdict(list)
    for epoch in range(resume_epoch, self.train_conf.max_epoch):
        train_iterator = iter(train_loader)

        for batch_data in train_iterator:
            set_requires_grad(D, False)
            # set_requires_grad(G, True)
            optimizer_G.zero_grad()
            iter_count += 1

            ff = 0  # num_fwd_pass == 1 is asserted in the arg helper
            data = {}
            data['adj'] = batch_data[ff]['adj'].pin_memory().to(self.config.device, non_blocking=True)
            data['m'] = batch_data[ff]['m'].to(self.config.device, non_blocking=True)
            data['n'] = batch_data[ff]['n'].to(self.config.device, non_blocking=True)
            batch_size = data['adj'].size(0)

            # one-hot encode the (m, n) grid-size labels
            i_onehot = torch.zeros(
                (batch_size, self.dataset_conf.max_m),
                requires_grad=True).pin_memory().to(self.config.device, non_blocking=True)
            i_onehot.scatter_(1, data['m'][:, None] - 1, 1)
            j_onehot = torch.zeros(
                (batch_size, self.dataset_conf.max_n),
                requires_grad=True).pin_memory().to(self.config.device, non_blocking=True)
            j_onehot.scatter_(1, data['n'][:, None] - 1, 1)
            y_onehot = torch.cat((i_onehot, j_onehot), dim=1)

            if args.nz > n_labels:
                noise = torch.randn(
                    (batch_size, args.nz - n_labels, 1, 1),
                    requires_grad=True).to(self.config.device, non_blocking=True)
                z_input = torch.cat((y_onehot.view(batch_size, n_labels, 1, 1), noise), dim=1)
            else:
                z_input = y_onehot.view(batch_size, n_labels, 1, 1)

            output = G(z_input)  # (B, 1, n, n)
            if self.model_conf.is_sym:
                output = torch.tril(output, diagonal=-1)
                output = output + output.transpose(2, 3)

            loss_G = 0.
            if args.sparsity > 0:
                loss_G_sparse = sparse_loss(output, torch.tensor(0.).expand_as(output).cuda())
                loss_G += args.sparsity * loss_G_sparse

            if args.lambda_rote > 0:
                if args.final_activation == 'tanh':
                    tmp_obj = (data['adj'] - 0.5) * 2
                else:
                    tmp_obj = data['adj']
                loss_G_rote = rote_loss(output, tmp_obj)
                rote_mask = (loss_G_rote > 0.2).type_as(loss_G_rote)
                loss_G_rote = (loss_G_rote * rote_mask).mean()
                loss_G += args.lambda_rote * loss_G_rote

            # backward G
            loss_G_GAN = criterionGAN(D(output), True)
            loss_G += loss_G_GAN
            loss_G.backward()
            optimizer_G.step()

            # backward D
            set_requires_grad(D, True)
            # set_requires_grad(G, False)
            optimizer_D.zero_grad()

            # soften the real labels so D does not saturate
            real = data['adj']
            if args.final_activation == 'sigmoid':
                ones_soft = torch.rand_like(real) * 0.1 + 0.9
                zeros_soft = torch.rand_like(real) * 0.1
            elif args.final_activation == 'tanh':
                ones_soft = torch.rand_like(real) * 0.2 + 0.8
                zeros_soft = -(torch.rand_like(real) * 0.2 + 0.8)
            ones_mask = (real == 1.)
            zeros_mask = (real == 0.)
            real[ones_mask] = ones_soft[ones_mask]
            real[zeros_mask] = zeros_soft[zeros_mask]
            if self.model_conf.is_sym:
                real = torch.tril(real, diagonal=-1)
                real = real + real.transpose(2, 3)

            pred_real = D(real)
            loss_D_real = criterionGAN(pred_real, True)

            # fake
            if args.pool_size:
                queried_fake = fake_pool.query(output.detach())
            else:
                queried_fake = output.detach()
            pred_fake = D(queried_fake)
            loss_D_fake = criterionGAN(pred_fake, False)

            # combined loss and gradients
            loss_D = (loss_D_real + loss_D_fake) * 0.5
            loss_D.backward()
            optimizer_D.step()

            # reduce
            self.writer.add_scalar('train_loss_G', loss_G.item(), iter_count)
            self.writer.add_scalar('train_loss_D', loss_D.item(), iter_count)
            # store plain floats so the stats file does not keep tensors alive
            results['train_loss_G'] += [loss_G.item()]
            results['train_loss_D'] += [loss_D.item()]
            results['train_step'] += [iter_count]

            if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                logger.info(
                    "@ epoch {:04d} iter {:08d} loss_G: {:.5f}, loss_G_GAN: {:.5f}, "
                    "loss_D: {:.5f}, loss_D_real: {:.5f}, loss_D_fake: {:.5f}".format(
                        epoch + 1, iter_count, loss_G.item(), loss_G_GAN.item(),
                        loss_D.item(), loss_D_real.item(), loss_D_fake.item()))
                if args.lambda_rote > 0:
                    logger.info("@ epoch {:04d} iter {:08d} loss_rote: {:.5f}".format(
                        epoch + 1, iter_count, loss_G_rote.item()))

        # snapshot model
        if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
            logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
            snapshot(G, optimizer_G, self.config, epoch + 1, fname_prefix='G_')
            snapshot(D, optimizer_D, self.config, epoch + 1, fname_prefix='D_')

    pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
    self.writer.close()
    return 1
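
# A minimal sketch of the soft-label trick used above for the
# discriminator's real inputs: instead of hard 1/0 targets, real entries
# are replaced with values in [0.9, 1.0) and fake entries with values in
# [0, 0.1), a common heuristic to keep D from saturating. Toy adjacency
# tensor; the ranges match the sigmoid branch above.
import torch

real = torch.randint(0, 2, (2, 1, 4, 4)).float()   # toy 0/1 adjacency
ones_soft = torch.rand_like(real) * 0.1 + 0.9
zeros_soft = torch.rand_like(real) * 0.1
soft = torch.where(real == 1., ones_soft, zeros_soft)
print(soft.min().item(), soft.max().item())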
def train(self):
    ### create data loader
    train_dataset = eval(self.dataset_conf.loader_name)(
        self.config, self.graphs_train, tag='train')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=self.train_conf.shuffle,
        num_workers=self.train_conf.num_workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=False)

    # create models
    model = eval(self.model_conf.name)(self.config)
    print('number of parameters : {}'.format(
        sum([np.prod(x.shape) for x in model.parameters()])))

    if self.use_gpu:
        model = DataParallel(model, device_ids=self.gpus).to(self.device)

    # create optimizer
    params = filter(lambda p: p.requires_grad, model.parameters())
    if self.train_conf.optimizer == 'SGD':
        optimizer = optim.SGD(
            params,
            lr=self.train_conf.lr,
            momentum=self.train_conf.momentum,
            weight_decay=self.train_conf.wd)
    elif self.train_conf.optimizer == 'Adam':
        optimizer = optim.Adam(
            params,
            lr=self.train_conf.lr,
            weight_decay=self.train_conf.wd)
    else:
        raise ValueError("Non-supported optimizer!")

    early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)

    # NOTE: the scheduler is built on a deep copy of the optimizer, so it
    # never changes the live learning rate (its step() below is also
    # commented out); it exists only so load_model/snapshot receive a
    # scheduler object.
    from copy import deepcopy
    lr_scheduler = optim.lr_scheduler.MultiStepLR(
        deepcopy(optimizer),
        milestones=self.train_conf.lr_decay_epoch,
        gamma=self.train_conf.lr_decay)

    # reset gradient
    optimizer.zero_grad()

    # resume training
    resume_epoch = 0
    if self.train_conf.is_resume:
        model_file = os.path.join(self.train_conf.resume_dir, self.train_conf.resume_model)
        load_model(
            model.module if self.use_gpu else model,
            model_file,
            self.device,
            optimizer=optimizer,
            scheduler=lr_scheduler)
        resume_epoch = self.train_conf.resume_epoch

    # Training Loop
    iter_count = 0
    results = defaultdict(list)
    for epoch in range(resume_epoch, self.train_conf.max_epoch):
        has_sampled = False
        model.train()
        # lr_scheduler.step()
        train_iterator = iter(train_loader)

        for inner_iter in range(len(train_loader) // self.num_gpus):
            optimizer.zero_grad()

            batch_data = []
            if self.use_gpu:
                for _ in self.gpus:
                    data = next(train_iterator)
                    batch_data.append(data)

            iter_count += 1
            avg_train_loss = .0
            for ff in range(self.dataset_conf.num_fwd_pass):
                batch_fwd = []

                if self.use_gpu:
                    for dd, gpu_id in enumerate(self.gpus):
                        data = {}
                        data['adj'] = batch_data[dd][ff]['adj'].pin_memory().to(gpu_id, non_blocking=True)
                        data['edges'] = batch_data[dd][ff]['edges'].pin_memory().to(gpu_id, non_blocking=True)
                        data['node_idx_gnn'] = batch_data[dd][ff]['node_idx_gnn'].pin_memory().to(gpu_id, non_blocking=True)
                        data['node_idx_feat'] = batch_data[dd][ff]['node_idx_feat'].pin_memory().to(gpu_id, non_blocking=True)
                        data['label'] = batch_data[dd][ff]['label'].pin_memory().to(gpu_id, non_blocking=True)
                        data['att_idx'] = batch_data[dd][ff]['att_idx'].pin_memory().to(gpu_id, non_blocking=True)
                        data['subgraph_idx'] = batch_data[dd][ff]['subgraph_idx'].pin_memory().to(gpu_id, non_blocking=True)
                        batch_fwd.append((data,))

                if batch_fwd:
                    train_loss = model(*batch_fwd).mean()
                    avg_train_loss += train_loss

                    # accumulate gradient over forward passes
                    train_loss.backward()

            # clip_grad_norm_(model.parameters(), 5.0e-0)
            optimizer.step()
            avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

            # reduce
            train_loss = float(avg_train_loss.data.cpu().numpy())
            self.writer.add_scalar('train_loss', train_loss, iter_count)
            results['train_loss'] += [train_loss]
            results['train_step'] += [iter_count]

            if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                logger.info("NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(
                    epoch + 1, iter_count, train_loss))

        # snapshot model
        if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
            logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
            snapshot(
                model.module if self.use_gpu else model,
                optimizer,
                self.config,
                epoch + 1,
                scheduler=lr_scheduler)

        # draw a few samples once per qualifying epoch
        if (epoch + 1) % 20 == 0 and not has_sampled:
            has_sampled = True
            print('saving graphs')
            model.eval()
            graphs_gen = [
                get_graph(aa.cpu().data.numpy())
                for aa in model.module._sampling(10)
            ]
            model.train()

            vis_graphs = []
            for gg in graphs_gen:
                CGs = [gg.subgraph(c) for c in nx.connected_components(gg)]
                CGs = sorted(CGs, key=lambda x: x.number_of_nodes(), reverse=True)
                vis_graphs += [CGs[0]]

            total = len(vis_graphs)  # min(3, len(vis_graphs))
            draw_graph_list(
                vis_graphs[:total],
                2,
                int(total // 2),
                fname='sample/gran_%d.png' % epoch,
                layout='spring')

    pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
    self.writer.close()
    return 1
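
# A minimal sketch of the post-processing used above: generated graphs are
# reduced to their largest connected component before visualization.
# networkx's connected_components yields node sets, so the largest one can
# be picked with max(..., key=len). Toy graph for illustration.
import networkx as nx

g = nx.Graph([(0, 1), (1, 2), (3, 4)])    # two components
largest = g.subgraph(max(nx.connected_components(g), key=len))
print(sorted(largest.nodes()))            # [0, 1, 2]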
def train(self):
    ### create data loader
    train_dataset = eval(self.dataset_conf.loader_name)(
        self.config, self.graphs_train, tag='train')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=self.train_conf.shuffle,
        num_workers=self.train_conf.num_workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=False)

    # create models
    # model = eval(self.model_conf.name)(self.config)
    from model.transformer import make_model
    model = make_model(
        max_node=self.config.model.max_num_nodes,
        d_out=20, N=7, d_model=64, d_ff=64, dropout=0.4)   # d_out, N, d_model, d_ff, h
        # d_out=20, N=15, d_model=16, d_ff=16, dropout=0.2)  # d_out, N, d_model, d_ff, h
        # d_out=20, N=3, d_model=64, d_ff=64, dropout=0.1)   # d_out, N, d_model, d_ff, h

    if self.use_gpu:
        model = DataParallel(model, device_ids=self.gpus).to(self.device)

    # create optimizer
    params = filter(lambda p: p.requires_grad, model.parameters())
    if self.train_conf.optimizer == 'SGD':
        optimizer = optim.SGD(
            params,
            lr=self.train_conf.lr,
            momentum=self.train_conf.momentum,
            weight_decay=self.train_conf.wd)
    elif self.train_conf.optimizer == 'Adam':
        optimizer = optim.Adam(
            params,
            lr=self.train_conf.lr,
            weight_decay=self.train_conf.wd)
    else:
        raise ValueError("Non-supported optimizer!")

    early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
    lr_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=self.train_conf.lr_decay_epoch,
        gamma=self.train_conf.lr_decay)

    # reset gradient
    optimizer.zero_grad()

    # resume training
    resume_epoch = 0
    if self.train_conf.is_resume:
        model_file = os.path.join(self.train_conf.resume_dir, self.train_conf.resume_model)
        load_model(
            model.module if self.use_gpu else model,
            model_file,
            self.device,
            optimizer=optimizer,
            scheduler=lr_scheduler)
        resume_epoch = self.train_conf.resume_epoch

    # Training Loop
    iter_count = 0
    results = defaultdict(list)
    for epoch in range(resume_epoch, self.train_conf.max_epoch):
        model.train()
        lr_scheduler.step()
        train_iterator = iter(train_loader)

        for inner_iter in range(len(train_loader) // self.num_gpus):
            optimizer.zero_grad()

            batch_data = []
            if self.use_gpu:
                for _ in self.gpus:
                    data = next(train_iterator)
                    batch_data += [data]

            iter_count += 1
            avg_train_loss = .0
            for ff in range(self.dataset_conf.num_fwd_pass):
                batch_fwd = []

                if self.use_gpu:
                    for dd, gpu_id in enumerate(self.gpus):
                        data = batch_data[dd]
                        adj, lens = data['adj'], data['lens']
                        # this is only for grid
                        # adj = adj[:, :, :100, :100]
                        # lens = [min(99, x) for x in lens]
                        adj = adj.to('cuda:%d' % gpu_id)
                        # build masks
                        node_feat, attn_mask, lens = preprocess(adj, lens)
                        batch_fwd.append((node_feat, attn_mask.clone(), lens))

                if batch_fwd:
                    node_feat, attn_mask, lens = batch_fwd[0]
                    log_theta, log_alpha = model(*batch_fwd)
                    train_loss = model.module.mix_bern_loss(log_theta, log_alpha, adj, lens)
                    avg_train_loss += train_loss

                    # accumulate gradient over forward passes
                    train_loss.backward()

            # clip_grad_norm_(model.parameters(), 5.0e-0)
            optimizer.step()
            avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

            # reduce
            train_loss = float(avg_train_loss.data.cpu().numpy())
            self.writer.add_scalar('train_loss', train_loss, iter_count)
            results['train_loss'] += [train_loss]
            results['train_step'] += [iter_count]

            if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                logger.info("NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(
                    epoch + 1, iter_count, train_loss))

            if epoch % 50 == 0 and inner_iter == 0:
                model.eval()
                print('saving graphs')
                graphs_gen = [get_graph(adj[0].cpu().data.numpy())] + [
                    get_graph(aa.cpu().data.numpy())
                    for aa in model.module.sample(19, max_node=self.config.model.max_num_nodes)
                ]
                model.train()

                vis_graphs = []
                for gg in graphs_gen:
                    CGs = [gg.subgraph(c) for c in nx.connected_components(gg)]
                    CGs = sorted(CGs, key=lambda x: x.number_of_nodes(), reverse=True)
                    try:
                        vis_graphs += [CGs[0]]
                    except IndexError:
                        pass  # generated graph had no nodes

                try:
                    total = len(vis_graphs)  # min(3, len(vis_graphs))
                    draw_graph_list(
                        vis_graphs[:total],
                        4,
                        int(total // 4),
                        fname='sample/trans_sl:%d_%d.png' % (int(model.module.self_loop), epoch),
                        layout='spring')
                except Exception:
                    print('sample saving failed')

        # snapshot model
        if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
            logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
            snapshot(
                model.module if self.use_gpu else model,
                optimizer,
                self.config,
                epoch + 1,
                scheduler=lr_scheduler)

    pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
    self.writer.close()
    return 1
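
# model.module.mix_bern_loss is defined in model/transformer.py; as a
# hedged illustration (an assumption about its general form, not the
# actual implementation), a mixture-of-Bernoulli negative log-likelihood
# over adjacency entries typically combines per-component Bernoulli
# log-probabilities with mixture weights via a log-sum-exp:
import torch
import torch.nn.functional as F

def mix_bern_nll(log_theta, log_alpha, adj):
    """log_theta: (K, N) per-component Bernoulli logits for N entries;
    log_alpha: (K,) mixture logits; adj: (N,) binary targets."""
    # per-entry Bernoulli log-prob under each component
    log_p = -F.binary_cross_entropy_with_logits(
        log_theta, adj.expand_as(log_theta), reduction='none')
    # sum over entries, then mix over components in log space
    mix = torch.logsumexp(F.log_softmax(log_alpha, dim=0) + log_p.sum(dim=1), dim=0)
    return -mix

print(mix_bern_nll(torch.randn(3, 10), torch.randn(3), torch.randint(0, 2, (10,)).float()))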