def construct_model(self):
    """get data loader"""
    input_size, input_channels, n_classes, train_data = get_data(
        self.config.dataset, self.config.data_path, cutout_length=0, validation=False)
    n_train = len(train_data)
    split = n_train // 2
    indices = list(range(n_train))
    train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:split])
    valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[split:])
    self.train_loader = torch.utils.data.DataLoader(
        train_data, batch_size=self.config.batch_size, sampler=train_sampler,
        num_workers=self.config.workers, pin_memory=True)
    self.valid_loader = torch.utils.data.DataLoader(
        train_data, batch_size=self.config.batch_size, sampler=valid_sampler,
        num_workers=self.config.workers, pin_memory=True)

    """build model"""
    print("init model")
    self.criterion = nn.CrossEntropyLoss().to(self.device)
    model = SearchStageController(
        input_channels, self.config.init_channels, n_classes, self.config.layers,
        self.criterion, self.config.genotype, device_ids=self.config.gpus)
    self.model = model.to(self.device)
    print("init model end!")

    """build optimizer"""
    print("get optimizer")
    self.w_optim = torch.optim.SGD(
        self.model.weights(), self.config.w_lr,
        momentum=self.config.w_momentum, weight_decay=self.config.w_weight_decay)
    self.alpha_optim = torch.optim.Adam(
        self.model.alphas(), self.config.alpha_lr,
        betas=(0.5, 0.999), weight_decay=self.config.alpha_weight_decay)
    self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        self.w_optim, self.total_epochs, eta_min=self.config.w_lr_min)
    self.architect = Architect(self.model, self.config.w_momentum,
                               self.config.w_weight_decay)
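# `Architect` is instantiated above but not defined in this section. Below is a minimal
# first-order sketch, assuming only the call signature used by the trainers here
# (unrolled_backward(trn_X, trn_y, val_X, val_y, lr, w_optim)); the repo's actual
# Architect presumably implements the full second-order DARTS update with a virtual
# weight step and Hessian-vector products.
class FirstOrderArchitect:
    """Hypothetical stand-in: fills alpha gradients from the validation loss only."""

    def __init__(self, net, w_momentum, w_weight_decay):
        self.net = net
        self.w_momentum = w_momentum          # kept for interface parity; unused here
        self.w_weight_decay = w_weight_decay  # kept for interface parity; unused here

    def unrolled_backward(self, trn_X, trn_y, val_X, val_y, lr, w_optim):
        # First-order approximation: backpropagate d L_val / d alpha directly,
        # ignoring the one-step unrolled update of the network weights.
        val_loss = self.net.criterion(self.net(val_X), val_y)
        val_loss.backward()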
class SearchCellTrainer():
    def __init__(self, config):
        self.config = config

        self.world_size = 1
        self.gpu = self.config.local_rank
        self.save_epoch = 1
        self.ckpt_path = self.config.path

        """get the train parameters"""
        self.total_epochs = self.config.epochs
        self.train_batch_size = self.config.batch_size
        self.val_batch_size = self.config.batch_size
        self.global_batch_size = self.world_size * self.train_batch_size
        self.max_lr = self.config.w_lr * self.world_size

        """construct the whole network"""
        self.resume_path = self.config.resume_path
        if torch.cuda.is_available():
            # self.device = torch.device(f'cuda:{self.gpu}')
            # torch.cuda.set_device(self.device)
            torch.cuda.set_device(self.config.gpus[0])
            # cudnn.benchmark = True
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.construct_model()

        self.steps = 0
        self.log_step = 10
        self.logger = self.config.logger
        self.writer = SummaryWriter(log_dir=os.path.join(self.config.path, "tb"))
        self.writer.add_text('config', config.as_markdown(), 0)

    def construct_model(self):
        """get data loader"""
        input_size, input_channels, n_classes, train_data = get_data(
            self.config.dataset, self.config.data_path, cutout_length=0, validation=False)
        n_train = len(train_data)
        split = n_train // 2
        indices = list(range(n_train))
        train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:split])
        valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[split:])
        self.train_loader = torch.utils.data.DataLoader(
            train_data, batch_size=self.config.batch_size, sampler=train_sampler,
            num_workers=self.config.workers, pin_memory=True)
        self.valid_loader = torch.utils.data.DataLoader(
            train_data, batch_size=self.config.batch_size, sampler=valid_sampler,
            num_workers=self.config.workers, pin_memory=True)

        """build model"""
        print("init model")
        self.criterion = nn.CrossEntropyLoss().to(self.device)
        model = SearchCellController(
            input_channels, self.config.init_channels, n_classes, self.config.layers,
            self.criterion, device_ids=self.config.gpus)
        self.model = model.to(self.device)
        print("init model end!")

        """build optimizer"""
        print("get optimizer")
        self.w_optim = torch.optim.SGD(
            self.model.weights(), self.config.w_lr,
            momentum=self.config.w_momentum, weight_decay=self.config.w_weight_decay)
        self.alpha_optim = torch.optim.Adam(
            self.model.alphas(), self.config.alpha_lr,
            betas=(0.5, 0.999), weight_decay=self.config.alpha_weight_decay)
        self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            self.w_optim, self.total_epochs, eta_min=self.config.w_lr_min)
        self.architect = Architect(self.model, self.config.w_momentum,
                                   self.config.w_weight_decay)

    def resume_model(self, model_path=None):
        if model_path is None and not self.resume_path:
            self.start_epoch = 0
            self.logger.info("--> No loaded checkpoint!")
        else:
            model_path = model_path or self.resume_path
            checkpoint = torch.load(model_path, map_location=self.device)
            self.start_epoch = checkpoint['epoch']
            self.steps = checkpoint['steps']
            self.model.load_state_dict(checkpoint['model'], strict=True)
            self.w_optim.load_state_dict(checkpoint['w_optim'])
            self.alpha_optim.load_state_dict(checkpoint['alpha_optim'])
            self.logger.info(
                f"--> Loaded checkpoint '{model_path}' (epoch {self.start_epoch})")

    def save_checkpoint(self, epoch, is_best=False):
        if epoch % self.save_epoch == 0:
            state = {
                'config': self.config,
                'epoch': epoch,
                'steps': self.steps,
                'model': self.model.state_dict(),
                'w_optim': self.w_optim.state_dict(),
                'alpha_optim': self.alpha_optim.state_dict()
            }
            if is_best:
                best_filename = os.path.join(self.ckpt_path, 'best.pth.tar')
                torch.save(state, best_filename)

    def train_epoch(self, epoch, printer=print):
        top1 = AverageMeter()
        top5 = AverageMeter()
        losses = AverageMeter()

        cur_lr = self.lr_scheduler.get_last_lr()[0]

        self.model.print_alphas(self.logger)
        self.model.train()

        prefetcher_trn = data_prefetcher(self.train_loader)
        prefetcher_val = data_prefetcher(self.valid_loader)
        trn_X, trn_y = prefetcher_trn.next()
        val_X, val_y = prefetcher_val.next()
        i = 0
        while trn_X is not None:
            i += 1
            N = trn_X.size(0)
            self.steps += 1

            # architect step (alpha)
            self.alpha_optim.zero_grad()
            self.architect.unrolled_backward(trn_X, trn_y, val_X, val_y, cur_lr,
                                             self.w_optim)
            self.alpha_optim.step()

            # child network step (w)
            self.w_optim.zero_grad()
            logits = self.model(trn_X)
            loss = self.model.criterion(logits, trn_y)
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.weights(), self.config.w_grad_clip)
            self.w_optim.step()

            prec1, prec5 = accuracy(logits, trn_y, topk=(1, 5))
            losses.update(loss.item(), N)
            top1.update(prec1.item(), N)
            top5.update(prec5.item(), N)

            if self.steps % self.log_step == 0:
                self.writer.add_scalar('train/lr', round(cur_lr, 5), self.steps)
                self.writer.add_scalar('train/loss', loss.item(), self.steps)
                self.writer.add_scalar('train/top1', prec1.item(), self.steps)
                self.writer.add_scalar('train/top5', prec5.item(), self.steps)

            if i % self.config.print_freq == 0 or i == len(self.train_loader) - 1:
                printer(f'Train: Epoch: [{epoch}][{i}/{len(self.train_loader) - 1}]\t'
                        f'Step {self.steps}\t'
                        f'lr {round(cur_lr, 5)}\t'
                        f'Loss {losses.val:.4f} ({losses.avg:.4f})\t'
                        f'Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})\t')

            trn_X, trn_y = prefetcher_trn.next()
            val_X, val_y = prefetcher_val.next()

        printer("Train: [{:3d}/{}] Final Prec@1 {:.4%}".format(
            epoch, self.total_epochs - 1, top1.avg))

    def val_epoch(self, epoch, printer):
        top1 = AverageMeter()
        top5 = AverageMeter()
        losses = AverageMeter()

        self.model.eval()
        prefetcher = data_prefetcher(self.valid_loader)
        X, y = prefetcher.next()
        i = 0
        with torch.no_grad():
            while X is not None:
                N = X.size(0)
                i += 1

                logits = self.model(X)
                loss = self.criterion(logits, y)

                prec1, prec5 = accuracy(logits, y, topk=(1, 5))
                losses.update(loss.item(), N)
                top1.update(prec1.item(), N)
                top5.update(prec5.item(), N)

                if i % self.config.print_freq == 0 or i == len(self.valid_loader) - 1:
                    printer(f'Valid: Epoch: [{epoch}][{i}/{len(self.valid_loader)}]\t'
                            f'Step {self.steps}\t'
                            f'Loss {losses.avg:.4f}\t'
                            f'Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})')

                X, y = prefetcher.next()

        self.writer.add_scalar('val/loss', losses.avg, self.steps)
        self.writer.add_scalar('val/top1', top1.avg, self.steps)
        self.writer.add_scalar('val/top5', top5.avg, self.steps)

        printer("Valid: [{:3d}/{}] Final Prec@1 {:.4%}".format(
            epoch, self.total_epochs - 1, top1.avg))

        return top1.avg
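# A minimal driver sketch for SearchCellTrainer, assuming a `config` object exposing the
# fields referenced above (dataset, batch_size, epochs, gpus, logger, resume_path, ...);
# the repo's real entry point may differ.
def run_search(config):
    trainer = SearchCellTrainer(config)
    trainer.resume_model()                      # sets trainer.start_epoch
    best_top1 = 0.0
    for epoch in range(trainer.start_epoch, trainer.total_epochs):
        trainer.train_epoch(epoch, printer=trainer.logger.info)
        trainer.lr_scheduler.step()
        top1 = trainer.val_epoch(epoch, printer=trainer.logger.info)
        trainer.save_checkpoint(epoch, is_best=top1 > best_top1)
        best_top1 = max(best_top1, top1)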
def start(args):
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    cudnn.benchmark = True
    cudnn.enabled = True
    logging.info("args = %s", args)

    dataset = LoadData(args.data_name)
    if args.data_name == 'SBM_PATTERN':
        in_dim = 3
        num_classes = 2
    elif args.data_name == 'SBM_CLUSTER':
        in_dim = 7
        num_classes = 6
    print(f"input dimension: {in_dim}, number classes: {num_classes}")

    criterion = MyCriterion(num_classes)
    criterion = criterion.cuda()
    model = Network(args.layers, args.nodes, in_dim, args.feature_dim,
                    num_classes, criterion, args.data_type, args.readout)
    model = model.cuda()
    logging.info("param size = %fMB", count_parameters_in_MB(model))

    train_data, val_data, test_data = dataset.train, dataset.val, dataset.test
    num_train = len(train_data)
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))
    print(f"train set full size : {num_train}; split train set size : {split}")

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=args.workers, collate_fn=dataset.collate)
    valid_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True, num_workers=args.workers, collate_fn=dataset.collate)
    true_valid_queue = torch.utils.data.DataLoader(
        val_data, batch_size=args.batch_size,
        pin_memory=True, num_workers=args.workers, collate_fn=dataset.collate)
    test_queue = torch.utils.data.DataLoader(
        test_data, batch_size=args.batch_size,
        pin_memory=True, num_workers=args.workers, collate_fn=dataset.collate)

    optimizer = torch.optim.SGD(model.parameters(), args.learning_rate,
                                momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)
    architect = Architect(model, args)

    # viz = Visdom(env = '{} {}'.format(args.data_name, time.asctime(time.localtime(time.time()))))
    viz = None
    save_file = open(args.save_result, "w")

    for epoch in range(args.epochs):
        scheduler.step()
        lr = scheduler.get_lr()[0]
        logging.info('[LR]\t%f', lr)

        if epoch % args.save_freq == 0:
            print(model.show_genotypes())
            save_file.write(f"Epoch : {epoch}\n{model.show_genotypes()}\n")
            for i in range(args.layers):
                logging.info('layer = %d', i)
                genotype = model.show_genotype(i)
                logging.info('genotype = %s', genotype)
            '''
            w1, w2, w3 = model.show_weights(0)
            print('[1] weights in first cell\n', w1)
            print('[2] weights in middle cell\n', w2)
            print('[3] weights in last cell\n', w3)
            '''

        # training
        macro_acc, micro_acc, loss = train(train_queue, valid_queue, model, architect,
                                           criterion, optimizer, lr, epoch, viz)
        # true validation
        macro_acc, micro_acc, loss = infer(true_valid_queue, model, criterion,
                                           stage='validating')
        # testing
        macro_acc, micro_acc, loss = infer(test_queue, model, criterion,
                                           stage=' testing ')
class Exp_M_Informer(Exp_Basic):
    def __init__(self, args):
        super(Exp_M_Informer, self).__init__(args)

    def _build_model(self):
        model_dict = {
            'informer': Informer,
            'informerstack': InformerStack,
        }
        if self.args.model == 'informer' or self.args.model == 'informerstack':
            e_layers = self.args.e_layers if self.args.model == 'informer' else self.args.s_layers
            model = model_dict[self.args.model](
                self.args.enc_in,
                self.args.dec_in,
                self.args.c_out,
                self.args.seq_len,
                self.args.label_len,
                self.args.pred_len,
                self.args.factor,
                self.args.d_model,
                self.args.n_heads,
                e_layers,  # self.args.e_layers,
                self.args.d_layers,
                self.args.d_ff,
                self.args.dropout,
                self.args.attn,
                self.args.embed,
                self.args.freq,
                self.args.activation,
                self.args.output_attention,
                self.args.distil,
                self.args.mix,
                self.device,
                self.args).float()
        else:
            raise NotImplementedError

        self.arch = Architect(model, self.device, self.args, self._select_criterion())
        return model

    def _get_data(self, flag):
        args = self.args
        data_dict = {
            'ETTh1': Dataset_ETT_hour,
            'ETTh2': Dataset_ETT_hour,
            'ETTm1': Dataset_ETT_minute,
            'ETTm2': Dataset_ETT_minute,
            'WTH': Dataset_Custom,
            'ECL': Dataset_Custom,
            'Solar': Dataset_Custom,
            'custom': Dataset_Custom,
        }
        Data = data_dict[self.args.data]
        timeenc = 0 if args.embed != 'timeF' else 1

        if flag == 'test':
            shuffle_flag = False
            drop_last = True
            batch_size = args.batch_size
            freq = args.freq
        elif flag == 'pred':
            shuffle_flag = False
            drop_last = False
            batch_size = 1
            freq = args.detail_freq
            Data = Dataset_Pred
        else:
            shuffle_flag = True
            drop_last = True
            batch_size = args.batch_size
            freq = args.freq

        data_set = Data(root_path=args.root_path, data_path=args.data_path, flag=flag,
                        size=[args.seq_len, args.label_len, args.pred_len],
                        features=args.features, target=args.target, inverse=args.inverse,
                        timeenc=timeenc, freq=freq, cols=args.cols)
        data_loader = DataLoader(data_set, batch_size=batch_size, shuffle=shuffle_flag,
                                 num_workers=args.num_workers, drop_last=drop_last)
        return data_set, data_loader

    def _select_optimizer(self):
        W_optim = optim.Adam(self.model.W(), lr=self.args.learning_rate)
        A_optim = optim.Adam(self.model.A(), self.args.A_lr, betas=(0.5, 0.999),
                             weight_decay=self.args.A_weight_decay)
        return W_optim, A_optim

    def _select_criterion(self):
        criterion = nn.MSELoss()
        return criterion

    def vali(self, vali_data, vali_loader, criterion):
        self.model.eval()
        total_loss = []
        for i, val_d in enumerate(vali_loader):
            pred, true = self._process_one_batch(vali_data, val_d)
            loss = criterion(pred.detach().cpu(), true.detach().cpu())
            total_loss.append(loss)
        total_loss = np.average(total_loss)
        self.model.train()
        return total_loss

    def train(self, ii, logger):
        train_data, train_loader = self._get_data(flag='train')
        vali_data, vali_loader = self._get_data(flag='val')
        next_data, next_loader = self._get_data(flag='train')
        test_data, test_loader = self._get_data(flag='test')
        if self.args.rank == 1:
            train_data, train_loader = self._get_data(flag='train')

        path = os.path.join(self.args.path, str(ii))
        try:
            os.mkdir(path)
        except FileExistsError:
            pass

        time_now = time.time()
        train_steps = len(train_loader)
        early_stopping = EarlyStopping(patience=self.args.patience, verbose=True,
                                       rank=self.args.rank)
        W_optim, A_optim = self._select_optimizer()
        criterion = self._select_criterion()

        if self.args.use_amp:
            scaler = torch.cuda.amp.GradScaler()

        for epoch in range(self.args.train_epochs):
            iter_count = 0
            train_loss = []
            rate_counter = AverageMeter()
            Ag_counter, A_counter, Wg_counter, W_counter = (
                AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter())

            self.model.train()
            epoch_time = time.time()
            for i, (trn_data, val_data, next_data) in enumerate(
                    zip(train_loader, vali_loader, next_loader)):
                # move every tensor of the three batches to the device; the inner index
                # is named `j` so it does not clobber the batch index `i` used for logging
                for j in range(len(trn_data)):
                    trn_data[j] = trn_data[j].float().to(self.device)
                    val_data[j] = val_data[j].float().to(self.device)
                    next_data[j] = next_data[j].float().to(self.device)
                iter_count += 1

                # architecture step (A)
                A_optim.zero_grad()
                rate = self.arch.unrolled_backward(
                    self.args, trn_data, val_data, next_data,
                    W_optim.param_groups[0]['lr'], W_optim)
                rate_counter.update(rate)

                # for r in range(1, self.args.world_size):
                #     for n, h in self.model.named_H():
                #         if "proj.{}".format(r) in n:
                #             if self.args.rank <= r:
                #                 with torch.no_grad():
                #                     dist.all_reduce(h.grad)
                #                     h.grad *= self.args.world_size / r + 1
                #             else:
                #                 z = torch.zeros(h.shape).to(self.device)
                #                 dist.all_reduce(z)

                for a in self.model.A():
                    with torch.no_grad():
                        dist.all_reduce(a.grad)

                a_g_norm = 0
                a_norm = 0
                n = 0
                for a in self.model.A():
                    a_g_norm += a.grad.mean()
                    a_norm += a.mean()
                    n += 1
                Ag_counter.update(a_g_norm / n)
                A_counter.update(a_norm / n)
                A_optim.step()

                # weight step (W)
                W_optim.zero_grad()
                pred, true = self._process_one_batch(train_data, trn_data)
                loss = criterion(pred, true)
                train_loss.append(loss.item())

                if (i + 1) % 100 == 0:
                    logger.info("\tR{0} iters: {1}, epoch: {2} | loss: {3:.7f}".format(
                        self.args.rank, i + 1, epoch + 1, loss.item()))
                    speed = (time.time() - time_now) / iter_count
                    left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                    logger.info('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(
                        speed, left_time))
                    iter_count = 0
                    time_now = time.time()

                if self.args.use_amp:
                    scaler.scale(loss).backward()
                    scaler.step(W_optim)
                    scaler.update()
                else:
                    loss.backward()
                    w_g_norm = 0
                    w_norm = 0
                    n = 0
                    for w in self.model.W():
                        w_g_norm += w.grad.mean()
                        w_norm += w.mean()
                        n += 1
                    Wg_counter.update(w_g_norm / n)
                    W_counter.update(w_norm / n)
                    W_optim.step()

            logger.info("R{} Epoch: {} W:{} Wg:{} A:{} Ag:{} rate{}".format(
                self.args.rank, epoch + 1, W_counter.avg, Wg_counter.avg,
                A_counter.avg, Ag_counter.avg, rate_counter.avg))
            logger.info("R{} Epoch: {} cost time: {}".format(
                self.args.rank, epoch + 1, time.time() - epoch_time))

            train_loss = np.average(train_loss)
            vali_loss = self.vali(vali_data, vali_loader, criterion)
            test_loss = self.vali(test_data, test_loader, criterion)

            logger.info(
                "R{0} Epoch: {1}, Steps: {2} | Train Loss: {3:.7f} Vali Loss: {4:.7f} Test Loss: {5:.7f}"
                .format(self.args.rank, epoch + 1, train_steps, train_loss,
                        vali_loss, test_loss))

            early_stopping(vali_loss, self.model, path)
            flag = torch.tensor([1]) if early_stopping.early_stop else torch.tensor([0])
            flag = flag.to(self.device)
            flags = [torch.tensor([1]).to(self.device),
                     torch.tensor([1]).to(self.device)]
            dist.all_gather(flags, flag)
            if flags[0].item() == 1 and flags[1].item() == 1:
                logger.info("Early stopping")
                break

            adjust_learning_rate(W_optim, epoch + 1, self.args)

        best_model_path = path + '/' + '{}_checkpoint.pth'.format(self.args.rank)
        self.model.load_state_dict(torch.load(best_model_path))
        return self.model

    def test(self, setting, logger):
        test_data, test_loader = self._get_data(flag='test')
        self.model.eval()

        preds = []
        trues = []
        for i, test_d in enumerate(test_loader):
            pred, true = self._process_one_batch(test_data, test_d)
            preds.append(pred.detach().cpu().numpy())
            trues.append(true.detach().cpu().numpy())

        preds = np.array(preds)
        trues = np.array(trues)
        logger.info('test shape: {} {}'.format(preds.shape, trues.shape))
        preds = preds.reshape((-1, preds.shape[-2], preds.shape[-1]))
        trues = trues.reshape((-1, trues.shape[-2], trues.shape[-1]))
        logger.info('test shape: {} {}'.format(preds.shape, trues.shape))

        # result save
        folder_path = './results/' + setting + '/'
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)

        mae, mse, rmse, mape, mspe = metric(preds, trues)
        logger.info('R{} mse:{}, mae:{}'.format(self.args.rank, mse, mae))

        np.save(folder_path + 'metrics.npy', np.array([mae, mse, rmse, mape, mspe]))
        np.save(folder_path + 'pred.npy', preds)
        np.save(folder_path + 'true.npy', trues)
        return

    def predict(self, setting, load=False):
        pred_data, pred_loader = self._get_data(flag='pred')

        if load:
            path = os.path.join(self.args.checkpoints, setting)
            best_model_path = path + '/' + 'checkpoint.pth'
            self.model.load_state_dict(torch.load(best_model_path))

        self.model.eval()

        preds = []
        for i, pred_d in enumerate(pred_loader):
            pred, true = self._process_one_batch(pred_data, pred_d)
            preds.append(pred.detach().cpu().numpy())

        preds = np.array(preds)
        preds = preds.reshape((-1, preds.shape[-2], preds.shape[-1]))

        # result save
        folder_path = './results/' + setting + '/'
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)

        np.save(folder_path + 'real_prediction.npy', preds)
        return

    def _process_one_batch(self, dataset_object, data):
        batch_x = data[0].float().to(self.device)
        batch_y = data[1].float().to(self.device)
        batch_x_mark = data[2].float().to(self.device)
        batch_y_mark = data[3].float().to(self.device)

        # decoder input
        if self.args.padding == 0:
            dec_inp = torch.zeros(
                [batch_y.shape[0], self.args.pred_len, batch_y.shape[-1]]).float().to(self.device)
        elif self.args.padding == 1:
            dec_inp = torch.ones(
                [batch_y.shape[0], self.args.pred_len, batch_y.shape[-1]]).float().to(self.device)
        dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp],
                            dim=1).float().to(self.device)

        # encoder - decoder
        if self.args.use_amp:
            with torch.cuda.amp.autocast():
                if self.args.output_attention:
                    outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0]
                else:
                    outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0]
        else:
            if self.args.output_attention:
                outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0]
            else:
                outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0]

        if self.args.inverse:
            outputs = dataset_object.inverse_transform(outputs)
        f_dim = -1 if self.args.features == 'MS' else 0
        batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)
        return outputs, batch_y
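# A minimal usage sketch for Exp_M_Informer, assuming an `args` namespace carrying the
# fields referenced above (model='informer', data, rank, path, train_epochs, ...) and
# that torch.distributed has already been initialized by the caller (train() calls
# dist.all_reduce / dist.all_gather); `setting` and `n_runs` are illustrative names.
def run_experiment(args, logger, setting='informer_search', n_runs=1):
    for ii in range(n_runs):
        exp = Exp_M_Informer(args)   # builds the supernet and its Architect
        exp.train(ii, logger)        # alternating A-step / W-step search loop
        exp.test(setting, logger)    # writes metrics.npy / pred.npy / true.npy
        torch.cuda.empty_cache()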
class Searcher(object):

    def __init__(self, args):
        self.args = args
        self.console = Console()

        self.console.log('=> [1] Initial settings')
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
        cudnn.benchmark = True
        cudnn.enabled = True

        self.console.log('=> [2] Initial models')
        self.metric = load_metric(args)
        self.loss_fn = get_loss_fn(args).cuda()
        self.model = Model_Search(args, get_trans_input(args), self.loss_fn).cuda()
        self.console.log(f'=> Supernet Parameters: {count_parameters_in_MB(self.model)}',
                         style='bold red')

        self.console.log(f'=> [3] Preparing dataset')
        self.dataset = load_data(args)
        if args.pos_encode > 0:
            #! add positional encoding
            self.console.log(f'==> [3.1] Adding positional encodings')
            self.dataset._add_positional_encodings(args.pos_encode)
        self.search_data = self.dataset.train
        self.val_data = self.dataset.val
        self.test_data = self.dataset.test
        self.load_dataloader()

        self.console.log(f'=> [4] Initial optimizer')
        self.optimizer = torch.optim.SGD(
            params=self.model.parameters(), lr=args.lr,
            momentum=args.momentum, weight_decay=args.weight_decay)
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer=self.optimizer, T_max=float(args.epochs), eta_min=args.lr_min)
        self.architect = Architect(self.model, self.args)

    def load_dataloader(self):
        num_search = int(len(self.search_data) * self.args.data_clip)
        indices = list(range(num_search))
        split = int(np.floor(self.args.portion * num_search))
        self.console.log(f'=> Para set size: {split}, Arch set size: {num_search - split}')

        self.para_queue = torch.utils.data.DataLoader(
            dataset=self.search_data, batch_size=self.args.batch,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
            pin_memory=True, num_workers=self.args.nb_workers,
            collate_fn=self.dataset.collate)
        self.arch_queue = torch.utils.data.DataLoader(
            dataset=self.search_data, batch_size=self.args.batch,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:]),
            pin_memory=True, num_workers=self.args.nb_workers,
            collate_fn=self.dataset.collate)

        num_valid = int(len(self.val_data) * self.args.data_clip)
        indices = list(range(num_valid))
        self.val_queue = torch.utils.data.DataLoader(
            dataset=self.val_data, batch_size=self.args.batch,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(indices),
            pin_memory=True, num_workers=self.args.nb_workers,
            collate_fn=self.dataset.collate)

        num_test = int(len(self.test_data) * self.args.data_clip)
        indices = list(range(num_test))
        self.test_queue = torch.utils.data.DataLoader(
            dataset=self.test_data, batch_size=self.args.batch,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(indices),
            pin_memory=True, num_workers=self.args.nb_workers,
            collate_fn=self.dataset.collate)

    def run(self):
        self.console.log(f'=> [4] Search & Train')
        for i_epoch in range(self.args.epochs):
            self.scheduler.step()
            self.lr = self.scheduler.get_lr()[0]

            if i_epoch % self.args.report_freq == 0:
                geno = genotypes(args=self.args,
                                 arch_paras=self.model.group_arch_parameters(),
                                 arch_topos=self.model.cell_arch_topo)
                with open(f'{self.args.arch_save}/{self.args.data}/{i_epoch}.yaml', "w") as f:
                    yaml.dump(geno, f)
                # => report genotype
                self.console.log(geno)
                for i in range(self.args.nb_layers):
                    for p in self.model.group_arch_parameters()[i]:
                        self.console.log(p.softmax(0).detach().cpu().numpy())

            search_result = self.search()
            self.console.log(
                f"[green]=> search result [{i_epoch}] - loss: {search_result['loss']:.4f} - metric : {search_result['metric']:.4f}")
            # DecayScheduler().step(i_epoch)

            with torch.no_grad():
                val_result = self.infer(self.val_queue)
                self.console.log(
                    f"[yellow]=> valid result [{i_epoch}] - loss: {val_result['loss']:.4f} - metric : {val_result['metric']:.4f}")
                test_result = self.infer(self.test_queue)
                self.console.log(
                    f"[red]=> test result [{i_epoch}] - loss: {test_result['loss']:.4f} - metric : {test_result['metric']:.4f}")

    def search(self):
        self.model.train()
        epoch_loss = 0
        epoch_metric = 0
        desc = '=> searching'
        device = torch.device('cuda')

        with tqdm(self.para_queue, desc=desc, leave=False) as t:
            for i_step, (batch_graphs, batch_targets) in enumerate(t):
                #! 1. preparing training datasets
                G = batch_graphs.to(device)
                V = batch_graphs.ndata['feat'].to(device)
                # E = batch_graphs.edata['feat'].to(device)
                batch_targets = batch_targets.to(device)

                #! 2. preparing validating datasets
                batch_graphs_search, batch_targets_search = next(iter(self.arch_queue))
                GS = batch_graphs_search.to(device)
                VS = batch_graphs_search.ndata['feat'].to(device)
                # ES = batch_graphs_search.edata['feat'].to(device)
                batch_targets_search = batch_targets_search.to(device)

                #! 3. optimizing architecture topology parameters
                self.architect.step(input_train={'G': G, 'V': V},
                                    target_train=batch_targets,
                                    input_valid={'G': GS, 'V': VS},
                                    target_valid=batch_targets_search,
                                    eta=self.lr,
                                    network_optimizer=self.optimizer,
                                    unrolled=self.args.unrolled)

                #! 4. optimizing model parameters
                self.optimizer.zero_grad()
                batch_scores = self.model({'G': G, 'V': V})
                loss = self.loss_fn(batch_scores, batch_targets)
                loss.backward()
                self.optimizer.step()

                epoch_loss += loss.detach().item()
                epoch_metric += self.metric(batch_scores, batch_targets)
                t.set_postfix(lr=self.lr,
                              loss=epoch_loss / (i_step + 1),
                              metric=epoch_metric / (i_step + 1))

        return {'loss': epoch_loss / (i_step + 1),
                'metric': epoch_metric / (i_step + 1)}

    def infer(self, dataloader):
        self.model.eval()
        epoch_loss = 0
        epoch_metric = 0
        desc = '=> inferring'
        device = torch.device('cuda')

        with tqdm(dataloader, desc=desc, leave=False) as t:
            for i_step, (batch_graphs, batch_targets) in enumerate(t):
                G = batch_graphs.to(device)
                V = batch_graphs.ndata['feat'].to(device)
                # E = batch_graphs.edata['feat'].to(device)
                batch_targets = batch_targets.to(device)

                batch_scores = self.model({'G': G, 'V': V})
                loss = self.loss_fn(batch_scores, batch_targets)
                epoch_loss += loss.detach().item()
                epoch_metric += self.metric(batch_scores, batch_targets)
                t.set_postfix(loss=epoch_loss / (i_step + 1),
                              metric=epoch_metric / (i_step + 1))

        return {'loss': epoch_loss / (i_step + 1),
                'metric': epoch_metric / (i_step + 1)}
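# A minimal entry-point sketch for the Searcher above; `parse_args` is a hypothetical
# argument parser that provides the fields referenced in __init__ (seed, lr, epochs,
# batch, nb_workers, arch_save, ...).
if __name__ == '__main__':
    args = parse_args()
    searcher = Searcher(args)
    searcher.run()   # alternates architecture search, validation, and testing per epoch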
class SearchDistributionTrainer(SearchStageTrainer):
    def __init__(self, config):
        super().__init__(config)

    def construct_model(self):
        """get data loader"""
        input_size, input_channels, n_classes, train_data = get_data(
            self.config.dataset, self.config.data_path, cutout_length=0, validation=False)
        n_train = len(train_data)
        split = n_train // 2
        indices = list(range(n_train))
        train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:split])
        valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[split:])
        self.train_loader = torch.utils.data.DataLoader(
            train_data, batch_size=self.config.batch_size, sampler=train_sampler,
            num_workers=self.config.workers, pin_memory=True)
        self.valid_loader = torch.utils.data.DataLoader(
            train_data, batch_size=self.config.batch_size, sampler=valid_sampler,
            num_workers=self.config.workers, pin_memory=True)

        """build model"""
        print("init model")
        self.criterion = nn.CrossEntropyLoss().to(self.device)
        model = SearchDistributionController(
            input_channels, self.config.init_channels, n_classes, self.config.layers,
            self.criterion, self.config.genotype, device_ids=self.config.gpus)
        self.model = model.to(self.device)
        print("init model end!")

        """build optimizer"""
        print("get optimizer")
        self.w_optim = torch.optim.SGD(
            self.model.weights(), self.config.w_lr,
            momentum=self.config.w_momentum, weight_decay=self.config.w_weight_decay)
        self.alpha_optim = torch.optim.Adam(
            self.model.alphas(), self.config.alpha_lr,
            betas=(0.5, 0.999), weight_decay=self.config.alpha_weight_decay)
        self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            self.w_optim, self.total_epochs, eta_min=self.config.w_lr_min)
        self.architect = Architect(self.model, self.config.w_momentum,
                                   self.config.w_weight_decay)

    def cal_depth(self, alpha, n_nodes, SW, beta):
        assert len(alpha) == n_nodes, "the length of alpha must be the same as n_nodes"
        d = [0, 0]
        for i, edges in enumerate(alpha):
            edge_max, _ = torch.topk(edges[:, :-1], 1)
            edge_max = F.softmax(edge_max, dim=0)
            if i < SW - 2:
                dd = 0
                for j in range(i + 2):
                    dd += edge_max[j][0] * (d[j] + 1)
                dd /= (i + 2)
            else:
                dd = 0
                for s, j in enumerate(range(i - 1, i + 2)):
                    dd += edge_max[s][0] * (d[j] + 1)
                dd /= SW
            if i >= 3:
                dd *= (1 + i * beta[i - 3])[0]
            d.append(dd)
        return sum(d) / n_nodes

    def concat_param_loss(self, beta):
        loss = sum([beta[i][j] * (j + 4) for i in range(3) for j in range(5)])
        return loss

    def train_epoch(self, epoch, printer):
        top1 = AverageMeter()
        top5 = AverageMeter()
        losses = AverageMeter()

        cur_lr = self.lr_scheduler.get_last_lr()[0]

        self.model.print_alphas(self.logger)
        self.model.train()

        prefetcher_trn = data_prefetcher(self.train_loader)
        prefetcher_val = data_prefetcher(self.valid_loader)
        trn_X, trn_y = prefetcher_trn.next()
        val_X, val_y = prefetcher_val.next()
        i = 0
        while trn_X is not None:
            i += 1
            N = trn_X.size(0)
            self.steps += 1

            # architect step (alpha)
            self.alpha_optim.zero_grad()
            self.architect.unrolled_backward(trn_X, trn_y, val_X, val_y, cur_lr,
                                             self.w_optim)
            self.alpha_optim.step()

            # depth / concat regularization step (alpha)
            self.alpha_optim.zero_grad()
            alpha = self.architect.net.alpha_DAG
            beta = [F.softmax(be, dim=0) for be in self.architect.net.alpha_concat]
            self.n_nodes = self.config.layers // 3
            d_depth1 = self.cal_depth(alpha[0 * self.n_nodes:1 * self.n_nodes],
                                      self.n_nodes, 3, beta[0])
            d_depth2 = self.cal_depth(alpha[1 * self.n_nodes:2 * self.n_nodes],
                                      self.n_nodes, 3, beta[1])
            d_depth3 = self.cal_depth(alpha[2 * self.n_nodes:3 * self.n_nodes],
                                      self.n_nodes, 3, beta[2])
            depth_loss = -1 * (d_depth1 + d_depth2 + d_depth3)
            param_loss = self.concat_param_loss(beta)
            new_loss = depth_loss + 0.4 * param_loss
            new_loss.backward()
            self.alpha_optim.step()

            # child network step (w)
            self.w_optim.zero_grad()
            logits = self.model(trn_X)
            loss = self.model.criterion(logits, trn_y)
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.weights(), self.config.w_grad_clip)
            self.w_optim.step()

            prec1, prec5 = accuracy(logits, trn_y, topk=(1, 5))
            losses.update(loss.item(), N)
            top1.update(prec1.item(), N)
            top5.update(prec5.item(), N)

            if self.steps % self.log_step == 0:
                self.writer.add_scalar('train/lr', round(cur_lr, 5), self.steps)
                self.writer.add_scalar('train/loss', loss.item(), self.steps)
                self.writer.add_scalar('train/top1', prec1.item(), self.steps)
                self.writer.add_scalar('train/top5', prec5.item(), self.steps)

            if i % self.config.print_freq == 0 or i == len(self.train_loader) - 1:
                printer(f'Train: Epoch: [{epoch}][{i}/{len(self.train_loader) - 1}]\t'
                        f'Step {self.steps}\t'
                        f'lr {round(cur_lr, 5)}\t'
                        f'Loss {losses.val:.4f} ({losses.avg:.4f})\t'
                        f'Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})\t')

            trn_X, trn_y = prefetcher_trn.next()
            val_X, val_y = prefetcher_val.next()

        printer("Train: [{:3d}/{}] Final Prec@1 {:.4%}".format(
            epoch, self.total_epochs - 1, top1.avg))
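# A small sanity check of concat_param_loss, assuming each beta[i] is a softmax over the
# five concat choices (4..8 nodes); the loss is then the expected number of concatenated
# nodes summed over the three stages.
import torch
import torch.nn.functional as F

beta = [F.softmax(torch.zeros(5), dim=0) for _ in range(3)]      # uniform distributions
loss = sum(beta[i][j] * (j + 4) for i in range(3) for j in range(5))
print(loss.item())   # 18.0 == 3 stages * mean(4, 5, 6, 7, 8)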