def on_epoch_end(self, epoch, logs=None):
    logs = logs or {}
    self.epochs_since_last_save += 1
    if self.epochs_since_last_save >= self.period:
        self.epochs_since_last_save = 0
        if self.save_best_only:
            current = logs.get(self.monitor)
            if current is None:
                # logging.warning takes format args, not a warning category
                logging.warning('Can save best model only with %s available, '
                                'skipping.', self.monitor)
            else:
                if self.monitor_op(current, self.best):
                    if self.verbose > 0:
                        print('Epoch %05d: %s improved from %0.5f to %0.5f,'
                              ' saving model to %s'
                              % (epoch, self.monitor, self.best, current,
                                 self.filepath))
                    self.best = current
                    save_model(self.model, self.optimizer, self.filepath)
                else:
                    if self.verbose > 0:
                        print('Epoch %05d: %s did not improve'
                              % (epoch, self.monitor))
        else:
            if self.verbose > 0:
                print('Epoch %05d: saving model to %s'
                      % (epoch, self.filepath))
            save_model(self.model, self.optimizer, self.filepath)
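
# For context: the `monitor_op`/`best` pair used above follows the Keras
# ModelCheckpoint convention. A minimal standalone sketch of that
# initialization (the `mode` argument and helper name are assumptions, not
# this repo's code):
import numpy as np

def init_monitor(mode):
    """Return (monitor_op, best) for a monitored metric, per the usual convention."""
    if mode == 'min':
        return np.less, np.inf      # e.g. val_loss: improvement means a decrease
    return np.greater, -np.inf      # e.g. val_acc: improvement means an increase
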
def train_loop(self, train_loader, valid_loader, logging, writer=None):
    best_error = float('inf')
    train_error_metric = train_obj = train_main_obj = train_ece = train_kl = None
    for epoch in range(self.args.epochs):
        if epoch >= 1 and self.scheduler is not None:
            self.scheduler.step()
        if self.scheduler is not None:
            lr = self.scheduler.get_last_lr()[0]
        else:
            lr = self.args.learning_rate
        if writer is not None:
            writer.add_scalar('Train/learning_rate', lr, epoch)
            writer.add_scalar('Train/gamma', self.gamma_scheduler[epoch], epoch)
        # [current/total]: pass the current epoch first
        logging.info('### Epoch: [%d/%d], Learning rate: %e, Gamma: %e ###',
                     epoch, self.args.epochs, lr, self.gamma_scheduler[epoch])

        train_obj, train_main_obj, train_kl, train_error_metric, train_ece = self.train(
            epoch, train_loader, self.optimizer, logging, writer)
        logging.info(
            '#### Train | Error: %f, Train loss: %f, Train main objective: %f, Train KL: %f, Train ECE %f ####',
            train_error_metric, train_obj, train_main_obj, train_kl, train_ece)
        if writer is not None:
            self._scalar_logging(train_obj, train_main_obj, train_kl,
                                 train_error_metric, train_ece, "Train/",
                                 epoch, writer)

        # validation
        val_obj, val_main_obj, val_kl, val_error_metric, val_ece = self.infer(
            epoch, valid_loader, logging, writer, "Valid")
        logging.info(
            '#### Valid | Error: %f, Valid loss: %f, Valid main objective: %f, Valid KL: %f, Valid ECE %f ####',
            val_error_metric, val_obj, val_main_obj, val_kl, val_ece)
        if writer is not None:
            self._scalar_logging(val_obj, val_main_obj, val_kl,
                                 val_error_metric, val_ece, "Valid/",
                                 epoch, writer)

        if val_error_metric <= best_error or self.args.save_last:
            special_infor = ""
            # Avoid correlation between the samples
            if (hasattr(self.args, 'burnin_epochs')
                    and epoch >= self.args.burnin_epochs and epoch % 2 == 0):
                special_infor = "_" + str(epoch)
            utils.save_model(self.model, self.args, special_infor)
            best_error = val_error_metric
            logging.info('### Epoch: [%d/%d], Saving model! Current best error: %f ###',
                         epoch, self.args.epochs, best_error)

    return best_error, self.train_time, self.val_time
def fit(self, X, y):
    """
    Fit the classifier to the given training data.

    Input:
        X: (N, M) matrix of N training data samples
        y: (N,) vector of N training data labels
    """
    self.__clf.fit(X, y)
    if self.__pickle is not None:
        utils.save_model(self.__clf, self.__pickle)
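
# The wrapper above delegates to an sklearn-style estimator and persists it
# when a pickle path was supplied. A minimal self-contained sketch of the same
# pattern (LogisticRegression and the plain-pickle format are assumptions):
import pickle
from sklearn.linear_model import LogisticRegression

def fit_and_save(X, y, pickle_path=None):
    clf = LogisticRegression().fit(X, y)
    if pickle_path is not None:
        with open(pickle_path, "wb") as f:
            pickle.dump(clf, f)   # persist the fitted estimator
    return clf
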
def hmm_train_eval(train_data, test_data, word2id, tag2id, remove_0=False):
    train_word_lists, train_tag_lists = train_data
    test_word_lists, test_tag_lists = test_data

    model = HMM(len(tag2id), len(word2id))
    model.train(train_word_lists, train_tag_lists, word2id, tag2id)
    save_model(model, '../models/st_models/deepNER/hmm.pkl')

    pred_tag_lists = model.test(test_word_lists, word2id, tag2id)
    metrics = Metrics(test_tag_lists, pred_tag_lists)
    metrics.report_scores(dtype='HMM')
def crf_train_eval(train_data, test_data, remove_0=False):
    print('Training and evaluating the CRF model')
    train_word_lists, train_tag_lists = train_data
    test_word_lists, test_tag_lists = test_data

    model = CRFModel()
    model.train(train_word_lists, train_tag_lists)
    save_model(model, '../models/st_models/deepNER/crf.pkl')

    pred_tag_lists = model.test(test_word_lists)
    metrics = Metrics(test_tag_lists, pred_tag_lists)
    metrics.report_scores(dtype='CRF')
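
# Both snippets above pass a model object and a .pkl path to save_model. A
# pickle-based helper consistent with those call sites might look like this
# (a sketch; the repo's actual save_model may differ):
import os
import pickle

def save_model(model, file_name):
    dir_name = os.path.dirname(file_name)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    with open(file_name, "wb") as f:
        pickle.dump(model, f)
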
# create network (this snippet begins mid-function; the guard below is implied
# by the `else` branch and is an assumption, since the original `if` is elided)
if start_epoch == 0:  # assumed condition
    torch.manual_seed(seed)
    n_in, n_out = get_data_dimensions(dataset)
    net = Net(n_in, n_hidden, n_out, n_layer, act=act, noise_type=noise_type,
              noise_level=noise_level, init_val=init_val).to(device, dtype)
    save_model(net, experiment_name, 0, noise_type, noise_level,
               model_dir=model_dir)
else:
    print("starting from epoch {}".format(start_epoch))
    net = recreate_model(model_to_load, dataset=dataset, act=act)

# optimiser parameters
optimiser = get_optimiser(net.parameters(), op, learning_rate, momentum)

# training criterion
criterion = torch.nn.CrossEntropyLoss()

# train network
train(net,
def save_to_mlflow(self, is_remote=False):
    save_model(self, log_to_mlflow=True, is_remote=is_remote)
def train_vcae(n_epochs, model, train_iterator, val_iterator, optimizer,
               device, criterion, save_best=True, verbose=True,
               is_nf=False, nf=None):
    model_name = ('NormalizingFlow' + model.__class__.__name__
                  if is_nf else model.__class__.__name__)
    writer, experiment_name, best_model_path = setup_experiment(model_name,
                                                                log_dir="./tb")

    mb = master_bar(range(n_epochs))
    train_losses, val_losses = [], []
    best_val_loss = float('+inf')

    for epoch in mb:
        train_loss = run_epoch(model, train_iterator, optimizer, criterion, mb,
                               phase='train', epoch=epoch, writer=writer,
                               is_nf=is_nf, nf=nf, device=device)
        val_loss = run_epoch(model, val_iterator, None, criterion, mb,
                             phase='val', epoch=epoch, writer=writer,
                             is_nf=is_nf, nf=nf, device=device)

        # save logs
        dict_saver = {}
        dict_saver.update({'train_loss_mean': train_loss})
        dict_saver.update({'test_loss_mean': val_loss})
        file_to_save_path = ''.join([LOG_PATH, FILE_NAME, experiment_name,
                                     FILE_EXCITON])
        save_to_file(file_to_save_path, dict_saver)

        # save the best model
        if save_best and (val_loss < best_val_loss):
            best_val_loss = val_loss
            save_model(nf if is_nf else model, best_model_path)

        if verbose:
            # append to a list for real-time plotting
            train_losses.append(train_loss)
            val_losses.append(val_loss)

            # start plotting for notebook
            mb.main_bar.comment = f'EPOCHS, best_loss:{best_val_loss}'
            mb.child.comment = (f"train_loss:{round(train_loss, 3)}, "
                                f"val_loss:{round(val_loss, 3)}")
            plot_loss_update(epoch, n_epochs, mb, train_losses, val_losses)

    return best_model_path
def train(args, model, train_loader, eval_loader, num_epochs, output,
          opt=None, s_epoch=0):
    device = args.device
    lr_default = args.lr
    lr_decay_step = 2
    lr_decay_rate = .25
    lr_decay_epochs = range(10, 20, lr_decay_step)
    gradual_warmup_steps = [0.5 * lr_default, 1.0 * lr_default,
                            1.5 * lr_default, 2.0 * lr_default]
    saving_epoch = 0
    grad_clip = args.clip_norm

    utils.create_dir(output)
    optim = torch.optim.Adamax(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=lr_default) if opt is None else opt

    # Initial loss function
    criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')

    logger = utils.Logger(os.path.join(output, 'log.txt'))
    logger.write(args.__repr__())
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f'
                 % (lr_default, lr_decay_step, lr_decay_rate, grad_clip))

    # Create trainer
    trainer = Trainer(args, model, criterion, optim)
    update_freq = int(args.update_freq)
    wall_time_start = time.time()

    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        num_updates = 0
        t = time.time()
        N = len(train_loader.dataset)
        num_batches = int(N / args.batch_size + 1)

        if epoch < len(gradual_warmup_steps):
            trainer.optimizer.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.8f'
                         % trainer.optimizer.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            trainer.optimizer.param_groups[0]['lr'] *= lr_decay_rate
            logger.write('decreased lr: %.8f'
                         % trainer.optimizer.param_groups[0]['lr'])
        else:
            logger.write('lr: %.8f' % trainer.optimizer.param_groups[0]['lr'])

        for i, (v, b, q, a, ans_mc, ans_gt) in enumerate(train_loader):
            v = v.to(device)
            b = b.to(device)
            q = q.to(device)
            a = a.to(device)
            ans_mc = ans_mc.to(device)

            # Clone each sample to 4 samples (one per multiple-choice answer)
            v = v.unsqueeze(1).expand(v.size(0), 4, v.size(1),
                                      v.size(2)).contiguous().view(
                                          v.size(0) * 4, v.size(1), v.size(2))
            q = q.unsqueeze(1).expand(q.size(0), 4,
                                      q.size(1)).contiguous().view(
                                          q.size(0) * 4, q.size(1))
            ans_mc = ans_mc.view(ans_mc.size(0) * ans_mc.size(1), ans_mc.size(2))
            a = a.view(ans_mc.size(0), 1)
            labels = torch.cat([a, 1 - a], 1)
            labels = labels.to(device)
            sample = [v, b, q, labels, ans_mc]

            if i < num_batches - 1 and (i + 1) % update_freq > 0:
                trainer.train_step(sample, update_params=False)
            else:
                loss, grad_norm, batch_score = trainer.train_step(
                    sample, update_params=True)
                total_norm += grad_norm
                count_norm += 1
                total_loss += loss.item()
                train_score += batch_score
                num_updates += 1
                if num_updates % int(args.print_interval / update_freq) == 0:
                    print("Iter: {}, Loss {:.4f}, Norm: {:.4f}, Total norm: {:.4f}, "
                          "Num updates: {}, Wall time: {:.2f}, ETA: {}".format(
                              i + 1, total_loss / (num_updates + 1), grad_norm,
                              total_norm, num_updates,
                              time.time() - wall_time_start,
                              utils.time_since(t, i / num_batches)))

        total_loss /= num_updates
        train_score = 100 * train_score / (num_updates * args.batch_size)

        if eval_loader is not None:
            print("Evaluating...")
            trainer.model.train(False)
            eval_score, bound = evaluate(model, eval_loader, args)
            trainer.model.train(True)

        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f'
                     % (total_loss, total_norm / count_norm, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f (%.2f)'
                         % (100 * eval_score, 100 * bound))

        # Save per epoch
        if epoch >= saving_epoch:
            model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
            utils.save_model(model_path, model, epoch, trainer.optimizer)

            # Save best epoch
            if eval_loader is not None and eval_score > best_eval_score:
                model_path = os.path.join(output, 'model_epoch_best.pth')
                utils.save_model(model_path, model, epoch, trainer.optimizer)
                best_eval_score = eval_score
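
# The call sites above use utils.save_model(path, model, epoch, optimizer).
# A checkpoint writer matching that signature could look like this (a sketch
# under assumed conventions; the repo's utils.save_model may store more):
import torch

def save_checkpoint(path, model, epoch, optimizer=None):
    ckpt = {'epoch': epoch, 'model_state': model.state_dict()}
    if optimizer is not None:
        ckpt['optimizer_state'] = optimizer.state_dict()
    torch.save(ckpt, path)
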
def main(optin):
    if not os.path.exists('checkpoint/' + optin.exp):
        os.makedirs('checkpoint/' + optin.exp)

    model = PRN(optin.node_count, optin.coeff).cuda()
    # model = torch.nn.DataParallel(model).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=optin.lr)
    criterion = torch.nn.BCELoss().cuda()

    print(model)
    print(">>> total params: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))

    save_options(optin, os.path.join('checkpoint/' + optin.exp),
                 model.__str__(), criterion.__str__(), optimizer.__str__())

    print('---------Loading Coco Training Set--------')
    coco_train = COCO(
        os.path.join('data/annotations/person_keypoints_train2017.json'))
    trainloader = DataLoader(dataset=CocoDataset(coco_train, optin),
                             batch_size=optin.batch_size,
                             num_workers=optin.num_workers,
                             shuffle=True)
    bar = Bar('-->', fill='>', max=len(trainloader))
    cudnn.benchmark = True

    for epoch in range(optin.number_of_epoch):
        print('-------------Training Epoch {}-------------'.format(epoch))
        print('Total Step:', len(trainloader), '| Total Epoch:',
              optin.number_of_epoch)
        lr = adjust_lr(optimizer, epoch, optin.lr_gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        for idx, (input, label) in tqdm(enumerate(trainloader)):
            input = input.cuda().float()
            label = label.cuda().float()

            outputs = model(input)
            optimizer.zero_grad()
            loss = criterion(outputs, label)
            loss.backward()
            optimizer.step()

            if idx % 200 == 0:
                bar.suffix = ('Epoch: {epoch} Total: {ttl} | ETA: {eta:} | loss:{loss}'
                              .format(ttl=bar.elapsed_td, eta=bar.eta_td,
                                      loss=loss.data, epoch=epoch))
            bar.next()

        Evaluation(model, optin)
        save_model(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            checkpoint='checkpoint/' + optin.exp)
        model.train()
bleus_2_val.append(bleu_2_val)
bleus_3_val.append(bleu_3_val)
bleus_4_val.append(bleu_4_val)
meteors_val.append(meteor_score_val)

# save evaluation scores of the validation
save_list_to_file(meteors_val, save_model_path, 'meteor_val_list.json')
save_list_to_file(bleus_1_val, save_model_path, 'bleus1_val_list.json')
save_list_to_file(bleus_2_val, save_model_path, 'bleus2_val_list.json')
save_list_to_file(bleus_3_val, save_model_path, 'bleus3_val_list.json')
save_list_to_file(bleus_4_val, save_model_path, 'bleus4_val_list.json')

# save the model if it achieves better evaluation scores on the validation set
meteor_best = save_model(reference_value=meteor_best,
                         candidate_value=meteor_score_val,
                         model=model, path=save_model_path,
                         model_name='model_val_meteor')
meteor_train_best = save_model(reference_value=meteor_train_best,
                               candidate_value=meteor_score_train,
                               model=model, path=save_model_path,
                               model_name='model_train_meteor')
save_model(reference_value=b1_best, candidate_value=bleu_1_val,
           model=model, path=save_model_path, model_name='model_bleu_1')
save_model(reference_value=b2_best, candidate_value=bleu_2_val,
           model=model,
def train(self, model, tr_loader, va_loader=None, adv_train=False):
    args = self.args
    logger = self.logger

    opt = torch.optim.SGD(model.parameters(), args.learning_rate,
                          weight_decay=args.weight_decay,
                          momentum=args.momentum)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        opt, milestones=[40000, 60000], gamma=0.1)
    _iter = 0

    begin_time = time()

    for epoch in range(1, args.max_epoch + 1):
        for data, label in tr_loader:
            data, label = tensor2cuda(data), tensor2cuda(label)

            if adv_train:
                # When training, the adversarial example is created from a random
                # close point to the original data point. If in evaluation mode,
                # just start from the original data point.
                adv_data = self.attack.perturb(data, label, 'mean', True)
                output = model(adv_data, _eval=False)
            else:
                output = model(data, _eval=False)

            loss = F.cross_entropy(output, label)
            opt.zero_grad()
            loss.backward()
            opt.step()

            if _iter % args.n_eval_step == 0:
                t1 = time()

                if adv_train:
                    with torch.no_grad():
                        stand_output = model(data, _eval=True)
                    pred = torch.max(stand_output, dim=1)[1]
                    std_acc = evaluate(pred.cpu().numpy(),
                                       label.cpu().numpy()) * 100

                    pred = torch.max(output, dim=1)[1]
                    adv_acc = evaluate(pred.cpu().numpy(),
                                       label.cpu().numpy()) * 100
                else:
                    adv_data = self.attack.perturb(data, label, 'mean', False)
                    with torch.no_grad():
                        adv_output = model(adv_data, _eval=True)
                    pred = torch.max(adv_output, dim=1)[1]
                    adv_acc = evaluate(pred.cpu().numpy(),
                                       label.cpu().numpy()) * 100

                    pred = torch.max(output, dim=1)[1]
                    std_acc = evaluate(pred.cpu().numpy(),
                                       label.cpu().numpy()) * 100

                t2 = time()

                logger.info(f'epoch: {epoch}, iter: {_iter}, '
                            f'lr={opt.param_groups[0]["lr"]}, '
                            f'spent {time()-begin_time:.2f} s, '
                            f'tr_loss: {loss.item():.3f}')
                logger.info(f'standard acc: {std_acc:.3f}%, '
                            f'robustness acc: {adv_acc:.3f}%')

                # begin_time = time()
                # if va_loader is not None:
                #     va_acc, va_adv_acc = self.test(model, va_loader, True)
                #     va_acc, va_adv_acc = va_acc * 100.0, va_adv_acc * 100.0
                #     logger.info('\n' + '='*30 + ' evaluation ' + '='*30)
                #     logger.info('test acc: %.3f %%, test adv acc: %.3f %%, spent: %.3f' % (
                #         va_acc, va_adv_acc, time() - begin_time))
                #     logger.info('='*28 + ' end of evaluation ' + '='*28 + '\n')

                begin_time = time()

            if _iter % args.n_store_image_step == 0:
                tv.utils.save_image(
                    torch.cat([data.cpu(), adv_data.cpu()], dim=0),
                    os.path.join(args.log_folder, f'images_{_iter}.jpg'),
                    nrow=16)

            if _iter % args.n_checkpoint_step == 0:
                file_name = os.path.join(args.model_folder,
                                         f'checkpoint_{_iter}.pth')
                save_model(model, file_name)

            _iter += 1
            # the scheduler depends on the training iteration, so step per batch
            scheduler.step()

        if va_loader is not None:
            t1 = time()
            va_acc, va_adv_acc = self.test(model, va_loader, True, False)
            va_acc, va_adv_acc = va_acc * 100.0, va_adv_acc * 100.0
            t2 = time()
            logger.info('\n' + '=' * 20
                        + f' evaluation at epoch: {epoch} iteration: {_iter} '
                        + '=' * 20)
            logger.info(f'test acc: {va_acc:.3f}%, test adv acc: '
                        f'{va_adv_acc:.3f}%, spent: {t2-t1:.3f} s')
            logger.info('=' * 28 + ' end of evaluation ' + '=' * 28 + '\n')
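
# The comment above describes a PGD-style attack with a random start. A
# minimal L-infinity PGD sketch of that idea (eps/alpha/steps are illustrative
# defaults, not the repo's self.attack implementation):
import torch
import torch.nn.functional as F

def pgd_perturb(model, x, y, eps=8 / 255, alpha=2 / 255, steps=10,
                random_start=True):
    x0 = x.detach()
    x_adv = x0.clone()
    if random_start:
        # start from a random close point inside the eps-ball
        x_adv = (x_adv + torch.empty_like(x_adv).uniform_(-eps, eps)).clamp(0, 1)
    for _ in range(steps):
        x_adv.requires_grad_(True)
        loss = F.cross_entropy(model(x_adv), y)
        grad, = torch.autograd.grad(loss, x_adv)
        with torch.no_grad():
            x_adv = x_adv + alpha * grad.sign()          # ascend the loss
            x_adv = x0 + (x_adv - x0).clamp(-eps, eps)   # project into eps-ball
            x_adv = x_adv.clamp(0, 1)                    # keep a valid image
    return x_adv.detach()
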
def main(): arg = args() if not os.path.exists(arg.exp_name): os.makedirs(arg.exp_name) assert arg.exp_name.split( '/')[0] == 'o', "'o' is the directory of experiment, --exp_name o/..." output_dir = arg.exp_name if arg.local_rank == 0: save_scripts_in_exp_dir(output_dir) logger = logging_set(output_dir, arg.local_rank) logger.info(arg) logger.info( '\n================ experient name:[{}] ===================\n'.format( arg.exp_name)) os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True np.random.seed(0) torch.manual_seed(0) config = edict(yaml.load(open(arg.cfg, 'r'))) if arg.search: assert arg.search in [ 'None', 'sync', 'random', 'second_order_gradient', 'first_order_gradient' ] config.train.arch_search_strategy = arg.search if arg.batchsize: logger.info("update batchsize to {}".format(arg.batchsize)) config.train.batchsize = arg.batchsize config.num_workers = arg.num_workers print( 'GPU memory : \ntotal | used\n', os.popen( 'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader' ).read()) logger.info( '------------------------------ configuration ---------------------------' ) logger.info( '\n==> available {} GPUs , use numbers are {} device is {}\n'.format( torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"], torch.cuda.current_device())) # torch.cuda._initialized = True logger.info(pprint.pformat(config)) logger.info( '------------------------------- -------- ----------------------------' ) best = 0 criterion = MSELoss() Arch = bulid_up_network(config, criterion) if config.train.arch_search_strategy == 'random': logger.info("==>random seed is {}".format(config.train.random_seed)) np.random.seed(config.train.random_seed) torch.manual_seed(config.train.random_seed) Arch.arch_parameters_random_search() if arg.param_flop: Arch._print_info() if len(arg.gpu) > 1: use_multi_gpu = True if arg.distributed: torch.distributed.init_process_group(backend="nccl") #torch.distributed.init_process_group(backend="nccl",init_method='env://') local_rank = torch.distributed.get_rank() torch.cuda.set_device(local_rank) device = torch.device("cuda", local_rank) Arch.to(device) Arch = torch.nn.parallel.DistributedDataParallel( Arch, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) logger.info("local rank = {}".format(local_rank)) else: Arch = torch.nn.DataParallel(Arch).cuda() else: use_multi_gpu = False Arch = Arch.cuda() Search = Search_Arch(Arch.module, config) if use_multi_gpu else Search_Arch( Arch, config) # Arch.module for nn.DataParallel search_strategy = config.train.arch_search_strategy if not arg.distributed: train_queue, arch_queue, valid_queue = Dataloaders( search_strategy, config, arg) else: train_queue, \ arch_queue, \ valid_queue, \ train_sampler_dist, = Dataloaders(search_strategy,config,arg) #Note: if the search strategy is `None` or `SYNC`, the arch_queue is None! 
    logger.info("\nNeural Architecture Search strategy is {}".format(search_strategy))
    assert search_strategy in ['first_order_gradient', 'random', 'None',
                               'second_order_gradient', 'sync']

    if search_strategy == 'sync':
        # arch_parameters are also registered as model parameters,
        # so the weight optimizer updates them synchronously
        logger.info("sync: The arch_parameters are also optimized by the "
                    "weight-optimizer synchronously")
        optimizer = torch.optim.Adam(
            Arch.parameters(),
            lr=config.train.w_lr_cosine_begin,
        )
    else:
        # if the search strategy is None, random, second_order_gradient, etc.,
        # the arch_parameters are filtered out of the weight-optimizer
        optimizer = torch.optim.Adam(
            filter_arch_parameters(Arch),
            lr=config.train.w_lr_cosine_begin,
        )
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
    #                                             step_size=config.train.lr_step_size,
    #                                             gamma=config.train.lr_decay_gamma)
    if config.train.scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.train.LR_STEP, config.train.LR_FACTOR)
    elif config.train.scheduler_name == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=config.train.epoch_end,
            eta_min=config.train.w_lr_cosine_end)

    # best_result
    logger.info("\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= training +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+==")
    begin, end = config.train.epoch_begin, config.train.epoch_end
    if arg.load_ckpt:
        if use_multi_gpu:
            begin, best = load_ckpt(Arch.module, optimizer, scheduler,
                                    output_dir, logger)
        else:
            begin, best = load_ckpt(Arch, optimizer, scheduler,
                                    output_dir, logger)

    for epoch in range(begin, end):
        lr = scheduler.get_lr()[0]
        logger.info('==>time:({})--training...... current learning rate is {:.7f}'
                    .format(datetime.datetime.now(), lr))
        if arg.distributed:
            train_sampler_dist.set_epoch(epoch)
            # valid_sampler_dist.set_epoch(epoch)
        train(epoch, train_queue, arch_queue, Arch, Search, criterion,
              optimizer, lr, search_strategy, output_dir, logger, config, arg)
        scheduler.step()

        if not arg.distributed or (arg.distributed and arg.local_rank == 0):
            eval_results = evaluate(Arch, valid_queue, config, output_dir)
            if use_multi_gpu:
                best = save_model(epoch, best, eval_results, Arch.module,
                                  optimizer, scheduler, output_dir, logger)
            else:
                best = save_model(epoch, best, eval_results, Arch,
                                  optimizer, scheduler, output_dir, logger)
def on_batch_k_examples(self, batch, logs):
    save_model(model=self.model, optimizer=self.optimizer,
               filename=os.path.join(self.save_path, f"model_it{self.it}"))
    self.it += 1
def train(self, train, dev=None, test=None, to_predict=None, max_iter=5,
          batch_size=128, test_batch_size=1000, pre_test_batch=25,
          predict_path=None):
    train_x, train_y = train
    self.set_train_data(train)
    train_index = align_batch_size(range(len(train_y)), batch_size)
    train_x_length = np.sum((train_x > 0), axis=1)
    num_batch = len(train_index) // batch_size  # integer division
    batch_list = range(num_batch)
    log_loss_history, acc_history = list(), list()
    batch_log_loss_history, batch_acc_history = list(), list()
    logger.info("start training")
    batch_count = 0
    best_dev_acc = 0
    for i in range(max_iter):
        iter_loss_list = list()
        iter_acc_list = list()
        batch_list = np.random.permutation(batch_list)
        for j in batch_list:
            set_dropout_on(True)
            batch_count += 1
            indexs = train_index[j * batch_size:(j + 1) * batch_size]
            max_len = np.max(train_x_length[indexs])
            self.train_batch(indexs, max_len)
            if batch_count % pre_test_batch == 0:
                set_dropout_on(False)
                batch_log_loss, batch_acc = [batch_count], [batch_count]
                if dev is not None:
                    dev_x, dev_y = dev
                    dev_acc, dev_log_loss = self.predict_data_log_loss_acc(
                        dev_x, dev_y, test_batch_size)
                    batch_log_loss.append(dev_log_loss)
                    batch_acc.append(dev_acc)
                    if dev_acc > best_dev_acc:
                        best_dev_acc = dev_acc
                        save_model("model/%s.best.model" % predict_path, self)
                    logger.info("batch %d, dev log loss %s, acc %s"
                                % (batch_count, dev_log_loss, dev_acc))
                if test is not None:
                    test_x, test_y = test
                    test_acc, test_log_loss = self.predict_data_log_loss_acc(
                        test_x, test_y, test_batch_size)
                    batch_log_loss.append(test_log_loss)
                    batch_acc.append(test_acc)
                    logger.info("batch %d, test log loss %s, acc %s"
                                % (batch_count, test_log_loss, test_acc))
                batch_log_loss_history.append(batch_log_loss)
                batch_acc_history.append(batch_acc)
        set_dropout_on(False)
        train_acc, train_log_loss = self.predict_data_log_loss_acc(
            train_x, train_y, test_batch_size)
        iter_loss_list.append(train_log_loss)
        iter_acc_list.append(train_acc)
        iter_l2_loss, iter_l2_norm = self.get_l2_loss()
        logger.info("epoch %d, param l2 loss %s, l2 norm %s"
                    % (i, iter_l2_loss, iter_l2_norm))
        logger.info("epoch %d, train log loss %s, acc %s"
                    % (i, train_log_loss, train_acc))
        if dev is not None:
            dev_x, dev_y = dev
            dev_acc, dev_log_loss = self.predict_data_log_loss_acc(
                dev_x, dev_y, test_batch_size)
            logger.info("epoch %d, dev log loss %s, acc %s"
                        % (i, dev_log_loss, dev_acc))
            if dev_acc > best_dev_acc:
                best_dev_acc = dev_acc
                save_model("model/%s.best.model" % predict_path, self)
            iter_loss_list.append(dev_log_loss)
            iter_acc_list.append(dev_acc)
        if test is not None:
            test_x, test_y = test
            test_acc, test_log_loss = self.predict_data_log_loss_acc(
                test_x, test_y, test_batch_size)
            logger.info("epoch %d, test log loss %s, acc %s"
                        % (i, test_log_loss, test_acc))
            iter_loss_list.append(test_log_loss)
            iter_acc_list.append(test_acc)
        log_loss_history.append(iter_loss_list)
        acc_history.append(iter_acc_list)

    # Log best epoch
    log_loss_history = np.array(log_loss_history)
    acc_history = np.array(acc_history)
    # Log best batch
    batch_log_loss_history = np.array(batch_log_loss_history)
    batch_acc_history = np.array(batch_acc_history)
    self.log_to_file("Epoch", log_loss_history, acc_history)
    self.log_to_file("Batch", batch_log_loss_history, batch_acc_history)
    save_model("model/%s.final.model" % predict_path, self)
    '+++++++++++++Calculating batch stats for normalizing+++++++++++++')
imagenet_stats = {'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225]}
train_ds, valid_ds = train.get_datasets(path_dogs, human_train, human_valid,
                                        stats=imagenet_stats,
                                        size=args.img_size)
bs = args.batch_size
dls = train.get_dls(train_ds, valid_ds, bs=bs)
train.display_message(
    '+++++++++++++Getting Model ready for training+++++++++++++')
model = models.ModelTransfer()
device = train.get_device()
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
criterion = loss_func.CustomLoss(train_ds.dog_human_labeller)
recorder = metrics.Recorder()
n_epochs = args.n_epochs
modelutils.freeze(model.model)
train.run(n_epochs, model, optimizer, criterion, dls, device, recorder,
          max_lr=args.max_lr, env='shell')
utils.save_model(model, f'model_transfer_{n_epochs}_{bs}_{args.lr}',
                 train_ds.breed_labeller, train_ds.dog_human_labeller,
                 imagenet_stats)
def save_weights_fnc(epoch, logs):
    if epoch % save_freq == 0:
        logger.info("Saving model from epoch " + str(epoch))
        save_model(model, optimizer,
                   os.path.join(save_path, "model_last_epoch"))
def training_loop(model, loss_function, metrics, optimizer, config, save_path,
                  datasets, steps_per_epoch, seed, custom_callbacks=[],
                  checkpoint_monitor="val_categorical_accuracy:0",
                  use_tb=False, reload=False, evaluation_freq=1, n_epochs=100,
                  save_freq=1, save_history_every_k_examples=1,
                  load_weights_from="", load_weights_and_optimizer_from="",
                  weight_decay=0, load_classifier=True):
    if load_weights_and_optimizer_from != "":
        assert load_weights_from == ""
        assert load_weights_and_optimizer_from.endswith("h5")
        logger.info(f"load_weights_and_optimizer_from={load_weights_and_optimizer_from}")
        _, optimizer = restore_model_and_optimizer(
            model, optimizer, load_weights_and_optimizer_from)
        logger.info("Loaded optimizer")
        model.load_weights(load_weights_and_optimizer_from, by_name=True)
        model.optimizer = optimizer
    elif load_weights_from != "":
        assert load_weights_and_optimizer_from == ""
        model.load_weights(load_weights_from)

    if reload:
        assert load_classifier is True
        logger.warning("Only custom_callbacks can be reloaded for now")
        previous_model = model
        model, optimizer, H, epoch_start = _reload(model, optimizer, save_path,
                                                   custom_callbacks)
        del previous_model
        logger.warning("Changed model reference internally in the training loop!")
    else:
        if hasattr(model, "compile"):
            model.compile(optimizer=optimizer, loss=loss_function,
                          metrics=[m for m in metrics
                                   if not isinstance(m, str)])  # FIXME
        save_model(model, optimizer, os.path.join(save_path, "init_weights"))
        history_csv_path = os.path.join(save_path, "history.csv")
        history_pkl_path = os.path.join(save_path, "history.pkl")
        logger.info("Removing {} and {}".format(history_pkl_path,
                                                history_csv_path))
        os.system("rm " + history_pkl_path)
        os.system("rm " + history_csv_path)
        H, epoch_start = {}, 0

    callbacks = list(custom_callbacks)
    callbacks += _construct_default_callbacks(model, optimizer, H, save_path,
                                              checkpoint_monitor, save_freq,
                                              custom_callbacks, use_tb,
                                              save_history_every_k_examples)

    # Configure callbacks
    for clbk in callbacks:
        clbk.set_save_path(save_path)
        clbk.set_optimizer(optimizer)
        clbk.set_model(model)
        clbk.set_seed(seed)
        clbk.set_datasets(datasets)
        clbk.set_config(config)
        clbk.set_callbacks(callbacks)

    _training_loop(model, datasets, optimizer, loss_function, epoch_start,
                   n_epochs, callbacks, steps_per_epoch,
                   train_on_batch=_train_on_batch_optimized,
                   evaluate_model=evaluate, metrics=metrics,
                   evaluation_freq=evaluation_freq, weight_decay=weight_decay)

    if save_freq != -1:
        save_model(model, optimizer,
                   os.path.join(save_path, "model_last_epoch"))
def train(self, data, pickle_path=None):
    """
    Train this featurizer on the training set using the following procedure:

    1. Compute the keypoint descriptors for each image. Keypoint descriptors
       are M-length vectors for each detected keypoint in an image. There can
       be an arbitrary number of keypoints per image.
    2. Perform k-means clustering on the set of all descriptors gathered from
       all images. The descriptors are divided into K groups (K is specified
       in the constructor). The distribution of an image's keypoints among
       these K groups (which is computed using each keypoint's descriptor)
       determines the feature vector of that image.
    3. For each image, go through its keypoints and increment the element of
       the zero-initialized feature vector which corresponds to the cluster
       of that keypoint. In essence, each keypoint of an image votes on which
       of the K groups it thinks the image belongs to. These votes become the
       feature vector of that image.

    Input:
        data: (N, H, W[, C]) matrix of N training images
        pickle_path: location of pickle file to save/load model
    Output:
        features: (N, K) matrix of K-length feature vectors of N images
    """
    if pickle_path is not None:
        self.__kmeans = utils.load_model(pickle_path)
        # warn when no saved model exists; otherwise reuse the saved model
        if self.__kmeans is None:
            self.__logger.warning(
                "No pickle file found at {}".format(pickle_path))
        else:
            self.__logger.info(
                "Computing {} descriptors using saved model".format(
                    self.__name.upper()))
            return self.test(data)

    # Use multiple processes to calculate descriptors.
    # Use all but one thread if possible to prevent crashes when using all
    # threads. Pool doesn't like it when there is not much data, so if there
    # is only need for 1 processor we don't pool; otherwise we would block
    # indefinitely for some reason.
    n_images = data.shape[0]
    features = np.zeros((n_images, self.__vocab_size))
    nprocs = (os.cpu_count() - 1) if os.cpu_count() > 1 else 1
    nprocs = nprocs if n_images > (nprocs * 20) else 1
    self.__logger.info("Computing {} descriptors using {} processes".format(
        self.__name.upper(), nprocs))
    if nprocs > 1:
        subsets = np.array_split(data, nprocs, axis=0)
        pool = Pool(processes=nprocs)
        pool_args = zip(subsets, itertools.repeat(self.__name))
        batch_descriptors = pool.map(_compute_features, pool_args)
        img_descriptors = np.concatenate([d for d in batch_descriptors])
    else:
        img_descriptors = _compute_features((data, self.__name))

    # Reshape data so that we can give k-means a flat list of descriptors
    # while maintaining a descriptor->image mapping, which we will need to
    # generate features later
    img_ids, descriptors = [], []
    for i, img_desc in enumerate(img_descriptors):
        img_ids.extend([i] * len(img_desc))
        descriptors.extend([desc for desc in img_desc])

    # k-means to determine a feature vector for each image.
    # Use MiniBatchKMeans for speed.
    self.__kmeans = MiniBatchKMeans(
        batch_size=10,  # smaller batch for less memory use
        n_clusters=self.__vocab_size,
        init_size=(3 * self.__vocab_size)).fit(descriptors)

    # We can assume that the image IDs calculated before correspond to the
    # correct k-means label because MiniBatchKMeans preserves order.
    # Otherwise we would have to use predict(X) on each image (slow).
    for img_id, cluster_id in zip(img_ids, self.__kmeans.labels_):
        features[img_id, cluster_id] += 1

    if pickle_path is not None:
        utils.save_model(self.__kmeans, pickle_path)

    return features
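
# Standalone demo of the descriptor-voting step described in the docstring
# above, using random vectors in place of real keypoint descriptors (all data
# here is synthetic, for illustration only):
import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.default_rng(0)
img_ids = np.repeat(np.arange(3), 40)        # 3 images, 40 keypoints each
descriptors = rng.normal(size=(120, 64))     # one 64-dim descriptor per keypoint
K = 8
km = MiniBatchKMeans(n_clusters=K, n_init=3).fit(descriptors)
features = np.zeros((3, K))
for img_id, cluster_id in zip(img_ids, km.labels_):
    features[img_id, cluster_id] += 1        # each keypoint votes for its cluster
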
def train(args, model, optimizer, scheduler, tokenizer, ner_index, *,
          train_loader, valid_df, valid_loader, epoch_length, n_epochs=None):
    n_epochs = n_epochs or args.n_epochs
    run_root = Path('../experiments/' + args.run_root)
    model_path = run_root / ('tagger_model-%d.pt' % args.fold)
    best_model_path = run_root / ('best-model-%d.pt' % args.fold)
    if best_model_path.exists():
        state, best_valid_score = load_model(model, best_model_path)
        start_epoch = state['epoch']
        best_epoch = start_epoch
    else:
        best_valid_score = 0
        start_epoch = 0
        best_epoch = 0
    step = 0
    criterion = CrossEntropyLoss().cuda()

    report_each = 10000
    log = run_root.joinpath('train-%d.log' % args.fold).open('at',
                                                             encoding='utf8')
    for epoch in range(start_epoch, start_epoch + n_epochs):
        model.train()
        tq = tqdm.tqdm(total=epoch_length)
        losses = []
        mean_loss = 0
        device = torch.device("cuda", 0)
        for i, (ori_sen, token, token_type, start, end, insert_pos,
                start_ner, end_ner) in enumerate(train_loader):
            input_mask = (token > 0).to(device)
            token, input_mask, token_type, start, end, insert_pos, start_ner, end_ner = \
                token.to(device), input_mask.to(device), token_type.to(device), \
                start.to(device), end.to(device), insert_pos.to(device), \
                start_ner.to(device), end_ner.to(device)
            outputs = model(input_ids=token, attention_mask=input_mask,
                            token_type_ids=token_type, start=start, end=end,
                            insert_pos=insert_pos, start_ner=start_ner,
                            end_ner=end_ner)
            loss = outputs[0]
            if (i + 1) % args.step == 0:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            else:
                with amp.scale_loss(loss, optimizer,
                                    delay_unscale=True) as scaled_loss:
                    scaled_loss.backward()
            tq.update(args.batch_size)
            losses.append(loss.item() * args.step)
            mean_loss = np.mean(losses[-report_each:])
            tq.set_postfix(loss=f'{mean_loss:.5f}')
            lr = get_learning_rate(optimizer)
            tq.set_description(f'Epoch {epoch}, lr {lr:.6f}')
            if i and i % report_each == 0:
                write_event(log, step, loss=mean_loss)
            # break
        write_event(log, step, epoch=epoch, loss=mean_loss)
        tq.close()

        valid_metrics = validate(model, valid_loader, valid_df, args,
                                 tokenizer, ner_index)
        # write_event(log, step, **valid_metrics)
        current_score = valid_metrics['rouge-1']['f']
        if current_score > best_valid_score:
            print('save success')
            save_model(model, epoch, step, mean_loss, model_path)
            best_valid_score = current_score
    return True
def train(self, model, tr_loader, va_loader, device, adv_train=False):
    args = self.args
    logger = self.logger
    criterion = nn.CrossEntropyLoss()
    opt = torch.optim.Adam(model.parameters(), args.learning_rate,
                           betas=(0.9, 0.999), eps=1e-08,
                           weight_decay=args.weight_decay)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(opt,
    #                                                  milestones=[2, 4, 6, 7, 8],
    #                                                  gamma=0.1)
    acc = 0.0
    valid_acc = 0.0
    best_acc = 0
    best_va_acc = 0
    running_loss = 0.0
    tr_loss_list = []
    val_loss_list = []
    correct = 0
    total = 0

    for epoch in range(1, args.max_epoch + 1):
        model.train()
        for data, label, paths in tr_loader:
            data, label = data.to(device), label.to(device)
            opt.zero_grad()
            output = model(data)
            loss = criterion(output, label)
            loss.backward()
            opt.step()
            running_loss += loss.item()
            _, pred = torch.max(output.data, dim=1)
            correct += (pred == label).sum().item()
            total += label.size(0)
        std_acc = (correct / total) * 100
        tr_loss = running_loss / total
        tr_loss_list.append(tr_loss)

        if va_loader is not None:
            model.eval()
            t1 = time()
            va_acc, va_loss = self.test(model, va_loader, device, False, True,
                                        criterion)
            va_acc = va_acc * 100.0
            val_loss_list.append(va_loss)
            t2 = time()
            logger.info('\n' + '=' * 20 + ' evaluation at epoch: %d ' % epoch
                        + '=' * 20)
            logger.info('train acc: %.3f %%, train loss: %.3f, '
                        'validation acc: %.3f %%, valid loss: %.3f, spent: %.3f'
                        % (std_acc, tr_loss, va_acc, va_loss, t2 - t1))
            logger.info('=' * 28 + ' end of evaluation ' + '=' * 28 + '\n')

            acc = std_acc
            valid_acc = va_acc
            if acc >= best_acc and valid_acc >= best_va_acc:
                best_acc = acc
                best_va_acc = valid_acc
                file_name = os.path.join(args.model_folder,
                                         'checkpoint_%d.pth' % epoch)
                save_model(model, file_name)

        # for PyTorch 1.0, opt.step() must be called before scheduler.step()
        # scheduler.step()

    plt.plot(tr_loss_list, c='blue', label='Training Loss')
    plt.plot(val_loss_list, c='green', label='Validation Loss')
    plt.xticks(range(1, args.max_epoch + 1))
    plt.ylim((0, 1))
    plt.legend(loc="upper right")
    plt.savefig(os.path.join(args.model_folder, 'loss_plot.png'))
    plt.close()

    print('Best Train Acc: {:4f}, Best Valid Acc: {:4f}'.format(
        best_acc, best_va_acc))
train.display_message('+++++++++++++Creating DataLoaders+++++++++++++')
train_ds, valid_ds = train.get_datasets(path_dogs, human_train, human_valid,
                                        stats=batch_stat, size=args.img_size)
bs = args.batch_size
dls = train.get_dls(train_ds, valid_ds, bs=bs)
train.display_message(
    '+++++++++++++Getting Model ready for training+++++++++++++')
model = models.ModelScratch()
device = train.get_device()
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
criterion = loss_func.CustomLoss(train_ds.dog_human_labeller)
recorder = metrics.Recorder()
n_epochs = args.n_epochs
train.run(n_epochs, model, optimizer, criterion, dls, device, recorder,
          max_lr=args.max_lr, env='shell')
utils.save_model(
    model,
    f'model_scratch_{n_epochs}_{recorder.valid_acc_breed[-1].item():.2f}',
    train_ds.breed_labeller, train_ds.dog_human_labeller, batch_stat)
def save_model_param_to_file(self, filename):
    to_save = [param.get_value() for param in self.model_params]
    save_model(filename, model=to_save, compress=True)
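
# A matching restore for the parameter list saved above (a sketch assuming
# save_model pickles the list, gzip-compressed given compress=True; the
# actual helper may differ):
import gzip
import pickle

def load_model_params(filename, model_params):
    with gzip.open(filename, "rb") as f:
        values = pickle.load(f)
    for param, value in zip(model_params, values):
        param.set_value(value)   # Theano shared variables expose set_value
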
def train(cfg, model):
    criterion = factory.get_criterion(cfg)
    # optim = torch.optim.Adam(model.parameters(), lr=1e-3)
    optim = factory.get_optimizer(cfg, model.parameters())

    best = {
        'loss': float('inf'),
        'score': 0.0,
        'epoch': -1,
    }
    if "resume_from" in cfg.keys() and cfg["resume_from"]:
        detail = utils.load_model(cfg["resume_from"], model, optim=optim)
        best.update({
            'loss': detail['loss'],
            'score': detail['score'],
            'epoch': detail['epoch'],
        })
        # set the lr manually after resuming
        for param_group in optim.param_groups:
            param_group['lr'] = cfg["optimizer"]["param"]["lr"]

    log(f"initial lr {utils.get_lr(optim)}")
    scheduler, is_reduce_lr = factory.get_scheduler(cfg, optim)
    log(f"is_reduce_lr: {is_reduce_lr}")

    loader_train = factory.get_loader_train(cfg)
    loader_valid = factory.get_loader_valid(cfg)
    log('train data: loaded %d records' % len(loader_train.dataset))
    log('valid data: loaded %d records' % len(loader_valid.dataset))

    log('apex %s' % cfg["apex"])
    if cfg["apex"]:
        amp.initialize(model, optim, opt_level='O1')

    for epoch in range(best['epoch'] + 1, cfg["epoch"]):
        log(f'\n----- epoch {epoch} -----')

        run_nn(cfg, 'train', model, loader_train, criterion=criterion,
               optim=optim, apex=cfg["apex"])
        with torch.no_grad():
            val = run_nn(cfg, 'valid', model, loader_valid,
                         criterion=criterion)

        detail = {
            'score': val['score'],
            'loss': val['loss'],
            'epoch': epoch,
        }
        if val['loss'] <= best['loss']:
            best.update(detail)
            utils.save_model(model, optim, detail, cfg["fold"], output_dir,
                             best=True)
        utils.save_model(model, optim, detail, cfg["fold"], output_dir)

        log('[best] ep:%d loss:%.4f score:%.4f'
            % (best['epoch'], best['loss'], best['score']))

        if is_reduce_lr:
            scheduler.step(val['loss'])  # ReduceLROnPlateau
        else:
            scheduler.step()
def train(self, model, tr_loader, va_loader=None, adv_train=False):
    args = self.args
    logger = self.logger

    opt = torch.optim.Adam(model.parameters(), args.learning_rate,
                           weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(opt,
                                                     milestones=[100, 150],
                                                     gamma=0.1)
    _iter = 0

    begin_time = time()

    for epoch in range(1, args.max_epoch + 1):
        scheduler.step()
        for data, label in tr_loader:
            data, label = tensor2cuda(data), tensor2cuda(label)

            if adv_train:
                # When training, the adversarial example is created from a random
                # close point to the original data point. If in evaluation mode,
                # just start from the original data point.
                adv_data = self.attack.perturb(data, label, 'mean', True)
                # output = model(adv_data, _eval=False)  # unsure whether this signature applies
                model.train()
                output = model(adv_data)
            else:
                # output = model(data, _eval=False)
                model.train()
                output = model(data)

            loss = F.cross_entropy(output, label)
            opt.zero_grad()
            loss.backward()
            opt.step()

            if _iter % args.n_eval_step == 0:
                t1 = time()

                if adv_train:
                    with torch.no_grad():
                        model.eval()
                        stand_output = model(data)
                        # stand_output = model(data, _eval=True)
                    pred = torch.max(stand_output, dim=1)[1]
                    std_acc = evaluate(pred.cpu().numpy(),
                                       label.cpu().numpy()) * 100

                    pred = torch.max(output, dim=1)[1]
                    adv_acc = evaluate(pred.cpu().numpy(),
                                       label.cpu().numpy()) * 100
                else:
                    adv_data = self.attack.perturb(data, label, 'mean', False)
                    with torch.no_grad():
                        model.eval()
                        adv_output = model(adv_data)
                        # adv_output = model(adv_data, _eval=True)
                    pred = torch.max(adv_output, dim=1)[1]
                    adv_acc = evaluate(pred.cpu().numpy(),
                                       label.cpu().numpy()) * 100

                    pred = torch.max(output, dim=1)[1]
                    std_acc = evaluate(pred.cpu().numpy(),
                                       label.cpu().numpy()) * 100

                t2 = time()
                print('%.3f' % (t2 - t1))

                logger.info('epoch: %d, iter: %d, spent %.2f s, tr_loss: %.3f'
                            % (epoch, _iter, time() - begin_time, loss.item()))
                logger.info('standard acc: %.3f %%, robustness acc: %.3f %%'
                            % (std_acc, adv_acc))

                # begin_time = time()
                # if va_loader is not None:
                #     va_acc, va_adv_acc = self.test(model, va_loader, True)
                #     va_acc, va_adv_acc = va_acc * 100.0, va_adv_acc * 100.0
                #     logger.info('\n' + '='*30 + ' evaluation ' + '='*30)
                #     logger.info('test acc: %.3f %%, test adv acc: %.3f %%, spent: %.3f' % (
                #         va_acc, va_adv_acc, time() - begin_time))
                #     logger.info('='*28 + ' end of evaluation ' + '='*28 + '\n')

                begin_time = time()

            if _iter % args.n_store_image_step == 0:
                tv.utils.save_image(
                    torch.cat([data.cpu(), adv_data.cpu()], dim=0),
                    os.path.join(args.log_folder, 'images_%d.jpg' % _iter),
                    nrow=16)

            if _iter % args.n_checkpoint_step == 0:
                file_name = os.path.join(args.model_folder,
                                         'checkpoint_%d.pth' % _iter)
                save_model(model, file_name)

            _iter += 1

        if va_loader is not None:
            t1 = time()
            va_acc, va_adv_acc = self.test(model, va_loader, True, False)
            va_acc, va_adv_acc = va_acc * 100.0, va_adv_acc * 100.0
            t2 = time()
            logger.info('\n' + '=' * 20
                        + ' evaluation at epoch: %d iteration: %d ' % (epoch, _iter)
                        + '=' * 20)
            logger.info('test acc: %.3f %%, test adv acc: %.3f %%, spent: %.3f'
                        % (va_acc, va_adv_acc, t2 - t1))
            logger.info('=' * 28 + ' end of evaluation ' + '=' * 28 + '\n')
        vmax=run_params['pipeline']['normalisation'][1])

    # evaluate performance of encoder / generator
    model.forward_and_save_one_image(
        validation[index]['image'].unsqueeze(0),
        validation[index]['label'], epoch,
        to_mlflow=log_to_mlflow, is_remote=remote_run,
        vmin=run_params['pipeline']['normalisation'][0],
        vmax=run_params['pipeline']['normalisation'][1])

    # Checkpoints
    if ('checkpoint_frequency' in run_params
            and epoch % run_params['checkpoint_frequency'] == 0):
        save_model(model, log_to_mlflow=log_to_mlflow, epoch=epoch,
                   is_remote=remote_run)

print('=========Training ended==========')

# Test performance
test_metric = model.evaluate(test_loader, log_to_mlflow=log_to_mlflow)
test_metrics.append(test_metric)

# Saving
save_model(model, log_to_mlflow=log_to_mlflow, is_remote=remote_run)
mlflow.end_run()

# Average the metrics over the different random seeds
avg_metric = dict(pd.DataFrame(test_metrics).mean())
if log_to_mlflow:
def main(): arg = args() if not os.path.exists(arg.exp_name): os.makedirs(arg.exp_name) assert arg.exp_name.split( '/')[0] == 'o', "'o' is the directory of experiment, --exp_name o/..." output_dir = arg.exp_name save_scripts_in_exp_dir(output_dir) logger = logging_set(output_dir) logger.info( '\n================ experient name:[{}] ===================\n'.format( arg.exp_name)) os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True np.random.seed(0) torch.manual_seed(0) config = edict(yaml.load(open(arg.cfg, 'r'))) if arg.search: assert arg.search in [ 'None', 'sync', 'random', 'second_order_gradient', 'first_order_gradient' ] config.train.arch_search_strategy = arg.search if arg.batchsize: logger.info("update batchsize to {}".format(arg.batchsize)) config.train.batchsize = arg.batchsize config.num_workers = arg.num_workers print( 'GPU memory : \ntotal | used\n', os.popen( 'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader' ).read()) logger.info( '------------------------------ configuration ---------------------------' ) logger.info( '\n==> available {} GPUs , use numbers are {} device is {}\n'.format( torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"], torch.cuda.current_device())) # torch.cuda._initialized = True logger.info(pprint.pformat(config)) logger.info( '------------------------------- -------- ----------------------------' ) criterion = MSELoss() Arch = bulid_up_network(config, criterion) if config.train.arch_search_strategy == 'random': logger.info("==>random seed is {}".format(config.train.random_seed)) np.random.seed(config.train.random_seed) torch.manual_seed(config.train.random_seed) Arch.arch_parameters_random_search() if arg.param_flop: Arch._print_info() # dump_input = torch.rand((1,3,128,128)) # graph = SummaryWriter(output_dir+'/log') # graph.add_graph(Arch, (dump_input, )) if len(arg.gpu) > 1: use_multi_gpu = True Arch = torch.nn.DataParallel(Arch).cuda() else: use_multi_gpu = False Arch = Arch.cuda() Search = Search_Arch(Arch.module, config) if use_multi_gpu else Search_Arch( Arch, config) # Arch.module for nn.DataParallel search_strategy = config.train.arch_search_strategy train_queue, arch_queue, valid_queue = Dataloaders(search_strategy, config, arg) #Note: if the search strategy is `None` or `SYNC`, the arch_queue is None! 
    logger.info("\nNeural Architecture Search strategy is {}".format(search_strategy))
    assert search_strategy in ['first_order_gradient', 'random', 'None',
                               'second_order_gradient', 'sync']

    if search_strategy == 'sync':
        # arch_parameters are also registered as model parameters,
        # so the weight optimizer updates them synchronously
        logger.info("sync: The arch_parameters are also optimized by the "
                    "weight-optimizer synchronously")
        optimizer = torch.optim.Adam(
            Arch.parameters(),
            lr=config.train.w_lr_cosine_begin,
        )
    else:
        # if the search strategy is None, random, second_order_gradient, etc.,
        # the arch_parameters are filtered out of the weight-optimizer
        optimizer = torch.optim.Adam(
            filter_arch_parameters(Arch),
            lr=config.train.w_lr_cosine_begin,
        )
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
    #                                             step_size=config.train.lr_step_size,
    #                                             gamma=config.train.lr_decay_gamma)
    if config.train.scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.train.LR_STEP, config.train.LR_FACTOR)
    elif config.train.scheduler_name == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=config.train.epoch_end,
            eta_min=config.train.w_lr_cosine_end)

    # best_result
    best = 0
    logger.info("\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= training +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+==")
    begin, end = config.train.epoch_begin, config.train.epoch_end
    if arg.load_ckpt:
        if use_multi_gpu:
            begin, best = load_ckpt(Arch.module, optimizer, scheduler,
                                    output_dir, logger)
        else:
            begin, best = load_ckpt(Arch, optimizer, scheduler,
                                    output_dir, logger)

    for epoch in range(begin, end):
        lr = scheduler.get_lr()[0]
        logger.info('==>time:({})--training...... current learning rate is {:.7f}'
                    .format(datetime.datetime.now(), lr))
        train(epoch, train_queue, arch_queue, Arch, Search, criterion,
              optimizer, lr, search_strategy, output_dir, logger, config, arg)
        scheduler.step()

        eval_results = evaluate(Arch, valid_queue, config, output_dir)
        if use_multi_gpu:
            best = save_model(epoch, best, eval_results, Arch.module,
                              optimizer, scheduler, output_dir, logger)
        else:
            best = save_model(epoch, best, eval_results, Arch,
                              optimizer, scheduler, output_dir, logger)

        ## visualize_heatamp
        if arg.visualize and epoch % 5 == 0:
            for i in range(len(valid_queue.dataset)):
                if valid_queue.dataset[i][1] != 185250:  # choose an image_id
                    continue
                print(valid_queue.dataset[i][1])
                sample = valid_queue.dataset[i]
                img = sample[0].unsqueeze(0)
                # samples = next(iter(valid_dataloader))
                # img = samples[0]
                output = Arch(img)
                print(img.size(), output.size())
                visualize_heatamp(img, output, 'heatmaps', show_img=False)
                break