def valid(valid_queue, model, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    with torch.no_grad():
        model.eval()
        for step, (input, target) in enumerate(valid_queue):
            input = input.cuda()
            target = target.cuda()
            logits, _ = model(input)
            loss = criterion(logits, target)
            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            objs.update(loss.item(), n)
            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)
            if (step + 1) % 100 == 0:
                logging.info('valid %03d %e %f %f', step + 1, objs.avg, top1.avg, top5.avg)
    return top1.avg, top5.avg, objs.avg
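All of the routines in this section lean on two small helpers from utils: AvgrageMeter (a running, count-weighted average) and accuracy (top-k precision in percent). Neither is included here, so the sketch below is reconstructed from how they are called and is only an assumption about their implementation.

class AvgrageMeter(object):
    """Running weighted average, as assumed by the objs/top1/top5 meters above."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.avg, self.sum, self.cnt = 0.0, 0.0, 0

    def update(self, val, n=1):
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt


def accuracy(output, target, topk=(1,)):
    """Top-k precision in percent, matching calls like utils.accuracy(logits, target, topk=(1, 5))."""
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res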
def infer(valid_queue, model, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.eval()
    with torch.no_grad():
        for step, (input, target) in enumerate(valid_queue):
            input = input.cuda()
            target = target.cuda(non_blocking=True)
            logits = model(input)
            loss = criterion(logits, target)
            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            objs.update(loss.item(), n)
            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)
            if step % args.report_freq == 0:
                logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)
    return top1.avg, objs.avg
def train(train_queue, valid_queue, model, architect, criterion, optimizer, lr):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    for step, (input, target) in tqdm(enumerate(train_queue)):
        model.train()
        n = input.size(0)
        input = input.cuda()
        target = target.cuda(non_blocking=True)

        # get a random minibatch from the search queue with replacement
        input_search, target_search = next(iter(valid_queue))
        input_search = input_search.cuda()
        target_search = target_search.cuda(non_blocking=True)

        architect.step(input, target, input_search, target_search, lr, optimizer,
                       unrolled=args.unrolled)

        optimizer.zero_grad()
        logits = model(input)
        loss = criterion(logits, target)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)

        if step % args.report_freq == 0:
            logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)
    return top1.avg, objs.avg
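The loop above (and several later ones) draws the architecture-update minibatch with next(iter(valid_queue)), which rebuilds the DataLoader iterator, including its worker processes, on every step; the later snippets instead cache the iterator and only recreate it when it runs out. A small stand-alone helper capturing that cached-iterator pattern (the name infinite_loader is hypothetical, not taken from the original code):

def infinite_loader(queue):
    """Yield minibatches from a DataLoader forever, recreating the iterator only when it is exhausted."""
    it = iter(queue)
    while True:
        try:
            yield next(it)
        except StopIteration:
            it = iter(queue)
            yield next(it)

# usage:
#   valid_gen = infinite_loader(valid_queue)
#   input_search, target_search = next(valid_gen)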
def infer(valid_queue, model, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.eval()
    for step, (input, target) in enumerate(valid_queue):
        input = input.cuda()
        target = target.cuda(non_blocking=True)
        with torch.no_grad():
            logits = model(input)
            loss = criterion(logits, target)
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        n = input.size(0)
        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)
        if step % args.report_freq == 0:
            logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)
    return top1.avg, objs.avg
def infer(valid_queue, model, criterion, bin_op, report_freq):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.eval()
    bin_op.binarization()
    with torch.no_grad():
        for step, (input, target) in enumerate(valid_queue):
            input = input.cuda()
            target = target.cuda()
            logits = model(input)
            loss = criterion(logits, target)
            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            objs.update(loss.item(), n)
            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)
            # if step % report_freq == 0:
            #     print("Step: {}, Top1: {}, Top5: {}".format(step, top1.avg, top5.avg))
    bin_op.restore()
    return top1.avg, objs.avg
def infer(test_queue, model, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.eval()
    with torch.no_grad():
        for step, (input, target) in enumerate(test_queue):
            input = input.to(device)
            target = target.cuda(non_blocking=True)
            logits, _ = model(input)
            loss = criterion(logits, target)
            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            objs.update(loss.item(), n)
            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)
            if step % args.report_freq == 0:
                logging.info('test %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)
    return top1.avg, objs.avg
def traingraft(args, epoch, train_data, device, rootmodel, graftmodel, criterion,
               optimizer, scheduler, supernet, choice=None):
    rootmodel.eval()
    graftmodel.train()
    train_loss = 0.0
    top1 = utils.AvgrageMeter()
    train_data = tqdm(train_data)
    eps = args.epochs
    if supernet == 'supernet':
        if choice is not None:
            eps = 50
    train_data.set_description('[%s%04d/%04d %s%f]' %
                               ('Epoch:', epoch + 1, eps, 'lr:', scheduler.get_lr()[0]))
    for step, (inputs, targets) in enumerate(train_data):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        if supernet == 'supernet':
            if choice is None:
                choice = utils.random_choice(args.num_choices, args.layers)
            outputs = graftmodel(inputs, choice)
        else:
            outputs = graftmodel(inputs)
        loss = criterion(outputs, targets)
        # if args.dataset == 'cifar10':
        loss.backward()
        # elif args.dataset == 'imagenet':
        #     with amp.scale_loss(loss, optimizer) as scaled_loss:
        #         scaled_loss.backward()
        optimizer.step()
        # graftmodel.move_to_cpu(choice)
        prec1, prec5 = utils.accuracy(outputs, targets, topk=(1, 5))
        n = inputs.size(0)
        top1.update(prec1.item(), n)
        train_loss += loss.item()
        postfix = {'train_loss': '%.6f' % (train_loss / (step + 1)),
                   'train_acc': '%.6f' % top1.avg}
        train_data.set_postfix(log=postfix)
def validate(args, epoch, val_data, device, model, criterion, supernet=False, choice=None):
    model.eval()
    val_loss = 0.0
    val_top1 = utils.AvgrageMeter()
    with torch.no_grad():
        for step, (inputs, targets) in enumerate(val_data):
            inputs, targets = inputs.to(device), targets.to(device)
            if supernet:
                if choice is None:
                    choice = utils.random_choice(args.num_choices, args.layers)
                outputs = model(inputs, choice)
            else:
                outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
            prec1, prec5 = utils.accuracy(outputs, targets, topk=(1, 5))
            n = inputs.size(0)
            val_top1.update(prec1.item(), n)
        print('[Val_Accuracy epoch:%d] val_loss:%f, val_acc:%f' %
              (epoch + 1, val_loss / (step + 1), val_top1.avg))
    return val_top1.avg
def fitness(self, model): # fit = self.population.sum(axis=1).sum(axis=1)#np.random.sample(self.population_number) # print(fit.sum(),np.array( [[seq_creater() for i in range(self.chromosome_s)] for i in range(self.population_number)]).sum(axis=1).sum(axis=1).sum()) total_reward = utils.AvgrageMeter() fit = [] for step, dag in enumerate(self.population): #print(dag) data, target = self.dataloader.next_batch() n = data.size(0) data = data.cuda() target = target.cuda() with torch.no_grad(): logits, aux = model(dag.tolist(), data) #print(dag.tolist()) reward = utils.accuracy(logits, target)[0] fit.append(reward.item()) total_reward.update(reward.item(), n) self.score = np.array(fit) print(self.score.mean()) return np.array(fit)
def train(train_queue, model, criterion, optimizer): F_objs = utils.AvgrageMeter() F_top1 = utils.AvgrageMeter() F_top5 = utils.AvgrageMeter() H_objs = utils.AvgrageMeter() H_top1 = utils.AvgrageMeter() H_top5 = utils.AvgrageMeter() model.train() for step, (input, target) in enumerate(train_queue): n = input.size(0) input = input.cuda() target = target.cuda() optimizer.zero_grad() F_out, H_out = model(input) loss_1 = criterion(F_out, target) loss_2 = criterion(H_out, target) loss = loss_1 + loss_2 loss.backward() optimizer.step() prec1, prec5 = utils.accuracy(F_out, target, topk=(1, 5)) F_objs.update(loss_1.item(), n) F_top1.update(prec1.item(), n) F_top5.update(prec5.item(), n) prec1, prec5 = utils.accuracy(H_out, target, topk=(1, 5)) H_objs.update(loss_2.item(), n) H_top1.update(prec1.item(), n) H_top5.update(prec5.item(), n) if step % args.report_freq == 0: logging.info('train %03d %e %f %f', step, F_objs.avg, F_top1.avg, F_top5.avg) logging.info('train %03d %e %f %f', step, H_objs.avg, H_top1.avg, H_top5.avg) return F_top1.avg, F_top5.avg, H_top1.avg, H_top5.avg, F_objs.avg
def infer(valid_queue, model_1, model_2, criterion):
    objs_1 = utils.AvgrageMeter()
    objs_2 = utils.AvgrageMeter()
    top1_1 = utils.AvgrageMeter()
    top5_1 = utils.AvgrageMeter()
    top1_2 = utils.AvgrageMeter()
    top5_2 = utils.AvgrageMeter()
    model_1.eval()
    model_2.eval()
    with torch.no_grad():
        for step, (input, target) in enumerate(valid_queue):
            input = input.cuda()
            target = target.cuda(non_blocking=True)
            logits_1 = model_1(input)
            logits_2 = model_2(input)
            loss_1 = criterion(logits_1, target)
            loss_2 = criterion(logits_2, target)
            prec1_1, prec5_1 = utils.accuracy(logits_1, target, topk=(1, 5))
            prec1_2, prec5_2 = utils.accuracy(logits_2, target, topk=(1, 5))
            n = input.size(0)
            objs_1.update(loss_1.item(), n)
            objs_2.update(loss_2.item(), n)
            top1_1.update(prec1_1.item(), n)
            top5_1.update(prec5_1.item(), n)
            top1_2.update(prec1_2.item(), n)
            top5_2.update(prec5_2.item(), n)
            if step % args.report_freq == 0:
                logging.info('Valid %03d %e %e %f %f %f %f', step, objs_1.avg, objs_2.avg,
                             top1_1.avg, top5_1.avg, top1_2.avg, top5_2.avg)
    return top1_1.avg, objs_1.avg, top1_2.avg, objs_2.avg
def infer(valid_queue, model, model1, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    objs1 = utils.AvgrageMeter()
    top1_1 = utils.AvgrageMeter()
    top5_1 = utils.AvgrageMeter()
    model.eval()
    model1.eval()
    with torch.no_grad():
        for step, (input, target) in enumerate(valid_queue):
            input = input.cuda()
            target = target.cuda(non_blocking=True)
            logits = model(input)
            loss = criterion(logits, target)
            logits1 = model1(input)
            loss1 = criterion(logits1, target)
            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            objs.update(loss.item(), n)
            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)
            prec1, prec5 = utils.accuracy(logits1, target, topk=(1, 5))
            objs1.update(loss1.item(), n)
            top1_1.update(prec1.item(), n)
            top5_1.update(prec5.item(), n)
            if step % args.report_freq == 0:
                logging.info('valid 1st %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)
                logging.info('valid 2nd %03d %e %f %f', step, objs1.avg, top1_1.avg, top5_1.avg)
    return top1.avg, objs.avg, top1_1.avg, objs1.avg
def infer(valid_queue, model, criterion): F_objs = utils.AvgrageMeter() F_top1 = utils.AvgrageMeter() F_top5 = utils.AvgrageMeter() H_objs = utils.AvgrageMeter() H_top1 = utils.AvgrageMeter() H_top5 = utils.AvgrageMeter() model.eval() with torch.no_grad(): for step, (input, target) in enumerate(valid_queue): input = input.cuda() target = target.cuda() F_out, H_out = model(input) loss_1 = criterion(F_out, target) loss_2 = criterion(H_out, target) n = input.size(0) prec1, prec5 = utils.accuracy(F_out, target, topk=(1, 5)) F_objs.update(loss_1.item(), n) F_top1.update(prec1.item(), n) F_top5.update(prec5.item(), n) prec1, prec5 = utils.accuracy(H_out, target, topk=(1, 5)) H_objs.update(loss_2.item(), n) H_top1.update(prec1.item(), n) H_top5.update(prec5.item(), n) if step % args.report_freq == 0: logging.info('valid %03d %e %f %f', step, F_objs.avg, F_top1.avg, F_top5.avg) logging.info('valid %03d %e %f %f', step, H_objs.avg, H_top1.avg, H_top5.avg) return F_top1.avg, F_top5.avg, H_top1.avg, H_top5.avg, F_objs.avg
def train(train_queue, valid_queue, model, architect, criterion, optimizer, lr,
          low_flops, high_flops, backbone, tau):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    hardware_pool = [i for i in range(low_flops, high_flops, 5)]
    hardware_index = 0
    for step, (input, target) in enumerate(train_queue):
        model.train()
        n = input.size(0)
        input = input.cuda()
        target = target.cuda(non_blocking=True)

        # get a random minibatch from the search queue with replacement
        input_search, target_search = next(iter(valid_queue))
        input_search = input_search.cuda()
        target_search = target_search.cuda(non_blocking=True)

        # sample a hardware (FLOPs) target around the current pool entry
        target_hc = torch.tensor(hardware_pool[hardware_index] + 3 * (random.random() - 0.5),
                                 dtype=torch.float32).view(-1, 1)
        target_hc = target_hc.cuda()
        logger.info("Target hc : {}".format(target_hc.item()))
        backbone = backbone.cuda()
        normalized_target_hc = min_max_normalize(high_flops, low_flops, target_hc)
        arch_param = generator(backbone, normalized_target_hc)
        arch_param = arch_param.reshape(-1, arch_param.size(-1))
        alphas_normal = F.gumbel_softmax(arch_param[0], tau=tau, dim=-1)
        alphas_reduce = F.gumbel_softmax(arch_param[1], tau=tau, dim=-1)
        gen_hc = lookup_table.get_model_macs(alphas_normal, alphas_reduce)
        logger.info("Generator hc : {}".format(gen_hc))
        hc_loss = cal_hc_loss(gen_hc.cuda(), target_hc.item(), ALPHA, LOSS_PENALTY)
        hardware_index += 1
        if hardware_index == len(hardware_pool):
            hardware_index = 0
            random.shuffle(hardware_pool)

        # architect.step(input, target, input_search, target_search, lr, optimizer, unrolled=args.unrolled)
        # generator update (g_optimizer is assumed to be defined in the enclosing scope)
        g_optimizer.zero_grad()
        g_loss = model._loss(input_search, target_search)
        loss = g_loss + hc_loss
        loss.backward()
        g_optimizer.step()
        # =========================================================================

        # weight update
        optimizer.zero_grad()
        logits = model(input)
        loss = criterion(logits, target)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)

        if step % args.report_freq == 0:
            logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)
    return top1.avg, objs.avg
def train(train_queue, valid_queue, external_queue, model, model1, architect, criterion, optimizer, optimizer1, lr, lr1): objs = utils.AvgrageMeter() top1 = utils.AvgrageMeter() top5 = utils.AvgrageMeter() objs1 = utils.AvgrageMeter() top1_1 = utils.AvgrageMeter() top5_1 = utils.AvgrageMeter() valid_queue_iter = iter(valid_queue) external_queue_iter = iter(external_queue) for step, (input, target) in enumerate(train_queue): model.train() model1.train() n = input.size(0) input = input.cuda() target = target.cuda(non_blocking=True) # get a random minibatch from the search queue with replacement try: input_search, target_search = next(valid_queue_iter) except: valid_queue_iter = iter(valid_queue) input_search, target_search = next(valid_queue_iter) try: input_external, target_external = next(external_queue_iter) except: external_queue_iter = iter(external_queue) input_external, target_external = next(external_queue_iter) # input_external, target_external = next(iter(external_queue)) # input_search, target_search = next(iter(valid_queue)) input_search = input_search.cuda() target_search = target_search.cuda(non_blocking=True) input_external = input_external.cuda() target_external = target_external.cuda(non_blocking=True) # import ipdb; ipdb.set_trace() architect.step(input, target, input_external, target_external, input_search, target_search, lr, lr1, optimizer, optimizer1, unrolled=args.unrolled) optimizer.zero_grad() optimizer1.zero_grad() logits = model(input) logits1 = model1(input) loss = criterion(logits, target) loss1 = criterion(logits1, target) external_out = model(input_external) external_out1 = model1(input_external) if args.debug: with torch.no_grad(): softlabel_other = F.softmax(external_out, 1) softlabel_other = softlabel_other.detach() else: softlabel_other = F.softmax(external_out, 1) loss_soft = softXEnt(external_out1, softlabel_other) if args.debug: with torch.no_grad(): softlabel_other1 = F.softmax(external_out1, 1) softlabel_other1 = softlabel_other1.detach() else: softlabel_other1 = F.softmax(external_out1, 1) loss_soft1 = softXEnt(external_out, softlabel_other1) loss_all = loss + loss1 + args.weight_lambda * (loss_soft1 + loss_soft) loss_all.backward() nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) nn.utils.clip_grad_norm_(model1.parameters(), args.grad_clip) optimizer.step() optimizer1.step() prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) objs.update(loss.item(), n) top1.update(prec1.item(), n) top5.update(prec5.item(), n) prec1, prec5 = utils.accuracy(logits1, target, topk=(1, 5)) objs1.update(loss1.item(), n) top1_1.update(prec1.item(), n) top5_1.update(prec5.item(), n) if step % args.report_freq == 0: logging.info('train 1st %03d %e %f %f', step, objs.avg, top1.avg, top5.avg) logging.info('train 2nd %03d %e %f %f', step, objs1.avg, top1_1.avg, top5_1.avg) return top1.avg, objs.avg, top1_1.avg, objs1.avg
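The mutual-learning loop above depends on a softXEnt helper that is not shown in this section. From the way loss_soft and loss_soft1 are built (the targets are softmax outputs of the other model), it is assumed to be cross entropy against soft targets; a minimal sketch under that assumption:

import torch.nn.functional as F

def softXEnt(logits, soft_targets):
    """Cross entropy with soft targets, averaged over the batch.

    `soft_targets` is a probability distribution (e.g. F.softmax of another
    model's logits), matching how loss_soft / loss_soft1 are computed above.
    """
    logprobs = F.log_softmax(logits, dim=1)
    return -(soft_targets * logprobs).sum(dim=1).mean()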
def train(train_queue, valid_queue, model, network_params, criterion, optimizer, optimizer_a, lr, train_arch=True): objs = utils.AvgrageMeter() top1 = utils.AvgrageMeter() top5 = utils.AvgrageMeter() global baseline for step, (input, target) in enumerate(train_queue): model.train() n = input.size(0) input = input.cuda() target = target.cuda(non_blocking=True) # if step % 10 ==0: # 每10个batch ,进行一次RL , 一次 采10次网络 if 1: # 每10个batch ,进行一次RL , 一次 采10次网络 if train_arch: # In the original implementation of DARTS, it is input_search, target_search = next(iter(valid_queue), which slows down # the training when using PyTorch 0.4 and above. try: input_search, target_search = next(valid_queue_iter) except: valid_queue_iter = iter(valid_queue) input_search, target_search = next(valid_queue_iter) input_search = input_search.cuda() target_search = target_search.cuda(non_blocking=True) normal_grad_buffer = [] reduce_grad_buffer = [] reward_buffer = [] for batch_idx in range(rl_batch_size): # 多采集几个网络,测试 # sample the submodel get_cur_model(model) # cur_sub_model.cuda() # cur_sub_model.drop_path_prob = 0 # validat the sub_model with torch.no_grad(): # logits, _ = cur_sub_model(input_search) logits = model(input_search) prec1, _ = utils.accuracy(logits, target_search, topk=(1, 5)) if model.module._arch_parameters[0].grad is not None: model.module._arch_parameters[0].grad.data.zero_() if model.module._arch_parameters[1].grad is not None: model.module._arch_parameters[1].grad.data.zero_() obj_term = 0 for i in range(14): obj_term = obj_term + model.module.normal_log_prob[i] obj_term = obj_term + model.module.reduce_log_prob[i] loss_term = -obj_term # backward loss_term.backward() # take out gradient dict normal_grad_list = [] reduce_grad_list = [] normal_grad_buffer.append( model.module._arch_parameters[0].grad.data.clone()) reduce_grad_buffer.append( model.module._arch_parameters[1].grad.data.clone()) reward_buffer.append(prec1) avg_reward = sum(reward_buffer) / rl_batch_size if baseline == 0: baseline = avg_reward else: baseline += baseline_decay_weight * (avg_reward - baseline) # for idx in range(14): model.module._arch_parameters[0].grad.data.zero_() model.module._arch_parameters[1].grad.data.zero_() for j in range(rl_batch_size): model.module._arch_parameters[0].grad.data += ( reward_buffer[j] - baseline) * normal_grad_buffer[j] model.module._arch_parameters[1].grad.data += ( reward_buffer[j] - baseline) * reduce_grad_buffer[j] model.module._arch_parameters[0].grad.data /= rl_batch_size model.module._arch_parameters[1].grad.data /= rl_batch_size # apply gradients optimizer_a.step() logging.info( 'REINFORCE [step %d]\t\tMean Reward %.4f\tBaseline %d', step, avg_reward, baseline) model.module.restore_super_net() # print(model.module._arch_parameters[0]) # print(model.module._arch_parameters[1]) if not train_arch: # if 0: optimizer.zero_grad() logits = model(input) loss = criterion(logits, target) loss.backward() nn.utils.clip_grad_norm_(network_params, args.grad_clip) optimizer.step() prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) objs.update(loss.data.item(), n) top1.update(prec1.data.item(), n) top5.update(prec5.data.item(), n) if step % args.report_freq == 0: logging.info('TRAIN Step: %03d Objs: %e R1: %f R5: %f', step, objs.avg, top1.avg, top5.avg) else: top1.avg = 0 objs.avg = 0 return top1.avg, objs.avg
def train(train_queue, model, cnn_optimizer, grad_scalar, global_step, warmup_iters, writer, logging): alpha_i = utils.kl_balancer_coeff(num_scales=model.num_latent_scales, groups_per_scale=model.groups_per_scale, fun='square') nelbo = utils.AvgrageMeter() model.train() for step, x in enumerate(train_queue): x = x[0] if len(x) > 1 else x x = x.half().cuda() # change bit length x = utils.pre_process(x, args.num_x_bits) # warm-up lr if global_step < warmup_iters: lr = args.learning_rate * float(global_step) / warmup_iters for param_group in cnn_optimizer.param_groups: param_group['lr'] = lr # sync parameters, it may not be necessary if step % 100 == 0: utils.average_params(model.parameters(), args.distributed) cnn_optimizer.zero_grad() with autocast(): logits, log_q, log_p, kl_all, kl_diag = model(x) output = model.decoder_output(logits) kl_coeff = utils.kl_coeff( global_step, args.kl_anneal_portion * args.num_total_iter, args.kl_const_portion * args.num_total_iter, args.kl_const_coeff) recon_loss = utils.reconstruction_loss(output, x, crop=model.crop_output) balanced_kl, kl_coeffs, kl_vals = utils.kl_balancer( kl_all, kl_coeff, kl_balance=True, alpha_i=alpha_i) nelbo_batch = recon_loss + balanced_kl loss = torch.mean(nelbo_batch) norm_loss = model.spectral_norm_parallel() bn_loss = model.batchnorm_loss() # get spectral regularization coefficient (lambda) if args.weight_decay_norm_anneal: assert args.weight_decay_norm_init > 0 and args.weight_decay_norm > 0, 'init and final wdn should be positive.' wdn_coeff = (1. - kl_coeff) * np.log( args.weight_decay_norm_init) + kl_coeff * np.log( args.weight_decay_norm) wdn_coeff = np.exp(wdn_coeff) else: wdn_coeff = args.weight_decay_norm loss += norm_loss * wdn_coeff + bn_loss * wdn_coeff grad_scalar.scale(loss).backward() utils.average_gradients(model.parameters(), args.distributed) grad_scalar.step(cnn_optimizer) grad_scalar.update() nelbo.update(loss.data, 1) if (global_step + 1) % 100 == 0: if (global_step + 1) % 1000 == 0: # reduced frequency n = int(np.floor(np.sqrt(x.size(0)))) x_img = x[:n * n] output_img = output.mean if isinstance( output, torch.distributions.bernoulli.Bernoulli ) else output.sample() output_img = output_img[:n * n] x_tiled = utils.tile_image(x_img, n) output_tiled = utils.tile_image(output_img, n) in_out_tiled = torch.cat((x_tiled, output_tiled), dim=2) writer.add_image('reconstruction', in_out_tiled, global_step) # norm writer.add_scalar('train/norm_loss', norm_loss, global_step) writer.add_scalar('train/bn_loss', bn_loss, global_step) writer.add_scalar('train/norm_coeff', wdn_coeff, global_step) utils.average_tensor(nelbo.avg, args.distributed) logging.info('train %d %f', global_step, nelbo.avg) writer.add_scalar('train/nelbo_avg', nelbo.avg, global_step) writer.add_scalar( 'train/lr', cnn_optimizer.state_dict()['param_groups'][0]['lr'], global_step) writer.add_scalar('train/nelbo_iter', loss, global_step) writer.add_scalar('train/kl_iter', torch.mean(sum(kl_all)), global_step) writer.add_scalar( 'train/recon_iter', torch.mean( utils.reconstruction_loss(output, x, crop=model.crop_output)), global_step) writer.add_scalar('kl_coeff/coeff', kl_coeff, global_step) total_active = 0 for i, kl_diag_i in enumerate(kl_diag): utils.average_tensor(kl_diag_i, args.distributed) num_active = torch.sum(kl_diag_i > 0.1).detach() total_active += num_active # kl_ceoff writer.add_scalar('kl/active_%d' % i, num_active, global_step) writer.add_scalar('kl_coeff/layer_%d' % i, kl_coeffs[i], global_step) writer.add_scalar('kl_vals/layer_%d' % i, 
kl_vals[i], global_step) writer.add_scalar('kl/total_active', total_active, global_step) global_step += 1 utils.average_tensor(nelbo.avg, args.distributed) return nelbo.avg, global_step
def train_and_evaluate_top_on_imagenet(archs, train_queue, valid_queue): res = [] train_criterion = nn.CrossEntropyLoss().cuda() eval_criterion = nn.CrossEntropyLoss().cuda() objs = utils.AvgrageMeter() top1 = utils.AvgrageMeter() top5 = utils.AvgrageMeter() for i, arch in enumerate(archs): objs.reset() top1.reset() top5.reset() logging.info('Train and evaluate the {} arch'.format(i + 1)) model = NASNetworkImageNet(args, 1000, args.child_layers, args.child_nodes, args.child_channels, 1.0, 1.0, True, args.steps, arch) model = model.cuda() model.train() optimizer = torch.optim.SGD( model.parameters(), args.child_lr, momentum=0.9, weight_decay=args.child_l2_reg, ) for step, (input, target) in enumerate(train_queue): input = input.cuda().requires_grad_() target = target.cuda() optimizer.zero_grad() # sample an arch to train logits, aux_logits = model(input, step) loss = train_criterion(logits, target) if aux_logits is not None: aux_loss = train_criterion(aux_logits, target) loss += 0.4 * aux_loss loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.child_grad_bound) optimizer.step() prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) n = input.size(0) objs.update(loss.data, n) top1.update(prec1.data, n) top5.update(prec5.data, n) if (step + 1) % 100 == 0: logging.info('Train %03d loss %e top1 %f top5 %f', step + 1, objs.avg, top1.avg, top5.avg) if step + 1 == 500: break objs.reset() top1.reset() top5.reset() with torch.no_grad(): model.eval() for step, (input, target) in enumerate(valid_queue): input = input.cuda() target = target.cuda() logits, _ = model(input) loss = eval_criterion(logits, target) prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) n = input.size(0) objs.update(loss.data, n) top1.update(prec1.data, n) top5.update(prec5.data, n) if (step + 1) % 100 == 0: logging.info('valid %03d %e %f %f', step + 1, objs.avg, top1.avg, top5.avg) res.append(top1.avg) return res
def train(train_queue, valid_queue, model, architect, criterion, optimizer, lr, epoch, grad_clip, report_lines, unrolled, criterion_weight=1.0, l1_weight=-1, l2_weight=-1): objs = utils.AvgrageMeter() top1 = utils.AvgrageMeter() top5 = utils.AvgrageMeter() criterion_loss = torch.zeros(1) l1_loss = torch.zeros(1) l2_loss = torch.zeros(1) for step, (input, target) in enumerate(train_queue): model.train() n = input.size(0) input = input.cuda() target = target.cuda(non_blocking=True) # get a random minibatch from the search queue with replacement # input_search, target_search = next(iter(valid_queue)) try: input_search, target_search = next(valid_queue_iter) except: valid_queue_iter = iter(valid_queue) input_search, target_search = next(valid_queue_iter) input_search = input_search.cuda() target_search = target_search.cuda(non_blocking=True) if epoch >= 15: architect.step(input, target, input_search, target_search, lr, optimizer, unrolled=unrolled) optimizer.zero_grad() logits = model(input) criterion_loss = criterion(logits, target) loss = criterion_weight * criterion_loss if l1_weight >= 0: l1_loss = param_loss(model, nn.L1Loss(reduction='sum')) loss += l1_weight * l1_loss if l2_weight >= 0: l2_loss = param_loss(model, nn.MSELoss(reduction='sum')) loss += l2_weight * l2_loss loss.backward() nn.utils.clip_grad_norm_(model.parameters(), grad_clip) optimizer.step() prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) objs.update(loss.data.item(), n) top1.update(prec1.data.item(), n) top5.update(prec5.data.item(), n) if step % (len(train_queue) // report_lines) == 0: log.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg) return top1.avg, objs.avg, l1_loss, l2_loss, criterion_loss
def train(self, epoch, logging): objs = utils.AvgrageMeter() top1 = utils.AvgrageMeter() top5 = utils.AvgrageMeter() grad = utils.AvgrageMeter() normal_resource_gradient = 0 reduce_resource_gradient = 0 normal_loss_gradient = 0 reduce_loss_gradient = 0 normal_total_gradient = 0 reduce_total_gradient = 0 loss_alpha = None count = 0 for step, (input, target) in enumerate(self.train_queue): if self.args.alternate_update: if step % 2 == 0: self.update_theta = True self.update_alpha = False else: self.update_theta = False self.update_alpha = True n = input.size(0) input = input.to(self.device) target = target.to(self.device, non_blocking=True) if self.args.snas: logits, logits_aux, penalty, op_normal, op_reduce = self.model( input) error_loss = self.criterion(logits, target) if self.args.auxiliary: loss_aux = self.criterion(logits_aux, target) error_loss += self.args.auxiliary_weight * loss_aux if self.args.dsnas: logits, error_loss, loss_alpha, penalty = self.model( input, target, self.criterion) num_normal = self.model.num_normal num_reduce = self.model.num_reduce normal_arch_entropy = self.model._arch_entropy( self.model.normal_log_alpha) reduce_arch_entropy = self.model._arch_entropy( self.model.reduce_log_alpha) if self.args.resource_efficient: if self.args.method == 'policy_gradient': resource_penalty = (penalty[2]) / 6 + self.args.ratio * ( penalty[7]) / 2 log_resource_penalty = ( penalty[35]) / 6 + self.args.ratio * (penalty[36]) / 2 elif self.args.method == 'reparametrization': resource_penalty = (penalty[26]) / 6 + self.args.ratio * ( penalty[25]) / 2 log_resource_penalty = ( penalty[37]) / 6 + self.args.ratio * (penalty[38]) / 2 elif self.args.method == 'discrete': resource_penalty = (penalty[28]) / 6 + self.args.ratio * ( penalty[27]) / 2 log_resource_penalty = ( penalty[39]) / 6 + self.args.ratio * (penalty[40]) / 2 elif self.args.method == 'none': # TODo resource_penalty = torch.zeros(1).cuda() log_resource_penalty = torch.zeros(1).cuda() else: logging.info( "wrongly input of method, please re-enter --method from 'policy_gradient', 'discrete', " "'reparametrization', 'none'") sys.exit(1) else: resource_penalty = torch.zeros(1).cuda() log_resource_penalty = torch.zeros(1).cuda() if self.args.log_penalty: resource_loss = self.model._resource_lambda * log_resource_penalty else: resource_loss = self.model._resource_lambda * resource_penalty if self.args.loss: if self.args.snas: loss = resource_loss.clone() + error_loss.clone() elif self.args.dsnas: loss = resource_loss.clone() else: loss = resource_loss.clone() + -child_coef * ( torch.log(normal_one_hot_prob) + torch.log(reduce_one_hot_prob)).sum() else: if self.args.snas or self.args.dsnas: loss = error_loss.clone() if self.args.distributed: loss.div_(self.world_size) error_loss.div_(self.world_size) resource_loss.div_(self.world_size) if self.args.dsnas: loss_alpha.div_(self.world_size) # logging gradient count += 1 if self.args.resource_efficient: self.optimizer.zero_grad() self.arch_optimizer.zero_grad() resource_loss.backward(retain_graph=True) if not self.args.random_sample: normal_resource_gradient += self.model.normal_log_alpha.grad reduce_resource_gradient += self.model.reduce_log_alpha.grad if self.args.snas: self.optimizer.zero_grad() self.arch_optimizer.zero_grad() error_loss.backward(retain_graph=True) if not self.args.random_sample: normal_loss_gradient += self.model.normal_log_alpha.grad reduce_loss_gradient += self.model.reduce_log_alpha.grad self.optimizer.zero_grad() self.arch_optimizer.zero_grad() if self.args.snas 
or not self.args.random_sample and not self.args.dsnas: loss.backward() if not self.args.random_sample: normal_total_gradient += self.model.normal_log_alpha.grad reduce_total_gradient += self.model.reduce_log_alpha.grad if self.args.distributed: reduce_tensorgradients(self.model.parameters(), sync=True) nn.utils.clip_grad_norm_([ param for name, param in self.model.named_parameters() if name != 'normal_log_alpha' and name != 'reduce_log_alpha' ], self.args.grad_clip) arch_grad_norm = nn.utils.clip_grad_norm_([ param for name, param in self.model.named_parameters() if name == 'normal_log_alpha' or name == 'reduce_log_alpha' ], 10.) else: nn.utils.clip_grad_norm_(self.model.parameters(), self.args.grad_clip) arch_grad_norm = nn.utils.clip_grad_norm_( self.model.arch_parameters(), 10.) grad.update(arch_grad_norm) if not self.args.fix_weight and self.update_theta: self.optimizer.step() self.optimizer.zero_grad() if not self.args.random_sample and self.update_alpha: self.arch_optimizer.step() self.arch_optimizer.zero_grad() if self.rank == 0: self.logger.add_scalar( "iter_train_loss", error_loss, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "normal_arch_entropy", normal_arch_entropy, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reduce_arch_entropy", reduce_arch_entropy, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "total_arch_entropy", normal_arch_entropy + reduce_arch_entropy, step + len(self.train_queue.dataset) * epoch) if self.args.dsnas: #reward_normal_edge self.logger.add_scalar( "reward_normal_edge_0", self.model.normal_edge_reward[0], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_1", self.model.normal_edge_reward[1], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_2", self.model.normal_edge_reward[2], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_3", self.model.normal_edge_reward[3], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_4", self.model.normal_edge_reward[4], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_5", self.model.normal_edge_reward[5], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_6", self.model.normal_edge_reward[6], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_7", self.model.normal_edge_reward[7], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_8", self.model.normal_edge_reward[8], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_9", self.model.normal_edge_reward[9], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_10", self.model.normal_edge_reward[10], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_11", self.model.normal_edge_reward[11], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_12", self.model.normal_edge_reward[12], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_normal_edge_13", self.model.normal_edge_reward[13], step + len(self.train_queue.dataset) * epoch) #reward_reduce_edge self.logger.add_scalar( "reward_reduce_edge_0", self.model.reduce_edge_reward[0], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_1", 
self.model.reduce_edge_reward[1], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_2", self.model.reduce_edge_reward[2], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_3", self.model.reduce_edge_reward[3], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_4", self.model.reduce_edge_reward[4], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_5", self.model.reduce_edge_reward[5], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_6", self.model.reduce_edge_reward[6], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_7", self.model.reduce_edge_reward[7], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_8", self.model.reduce_edge_reward[8], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_9", self.model.reduce_edge_reward[9], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_10", self.model.reduce_edge_reward[10], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_11", self.model.reduce_edge_reward[11], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_12", self.model.reduce_edge_reward[12], step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "reward_reduce_edge_13", self.model.reduce_edge_reward[13], step + len(self.train_queue.dataset) * epoch) #policy size self.logger.add_scalar( "iter_normal_size_policy", penalty[2] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_size_policy", penalty[7] / num_reduce, step + len(self.train_queue.dataset) * epoch) # baseline: discrete_probability self.logger.add_scalar( "iter_normal_size_baseline", penalty[3] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_flops_baseline", penalty[5] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_mac_baseline", penalty[6] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_size_baseline", penalty[8] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_flops_baseline", penalty[9] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_mac_baseline", penalty[10] / num_reduce, step + len(self.train_queue.dataset) * epoch) # R - median(R) self.logger.add_scalar( "iter_normal_size-avg", penalty[60] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_flops-avg", penalty[61] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_mac-avg", penalty[62] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_size-avg", penalty[63] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_flops-avg", penalty[64] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_mac-avg", penalty[65] / num_reduce, step + len(self.train_queue.dataset) * epoch) # lnR - ln(median) self.logger.add_scalar( "iter_normal_ln_size-ln_avg", penalty[66] / num_normal, step + len(self.train_queue.dataset) * epoch) 
self.logger.add_scalar( "iter_normal_ln_flops-ln_avg", penalty[67] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_ln_mac-ln_avg", penalty[68] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_ln_size-ln_avg", penalty[69] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_ln_flops-ln_avg", penalty[70] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_ln_mac-ln_avg", penalty[71] / num_reduce, step + len(self.train_queue.dataset) * epoch) ''' self.logger.add_scalar("iter_normal_size_normalized", penalty[17] / 6, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar("iter_normal_flops_normalized", penalty[18] / 6, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar("iter_normal_mac_normalized", penalty[19] / 6, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar("iter_reduce_size_normalized", penalty[20] / 2, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar("iter_reduce_flops_normalized", penalty[21] / 2, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar("iter_reduce_mac_normalized", penalty[22] / 2, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar("iter_normal_penalty_normalized", penalty[23] / 6, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar("iter_reduce_penalty_normalized", penalty[24] / 2, step + len(self.train_queue.dataset) * epoch) ''' # Monte_Carlo(R_i) self.logger.add_scalar( "iter_normal_size_mc", penalty[29] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_flops_mc", penalty[30] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_mac_mc", penalty[31] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_size_mc", penalty[32] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_flops_mc", penalty[33] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_mac_mc", penalty[34] / num_reduce, step + len(self.train_queue.dataset) * epoch) # log(|R_i|) self.logger.add_scalar( "iter_normal_log_size", penalty[41] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_log_flops", penalty[42] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_log_mac", penalty[43] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_log_size", penalty[44] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_log_flops", penalty[45] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_log_mac", penalty[46] / num_reduce, step + len(self.train_queue.dataset) * epoch) # log(P)R_i self.logger.add_scalar( "iter_normal_logP_size", penalty[47] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_logP_flops", penalty[48] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_logP_mac", penalty[49] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_logP_size", penalty[50] / num_reduce, step + len(self.train_queue.dataset) * epoch) 
self.logger.add_scalar( "iter_reduce_logP_flops", penalty[51] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_logP_mac", penalty[52] / num_reduce, step + len(self.train_queue.dataset) * epoch) # log(P)log(R_i) self.logger.add_scalar( "iter_normal_logP_log_size", penalty[53] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_logP_log_flops", penalty[54] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_normal_logP_log_mac", penalty[55] / num_normal, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_logP_log_size", penalty[56] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_logP_log_flops", penalty[57] / num_reduce, step + len(self.train_queue.dataset) * epoch) self.logger.add_scalar( "iter_reduce_logP_log_mac", penalty[58] / num_reduce, step + len(self.train_queue.dataset) * epoch) prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) if self.args.distributed: loss = loss.detach() dist.all_reduce(error_loss) dist.all_reduce(prec1) dist.all_reduce(prec5) prec1.div_(self.world_size) prec5.div_(self.world_size) #dist_util.all_reduce([loss, prec1, prec5], 'mean') objs.update(error_loss.item(), n) top1.update(prec1.item(), n) top5.update(prec5.item(), n) if step % self.args.report_freq == 0 and self.rank == 0: logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg) self.logger.add_scalar( "iter_train_top1_acc", top1.avg, step + len(self.train_queue.dataset) * epoch) if self.rank == 0: logging.info('-------resource gradient--------') logging.info(normal_resource_gradient / count) logging.info(reduce_resource_gradient / count) logging.info('-------loss gradient--------') logging.info(normal_loss_gradient / count) logging.info(reduce_loss_gradient / count) logging.info('-------total gradient--------') logging.info(normal_total_gradient / count) logging.info(reduce_total_gradient / count) return top1.avg, loss, error_loss, loss_alpha
def train(train_queue, model, margin, criterion, optimizer, epoch):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    batch_time = utils.AvgrageMeter()
    model.train()

    for step, (input, target) in enumerate(train_queue):
        target = target.cuda(non_blocking=True)
        input = input.cuda(non_blocking=True)
        b_start = time.time()

        optimizer.zero_grad()
        logits = model(input)
        thetas = margin(logits, target)
        loss = criterion(thetas, target)
        if args.auxiliary:
            # logits_aux must be produced by the model's auxiliary head when this flag is set
            loss_aux = criterion(logits_aux, target)
            loss += args.auxiliary_weight * loss_aux
        loss.backward()
        # with amp.scale_loss(loss, optimizer) as scaled_loss:
        #     scaled_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()
        batch_time.update(time.time() - b_start)

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        n = input.size(0)
        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)

        if step % args.report_freq == 0:
            end_time = time.time()
            if step == 0:
                duration = 0
                start_time = time.time()
            else:
                duration = end_time - start_time
                start_time = time.time()
            logging.info('TRAIN Step: %03d Objs: %e R1: %f R5: %f Duration: %ds BTime: %.3fs',
                         step, objs.avg, top1.avg, top5.avg, duration, batch_time.avg)

        if step % 5000 == 0:
            valid_acc_top1 = infer(data_loaders, dataset, model, margin, epoch)
            global best_acc_top1
            is_best = False
            if valid_acc_top1 > best_acc_top1:
                best_acc_top1 = valid_acc_top1
                is_best = True
            state = {
                'epoch': epoch + 1,
                'model': model.module.state_dict(),
                'margin': margin.module.state_dict(),
                'best_acc_top1': best_acc_top1,
                'optimizer': optimizer.state_dict(),
            }
            if is_best:
                filename = os.path.join('./', 'best_model.pth.tar')
                torch.save(state, filename)
                torch.save(model.state_dict(), './model.pt')
                torch.save(margin.state_dict(), './margin.pt')
                filename = os.path.join('./', 'checkpoint.pth.tar')
                torch.save(state, filename)
            else:
                filename = os.path.join('./', 'checkpoint.pth.tar')
                torch.save(state, filename)
    return top1.avg, objs.avg
def train(train_queue, valid_queue, model, architect, criterion, optimizer, lr): objs = utils.AvgrageMeter() top1 = utils.AvgrageMeter() top5 = utils.AvgrageMeter() model.sample() train_queue.dataset.weights_index = model.ops_weights_b train_queue.dataset.probabilities_index = model.probabilities_b for step, (input, target) in enumerate(train_queue): model.train() model.set_augmenting(True) n = input[0].size(0) # input = Variable(input, requires_grad=False).cuda() # target = Variable(target, requires_grad=False).cuda(async=True) # input = [Variable(img, requires_grad=False).cuda() for img in input] input = Variable(input, requires_grad=False).cuda() target = Variable(target, requires_grad=False).cuda(non_blocking=True) # trans_images_list = [ [Variable(trans_image, requires_grad=False).cuda() # for trans_image in trans_images] # for trans_images in trans_images_list] # get a random minibatch from the search queue with replacement input_search, target_search = next(iter(valid_queue)) input_search = Variable(input_search, requires_grad=False).cuda() target_search = Variable(target_search, requires_grad=False).cuda(non_blocking=True) # input_search = Variable(input_search, requires_grad=False).cuda() # target_search = Variable(target_search, requires_grad=False).cuda(async=True) architect.step(input, target, input_search, target_search, lr, optimizer, unrolled=args.unrolled) optimizer.zero_grad() logits = model(input) loss = criterion(logits, target) loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) optimizer.step() prec1, prec5 = utils.accuracy(logits.detach(), target.detach(), topk=(1, 5)) # objs.update(loss.data[0], n) # top1.update(prec1.data[0], n) # top5.update(prec5.data[0], n) objs.update(loss.detach().item(), n) top1.update(prec1.detach().item(), n) top5.update(prec5.detach().item(), n) if step % args.report_freq == 0: logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg) model.sample() # train_queue.dataset.weights_index = model.sample_ops_weights_index # train_queue.dataset.probabilities_index = model.sample_probabilities_index train_queue.dataset.weights_index = model.ops_weights_b train_queue.dataset.probabilities_index = model.probabilities_b return top1.avg, objs.avg
def train(epoch, train_queue, valid_queue, model, architect, criterion, optimizer, metrics, scheduler, analyser): objs = utils.AvgrageMeter() top1 = utils.AvgrageMeter() top5 = utils.AvgrageMeter() lr = scheduler.lr_vector layers_todo = metrics.layers_index_todo for step, (input, target) in enumerate(train_queue): # one mini-batch logging.info('train mini batch %03d', step) model.train() n = input.size(0) # input = Variable(input, requires_grad=False).cuda() # target = Variable(target, requires_grad=False).cuda(async=True) input = Variable(input, requires_grad=False).to(device) target = Variable(target, requires_grad=False).to(device) # get a random minibatch from the search queue with replacement input_search, target_search = next(iter(valid_queue)) # input_search = Variable(input_search, requires_grad=False).cuda() # target_search = Variable(target_search, requires_grad=False).cuda(async=True) input_search = Variable(input_search, requires_grad=False).to(device) target_search = Variable(target_search, requires_grad=False).to(device) logging.info('update arch...') architect.step(input, target, input_search, target_search, lr, layers_todo, optimizer, unrolled=args.unrolled) logging.info('update weights...') optimizer.zero_grad() """gdas""" logits = model.forward(input, gumbel=args.gumbel) loss = criterion(logits, target) loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) # optimizer.step() ################################################################################ # AdaS: update optimizer optimizer.step(layers_todo, scheduler.lr_vector) ################################################################################ prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) objs.update(loss.item(), n) top1.update(prec1.item(), n) top5.update(prec5.item(), n) if step % args.report_freq == 0: logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg) if args.compute_hessian: _data_loader = deepcopy(train_queue) input, target = next(iter(_data_loader)) # input = Variable(input, requires_grad=False).cuda() # target = Variable(target, requires_grad=False).cuda(async=True) input = Variable(input, requires_grad=False).to(device) target = Variable(target, requires_grad=False).to(device) # get gradient information # param_grads = [p.grad for p in model.parameters() if p.grad is not None] # param_grads = torch.cat([x.view(-1) for x in param_grads]) # param_grads = param_grads.cpu().data.numpy() # grad_norm = np.linalg.norm(param_grads) # gradient_vector = torch.cat([x.view(-1) for x in gradient_vector]) # grad_norm = LA.norm(gradient_vector.cpu()) # logging.info('\nCurrent grad norm based on Train Dataset: %.4f', # grad_norm) # logging.info('Compute Hessian start') H = analyser.compute_Hw(input, target, input_search, target_search, lr, layers_todo, optimizer, unrolled=False) # g = analyser.compute_dw(input, target, input_search, target_search, # lr, layers_todo, optimizer, unrolled=False) # g = torch.cat([x.view(-1) for x in g]) del _data_loader # logging.info('Compute Hessian finished') # HESSIAN_STATISTICS[f'hessian_epoch{epoch}'] = weights_normal[:, 0] hessian_file = "../save_data/hessian_adas_c100_{0}_epoch_{1}".format( args.file_name, epoch) np.save(hessian_file, H.cpu().data.numpy()) # logging.info('Writing Hessian finished') return top1.avg, objs.avg
train=False, download=True, transform=train_transform) num_train = len(train_data) indices = list(range(num_train)) split = int(np.floor(0.5 * num_train)) train_queue = torch.utils.data.DataLoader( train_data, batch_size=64, sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:]), pin_memory=True, num_workers=2) top1 = utils.AvgrageMeter() top5 = utils.AvgrageMeter() for step, (input, target) in enumerate(train_queue): input = Variable(input).cuda() target = Variable(target).cuda() input_pert = ifgsm(model, input, target, epsilon=args.eps, niters=args.niters, learning_rate=args.adv_rate) input_pert = input_pert.detach() logits = model(input_pert)
def infer(valid_queue, model, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.eval()
    preds = np.asarray([])
    targets = np.asarray([])
    with torch.no_grad():
        for step, (input, target) in enumerate(valid_queue):
            input = input.cuda()
            target = target.cuda(non_blocking=True)
            logits = model(input)
            loss = criterion(logits, target)
            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            objs.update(loss.item(), n)
            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)

            # collect hard predictions and targets for the sklearn reports below
            _, predicted = torch.max(logits.data, 1)
            preds = np.concatenate((preds, predicted.cpu().numpy().ravel()))
            targets = np.concatenate((targets, target.data.cpu().numpy().ravel()))

            if step % args.report_freq == 0:
                logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)

    print(preds.shape)
    print(targets.shape)
    print('np.unique(targets):', np.unique(targets))
    print('np.unique(preds): ', np.unique(preds))

    from sklearn.metrics import classification_report
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import balanced_accuracy_score
    from sklearn.metrics import confusion_matrix

    print(accuracy_score(targets, preds))
    cr = classification_report(targets, preds, output_dict=True)
    a1, a2, a3 = cr['macro avg']['f1-score'], cr['macro avg']['precision'], cr['macro avg']['recall']
    topover = (a1 + a2 + a3) / 3
    print(classification_report(targets, preds))
    print(balanced_accuracy_score(targets, preds))
    print(accuracy_score(targets, preds))
    matrix = confusion_matrix(targets, preds)
    print(matrix.diagonal() / matrix.sum(axis=1))
    print(matrix)
    return top1.avg, objs.avg
def main(): args = parse_args() preparelog(args) if not torch.cuda.is_available(): logging.info('no gpu device available') sys.exit(1) random.seed(args.seed) np.random.seed(args.seed) torch.cuda.set_device(args.gpu) torch.backends.cudnn.benchmark = True torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logging.info('gpu device = %d' % args.gpu) logging.info("args = %s", args) controller = Controller(args) controller.cuda() controller_optimizer = torch.optim.Adam( controller.parameters(), args.controller_lr, betas=(0.1, 0.999), eps=1e-3, ) train_loader, valid_loader, test_loader = get_loaders(args) total_loss = utils.AvgrageMeter() total_reward = utils.AvgrageMeter() total_entropy = utils.AvgrageMeter() base_model = build_basemodel() baseline = model_evaluate(base_model, valid_loader) controller.train() for step in range(args.total_iter): controller_optimizer.zero_grad() model_para, log_prob, entropy = controller() model = model_transform(base_model, model_para) model_finetune(model, train_loader) with torch.no_grad(): reward = model_evaluate(model, valid_loader) #if args.entropy_weight is not None: # reward += args.entropy_weight*entropy log_prob = torch.sum(log_prob) loss = log_prob * (reward - baseline) loss = loss.sum() loss.backward() controller_optimizer.step() total_loss.update(loss.item(), 1) total_reward.update(reward.item(), 1) total_entropy.update(entropy.item(), 1) if step % args.report_freq == 0: #logging.info('controller %03d %e %f %f', step, loss.item(), reward.item(), baseline.item()) logging.info('controller %03d %e %f %f', step, total_loss.avg, total_reward.avg, baseline.item())
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--fast-run', action='store_true', default=False)
    parser.add_argument('--local', action='store_true', default=False)
    parser.add_argument('-c', '--continue', dest='continue_path', required=False)
    args = parser.parse_args()

    cudnn.benchmark = True
    cudnn.enabled = True

    net = model.Network()
    # logger.info(net)
    net = nn.DataParallel(net).cuda()

    # create session
    sess = Session(train_spec, net=net)
    # worklog = WorklogLogger(os.path.join(sess.log_dir, 'worklog.txt'))

    criterion = utils.Denseloss(dropout=5)
    criterion = criterion.cuda()

    all_parameters = net.parameters()
    optimizer = torch.optim.Adam([{
        'params': all_parameters,
        'weight_decay': train_spec.weight_decay,
        'lr': train_spec.learning_rate
    }])
    adam_opt = torch.optim.Adam(net.parameters(), lr=1e-3, betas=(0.9, 0.999),
                                eps=1e-8, weight_decay=0, amsgrad=False)

    def adjust_lr(epoch, step):
        # learning-rate schedule is applied to adam_opt's param groups
        lr = train_spec.get_learning_rate(epoch, step)
        for params_group in adam_opt.param_groups:
            params_group['lr'] = lr
        return lr

    # now start training
    clock = sess.clock
    clock.epoch = 0
    clock.step = 0 * 1024
    sess.start()

    # restore the pretrained checkpoint, then optionally resume a previous run
    checkpoint = torch.load(train_spec.imagenet_path)
    sess.net.load_state_dict(checkpoint['state_dict'], strict=False)
    if args.continue_path and os.path.exists(args.continue_path):
        sess.load_checkpoint(args.continue_path)
    # replay the optimizer up to the restored step count
    for ite in range(sess.clock.step):
        adam_opt.step()
    # log_output = log_rate_limited(min_interval=1)(worklog.put_line)

    CASI_dataset = CASIADataset('train')
    dataloader = DataLoader(dataset=CASI_dataset,
                            batch_size=train_spec.minibatch_size,
                            shuffle=False, num_workers=8)

    # for epoch in train_ds.epoch_generator():
    for epoch in range(train_spec.stop_epoch):
        time_epoch_start = tstart = time.time()
        sess.net.train()
        adjust_lr(epoch, clock.step)
        objs = utils.AvgrageMeter()
        top1 = utils.AvgrageMeter()

        # iterate the loader once per epoch; next(iter(dataloader)) with
        # shuffle=False would always return the same first minibatch
        data_iter = iter(dataloader)
        for step in range(train_spec.minibatch_per_epoch):
            try:
                minibatch = next(data_iter)
            except StopIteration:
                data_iter = iter(dataloader)
                minibatch = next(data_iter)
            # scheduler.step()
            adam_opt.step()

            # input_data = minibatch['depth']
            # target = minibatch['label']
            input_data = minibatch[0].type(torch.FloatTensor).cuda()
            # target = torch.from_numpy(target).type(torch.LongTensor)
            target = minibatch[1].type(torch.LongTensor).cuda(non_blocking=True)
            tdata = time.time() - tstart

            optimizer.zero_grad()
            dense_pred = sess.net(input_data)
            pred = dense_pred.mean(dim=1)
            loss = criterion(dense_pred, target)
            loss.backward()
            optimizer.step()

            # timing statistics for the progress line
            cur_time = time.time()
            ttrain = cur_time - tstart
            time_passed = cur_time - time_epoch_start
            time_expected = time_passed / (clock.minibatch + 1) * train_spec.minibatch_per_epoch
            eta = time_expected - time_passed

            prec1, = utils.accuracy(pred, target, topk=(1,))
            n = input_data.size(0)
            objs.update(loss.item(), n)   # accumulated loss
            top1.update(prec1.item(), n)
            for param_group in optimizer.param_groups:
                cur_lr = param_group['lr']

            outputs = [
                "e:{},{}/{}".format(clock.epoch, clock.minibatch,
                                    train_spec.minibatch_per_epoch),
                "{:.2g} mb/s".format(1. / ttrain),
            ] + [
                "lr:{:.6f}, loss:{:.3f}, top1_acc:{:.2f}%".format(
                    cur_lr, objs.avg, top1.avg)
            ] + [
                'passed:{:.2f}'.format(time_passed),
                'eta:{:.2f}'.format(eta),
            ]
            if tdata / ttrain > .05:
                outputs += ["dp/tot: {:.2g}".format(tdata / ttrain)]
            print(' '.join(outputs))
            # log_output(' '.join(outputs))

            clock.tick()
            tstart = time.time()

        # sess.save_checkpoint('epoch_{}_{}'.format(clock.epoch, clock.step))
        # sess.save_checkpoint('epoch_{}'.format(clock.epoch))
        clock.tock()
        if clock.epoch % train_spec.dump_epoch_interval == 0:
            sess.save_checkpoint('epoch_{}'.format(clock.epoch))
        sess.save_checkpoint('latest')
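All of these loops keep running statistics with utils.AvgrageMeter, which is used but never defined in these snippets. A minimal sketch of such a meter, assuming the usual DARTS-style interface (update(val, n) plus avg/sum/cnt fields):

class AvgrageMeter(object):
    # running average over a stream of (value, count) updates

    def __init__(self):
        self.reset()

    def reset(self):
        self.avg = 0.0
        self.sum = 0.0
        self.cnt = 0

    def update(self, val, n=1):
        # accumulate a batch of n samples whose mean value is val
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt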
def __init__(self, save_path, seed, batch_size, grad_clip, epochs,
             resume_iter=None, init_channels=16):
    # hyper-parameters, packed into an attribute-style dict
    args = {}
    args['data'] = '/data/mzhang3/randomNAS_own/data'
    args['epochs'] = epochs
    args['learning_rate'] = 0.025
    args['batch_size'] = batch_size
    args['learning_rate_min'] = 0.001
    args['momentum'] = 0.9
    args['weight_decay'] = 3e-4
    args['init_channels'] = init_channels
    args['layers'] = 8
    args['drop_path_prob'] = 0.3
    args['grad_clip'] = grad_clip
    args['train_portion'] = 0.5
    args['seed'] = seed
    args['log_interval'] = 50
    args['save'] = save_path
    args['gpu'] = 0
    args['cuda'] = True
    args['cutout'] = False
    args['cutout_length'] = 16
    args['report_freq'] = 50
    args = AttrDict(args)
    self.args = args
    self.seed = seed

    # reproducibility / device setup
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = False
    cudnn.enabled = True
    cudnn.deterministic = True
    torch.cuda.manual_seed_all(args.seed)

    # CIFAR-10, split into a training half and a validation half
    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    train_data = dset.CIFAR10(root=args.data, train=True, download=False,
                              transform=train_transform)
    num_train = len(train_data)
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))
    # seed each worker explicitly; passing np.random.seed(args.seed) directly
    # would call it once and hand the loader None
    self.train_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True,
        num_workers=0,
        worker_init_fn=lambda worker_id: np.random.seed(args.seed + worker_id))
    self.valid_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=32,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True,
        num_workers=0,
        worker_init_fn=lambda worker_id: np.random.seed(args.seed + worker_id))
    self.train_iter = iter(self.train_queue)
    self.valid_iter = iter(self.valid_queue)

    self.steps = 0
    self.epochs = 0
    self.total_loss = 0
    self.start_time = time.time()

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    self.criterion = criterion

    model = Network(args.init_channels, 10, args.layers, self.criterion)
    model = model.cuda()
    self.model = model
    # try:
    #     self.load()
    #     logging.info('loaded previously saved weights')
    # except Exception as e:
    #     print(e)
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    optimizer = torch.optim.SGD(self.model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    self.optimizer = optimizer
    self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    if resume_iter is not None:
        self.steps = resume_iter
        self.epochs = int(resume_iter / len(self.train_queue))
        logging.info("Resuming from epoch %d" % self.epochs)
        self.objs = utils.AvgrageMeter()
        self.top1 = utils.AvgrageMeter()
        self.top5 = utils.AvgrageMeter()
        # fast-forward the cosine schedule to the resumed epoch
        for i in range(self.epochs):
            self.scheduler.step()

    size = 0
    for p in model.parameters():
        size += p.nelement()
    logging.info('param size: {}'.format(size))
    total_params = sum(x.data.nelement() for x in model.parameters())
    logging.info('Args: {}'.format(args))
    logging.info('Model total parameters: {}'.format(total_params))
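The constructor above wraps its hyper-parameters in an AttrDict so they can be read as attributes (args.seed, args.batch_size, ...), but that class is not shown. A minimal sketch, assuming only attribute-style access over a plain dict is needed:

class AttrDict(dict):
    # dict whose keys are also readable/writable as attributes

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value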
def train(train_queue, valid_queue, model, architect, criterion, optimizer, lr):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()

    for step, (input, target) in enumerate(train_queue):
        model.train()
        n = input.size(0)
        input = input.cuda()
        target = target.cuda(non_blocking=True)

        # adversarially perturb the training minibatch with the configured attack
        input_pert = adv_attacks[args.adv_train](model, target, input,
                                                 niters=args.niters_,
                                                 epsilon=args.eps_,
                                                 learning_rate=args.adv_rate_)

        # get a random minibatch from the search queue with replacement
        input_search, target_search = next(iter(valid_queue))
        input_search = input_search.cuda()
        target_search = target_search.cuda(non_blocking=True)
        input_search = ifgsm(model, input_search, target_search,
                             niters=args.niters, epsilon=args.eps,
                             learning_rate=args.adv_rate)

        # architecture step on the clean + perturbed batch
        input_comb = torch.cat([input, input_pert]).cuda()
        target_comb = torch.cat([target, target]).cuda()
        architect.step(input_comb, target_comb, input_search, target_search,
                       lr, optimizer, unrolled=args.unrolled)

        # weight step on the clean batch
        optimizer.zero_grad()
        logits = model(input)
        loss = criterion(logits, target)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)

        if step % args.report_freq == 0:
            logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)

    del input
    del target
    return top1.avg, objs.avg
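The search step above perturbs the validation minibatch with ifgsm before the architecture update; the attack itself is not included in these snippets. A minimal iterative-FGSM sketch under the same signature (the use of cross-entropy and the L-infinity projection are assumptions):

import torch
import torch.nn.functional as F

def ifgsm(model, x, y, niters, epsilon, learning_rate):
    # hypothetical iterative FGSM: ascend the loss, stay in an eps-ball around x
    x_adv = x.clone().detach()
    for _ in range(niters):
        x_adv.requires_grad_(True)
        loss = F.cross_entropy(model(x_adv), y)
        grad, = torch.autograd.grad(loss, x_adv)
        with torch.no_grad():
            x_adv = x_adv + learning_rate * grad.sign()
            # project back into the epsilon ball around the clean input
            x_adv = torch.max(torch.min(x_adv, x + epsilon), x - epsilon)
        x_adv = x_adv.detach()
    return x_adv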
def main():
    global_step = tf.train.get_or_create_global_step()

    # input pipelines
    images, labels = read_data(args.data)
    train_dataset = tf.data.Dataset.from_tensor_slices((images["train"], labels["train"]))
    train_dataset = train_dataset.map(_pre_process).shuffle(5000).batch(args.batch_size)
    train_iter = train_dataset.make_initializable_iterator()
    x_train, y_train = train_iter.get_next()

    test_dataset = tf.data.Dataset.from_tensor_slices((images["test"], labels["test"]))
    test_dataset = test_dataset.shuffle(5000).batch(args.batch_size)
    test_iter = test_dataset.make_initializable_iterator()
    x_test, y_test = test_iter.get_next()

    # build the training graph for the selected genotype
    genotype = eval("genotypes.%s" % args.arch)
    train_logits, aux_logits = Model(x_train, y_train, True, args.init_channels,
                                     CLASS_NUM, args.layers, args.auxiliary, genotype)
    train_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_train, logits=train_logits))
    w_regularization_loss = tf.add_n(
        utils.get_var(tf.losses.get_regularization_losses(), 'lw')[1])
    train_loss += 1e4 * args.weight_decay * w_regularization_loss
    # tf.summary.scalar('train_loss', train_loss)
    if args.auxiliary:
        loss_aux = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_train, logits=aux_logits))
        train_loss += args.auxiliary_weight * loss_aux

    lr = tf.train.cosine_decay(args.learning_rate, global_step,
                               50000 / args.batch_size * args.epochs)
    accuracy = tf.reduce_mean(tf.cast(tf.nn.in_top_k(train_logits, y_train, 1), tf.float32))

    # evaluation graph
    test_logits, _ = Model(x_test, y_test, False, args.init_channels,
                           CLASS_NUM, args.layers, args.auxiliary, genotype)
    test_accuracy = tf.reduce_mean(tf.cast(tf.nn.in_top_k(test_logits, y_test, 1), tf.float32))
    test_accuracy_top5 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(test_logits, y_test, 5), tf.float32))
    tf.summary.scalar('test_accuracy_top1', test_accuracy)

    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        opt = tf.train.MomentumOptimizer(lr, args.momentum)
        opt = opt.minimize(train_loss, global_step)
    merged = tf.summary.merge_all()

    # session setup
    config = tf.ConfigProto()
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    writer = tf.summary.FileWriter(output_dir + TIMESTAMP, sess.graph)
    saver = tf.train.Saver(max_to_keep=1)
    sess.run(tf.global_variables_initializer())

    test_batch = 0
    for e in range(args.epochs):
        objs = utils.AvgrageMeter()
        top1 = utils.AvgrageMeter()
        sess.run(train_iter.initializer)
        while True:
            try:
                _, loss, acc, current_lr, gs = sess.run(
                    [opt, train_loss, accuracy, lr, global_step])
                objs.update(loss, args.batch_size)
                top1.update(acc, args.batch_size)
                if gs % args.report_freq == 0:
                    print("epochs {} steps {} current lr is {:.3f} loss is {} train_acc is {}".format(
                        e, gs, current_lr, objs.avg, top1.avg))
            except tf.errors.OutOfRangeError:
                print('-' * 80)
                print("end of a training epoch")
                break
        # evaluate every five epochs
        if e % 5 == 0:
            test_top1 = utils.AvgrageMeter()
            sess.run(test_iter.initializer)
            while True:
                try:
                    test_batch += 1
                    summary, test_acc = sess.run([merged, test_accuracy])
                    test_top1.update(test_acc, args.batch_size)
                    if test_batch % 100 == 0:
                        writer.add_summary(summary, test_batch)
                except tf.errors.OutOfRangeError:
                    print("******************* epochs {} test_acc is {}".format(e, test_top1.avg))
                    saver.save(sess, output_dir + "model", test_batch)
                    print('-' * 80)
                    print("end of a test epoch")
                    break
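The training pipeline maps _pre_process over the CIFAR-10 tensors before batching, but that function is not defined in this snippet. A plausible sketch using the standard CIFAR-10 augmentation (pad-and-crop plus random horizontal flip); the exact augmentation used by the original code is an assumption:

def _pre_process(image, label):
    # assumed augmentation: pad to 40x40, random 32x32 crop, random horizontal flip
    image = tf.image.resize_image_with_crop_or_pad(image, 40, 40)
    image = tf.random_crop(image, [32, 32, 3])
    image = tf.image.random_flip_left_right(image)
    return image, label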