Example #1
def valid(valid_queue, model, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    with torch.no_grad():
        model.eval()
        for step, (input, target) in enumerate(valid_queue):
            input = input.cuda()
            target = target.cuda()

            logits, _ = model(input)
            loss = criterion(logits, target)

            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            objs.update(loss.data, n)
            top1.update(prec1.data, n)
            top5.update(prec5.data, n)

            if (step + 1) % 100 == 0:
                logging.info('valid %03d %e %f %f', step + 1, objs.avg,
                             top1.avg, top5.avg)

    return top1.avg, top5.avg, objs.avg
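All of these examples lean on utils.AvgrageMeter for running averages of loss and accuracy, but the helper itself is not shown on this page. The following is only a minimal sketch, assuming the update(val, n) / avg / reset interface used above (the real utils.AvgrageMeter may differ in details):

class AvgrageMeter(object):
    """Running-average helper (sketch; assumed interface only)."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.avg = 0.0
        self.sum = 0.0
        self.cnt = 0

    def update(self, val, n=1):
        # val is the batch statistic, n the batch size used as its weight
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt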
def infer(valid_queue, model, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.eval()

    for step, (input, target) in enumerate(valid_queue):
        input = input.cuda()
        target = target.cuda(non_blocking=True)

        # volatile Variables are gone in modern PyTorch; disable grad explicitly
        with torch.no_grad():
            logits = model(input)
            loss = criterion(logits, target)

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        n = input.size(0)
        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)

        if step % args.report_freq == 0:
            logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg,
                         top5.avg)

    return top1.avg, objs.avg
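utils.accuracy(logits, target, topk=(1, 5)) is used throughout but also not defined here. A plausible sketch following the common top-k recipe, assuming it returns one percentage per requested k:

def accuracy(output, target, topk=(1,)):
    """Top-k accuracy in percent (sketch of the assumed helper)."""
    maxk = max(topk)
    batch_size = target.size(0)

    # indices of the top-k predictions per sample, transposed to (maxk, batch)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res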
def train(train_queue, valid_queue, model, architect, criterion, optimizer, lr):
  objs = utils.AvgrageMeter()
  top1 = utils.AvgrageMeter()
  top5 = utils.AvgrageMeter()

  for step, (input, target) in tqdm(enumerate(train_queue)):
    model.train()
    n = input.size(0)

    input = input.cuda()
    target = target.cuda(non_blocking=True)

    # get a random minibatch from the search queue with replacement
    input_search, target_search = next(iter(valid_queue))
    input_search = input_search.cuda()
    target_search = target_search.cuda(non_blocking=True)

    architect.step(input, target, input_search, target_search, lr, optimizer, unrolled=args.unrolled)

    optimizer.zero_grad()
    logits = model(input)
    loss = criterion(logits, target)

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
    optimizer.step()

    prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
    objs.update(loss.item(), n)
    top1.update(prec1.item(), n)
    top5.update(prec5.item(), n)

    if step % args.report_freq == 0:
      logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)

  return top1.avg, objs.avg
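For context, a DARTS-style search script would typically call the train and infer helpers above once per epoch, roughly as in this hypothetical usage sketch (model, architect, criterion, optimizer, scheduler, the queues, and args.epochs are assumed to come from setup code not shown here):

# hypothetical driver loop; names follow the signatures of train()/infer() above
for epoch in range(args.epochs):
    lr = scheduler.get_last_lr()[0]
    train_acc, train_obj = train(train_queue, valid_queue, model, architect,
                                 criterion, optimizer, lr)
    scheduler.step()
    valid_acc, valid_obj = infer(valid_queue, model, criterion)
    logging.info('epoch %d train_acc %f valid_acc %f', epoch, train_acc, valid_acc)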
Example #4
def infer(valid_queue, model, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.eval()

    for step, (input, target) in enumerate(valid_queue):
        input = input.cuda()
        target = target.cuda(non_blocking=True)
        with torch.no_grad():
            logits = model(input)
            loss = criterion(logits, target)

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        n = input.size(0)
        objs.update(loss.data.item(), n)
        top1.update(prec1.data.item(), n)
        top5.update(prec5.data.item(), n)

        if step % args.report_freq == 0:
            logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg,
                         top5.avg)

    return top1.avg, objs.avg
Example #5
def infer(valid_queue, model, criterion, bin_op, report_freq):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.eval()
    bin_op.binarization()
    with torch.no_grad():
        for step, (input, target) in enumerate(valid_queue):
            input = input.cuda()
            target = target.cuda()

            logits = model(input)
            loss = criterion(logits, target)

            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            objs.update(loss.item(), n)
            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)

            # if step % report_freq == 0:
            #     print("Step: {}, Top1: {}, Top5: {}".format(step, top1.avg, top5.avg))
    bin_op.restore()
    return top1.avg, objs.avg
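Example #5 evaluates a binarized network: bin_op.binarization() swaps the full-precision weights for binary ones before inference and bin_op.restore() puts them back. The following is only a sketch of such a helper in the XNOR-Net style, not the repository's actual bin_op:

import torch.nn as nn

class SimpleBinOp:
    """Sketch of a binarize/restore helper (assumed behaviour)."""

    def __init__(self, model):
        # collect the weights that will be binarized
        self.targets = [m.weight for m in model.modules()
                        if isinstance(m, (nn.Conv2d, nn.Linear))]
        self.saved = [None] * len(self.targets)

    def binarization(self):
        # stash full-precision weights, replace them with scaled signs
        for i, w in enumerate(self.targets):
            self.saved[i] = w.data.clone()
            w.data = w.data.sign() * w.data.abs().mean()

    def restore(self):
        # restore the full-precision weights after evaluation
        for i, w in enumerate(self.targets):
            w.data = self.saved[i]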
def infer(test_queue, model, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.eval()

    for step, (input, target) in enumerate(test_queue):
        input = input.to(device)
        target = target.cuda(non_blocking=True)

        logits, _ = model(input)
        loss = criterion(logits, target)

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        n = input.size(0)
        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)

        if step % args.report_freq == 0:
            logging.info('test %03d %e %f %f', step, objs.avg, top1.avg,
                         top5.avg)

    return top1.avg, objs.avg
Example #7
def traingraft(args, epoch, train_data, device, rootmodel, graftmodel, criterion, optimizer, scheduler, supernet, choice=None):


    rootmodel.eval()
    graftmodel.train()

    train_loss = 0.0
    top1 = utils.AvgrageMeter()
    train_data = tqdm(train_data)
    eps = args.epochs

    if supernet == 'supernet':
        if choice is not None:
            eps = 50

    train_data.set_description('[%s%04d/%04d %s%f]' % ('Epoch:', epoch + 1, eps, 'lr:', scheduler.get_lr()[0]))

    for step, (inputs, targets) in enumerate(train_data):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        if supernet == 'supernet':
            if choice is None:
                choice = utils.random_choice(args.num_choices, args.layers)
            # forward through the graft model being trained (rootmodel stays in eval mode)
            outputs = graftmodel(inputs, choice)
        else:
            outputs = graftmodel(inputs)
        loss = criterion(outputs, targets)
        # if args.dataset == 'cifar10':
        loss.backward()
        # elif args.dataset == 'imagenet':
        #     with amp.scale_loss(loss, optimizer) as scaled_loss:
        #         scaled_loss.backward()
        optimizer.step()

        #model.move_to_cpu(choice)
        
        prec1, prec5 = utils.accuracy(outputs, targets, topk=(1, 5))
        n = inputs.size(0)
        top1.update(prec1.item(), n)
        train_loss += loss.item()
        postfix = {'train_loss': '%.6f' % (train_loss / (step + 1)), 'train_acc': '%.6f' % top1.avg}
        train_data.set_postfix(log=postfix)
Example #8
def validate(args, epoch, val_data, device, model, criterion, supernet=False, choice=None):
    model.eval()
    val_loss = 0.0
    val_top1 = utils.AvgrageMeter()
    with torch.no_grad():
        for step, (inputs, targets) in enumerate(val_data):
            inputs, targets = inputs.to(device), targets.to(device)
            if supernet:
                if choice is None:
                    choice = utils.random_choice(args.num_choices, args.layers)
                outputs = model(inputs, choice)
            else:
                outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
            prec1, prec5 = utils.accuracy(outputs, targets, topk=(1, 5))
            n = inputs.size(0)
            val_top1.update(prec1.item(), n)
        print('[Val_Accuracy epoch:%d] val_loss:%f, val_acc:%f'
              % (epoch + 1, val_loss / (step + 1), val_top1.avg))
        return val_top1.avg
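utils.random_choice(args.num_choices, args.layers) samples the sub-network evaluated in the supernet branch. A plausible sketch, assumed from how the call is used above, returns one candidate index per layer:

import random

def random_choice(num_choices, layers):
    """Pick one candidate operation index per supernet layer (sketch)."""
    return [random.randint(0, num_choices - 1) for _ in range(layers)]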
Example #9
    def fitness(self, model):
        # fit = self.population.sum(axis=1).sum(axis=1)#np.random.sample(self.population_number)
        # print(fit.sum(),np.array( [[seq_creater() for i in range(self.chromosome_s)] for i in range(self.population_number)]).sum(axis=1).sum(axis=1).sum())
        total_reward = utils.AvgrageMeter()
        fit = []
        for step, dag in enumerate(self.population):
            #print(dag)
            data, target = self.dataloader.next_batch()
            n = data.size(0)

            data = data.cuda()
            target = target.cuda()
            with torch.no_grad():
                logits, aux = model(dag.tolist(), data)
                #print(dag.tolist())
                reward = utils.accuracy(logits, target)[0]

            fit.append(reward.item())
            total_reward.update(reward.item(), n)
        self.score = np.array(fit)
        print(self.score.mean())
        return np.array(fit)
def train(train_queue, model, criterion, optimizer):

    F_objs = utils.AvgrageMeter()
    F_top1 = utils.AvgrageMeter()
    F_top5 = utils.AvgrageMeter()

    H_objs = utils.AvgrageMeter()
    H_top1 = utils.AvgrageMeter()
    H_top5 = utils.AvgrageMeter()

    model.train()

    for step, (input, target) in enumerate(train_queue):

        n = input.size(0)
        input = input.cuda()
        target = target.cuda()

        optimizer.zero_grad()

        F_out, H_out = model(input)
        loss_1 = criterion(F_out, target)
        loss_2 = criterion(H_out, target)
        loss = loss_1 + loss_2

        loss.backward()

        optimizer.step()

        prec1, prec5 = utils.accuracy(F_out, target, topk=(1, 5))
        F_objs.update(loss_1.item(), n)
        F_top1.update(prec1.item(), n)
        F_top5.update(prec5.item(), n)

        prec1, prec5 = utils.accuracy(H_out, target, topk=(1, 5))
        H_objs.update(loss_2.item(), n)
        H_top1.update(prec1.item(), n)
        H_top5.update(prec5.item(), n)

        if step % args.report_freq == 0:
            logging.info('train %03d %e %f %f', step, F_objs.avg, F_top1.avg,
                         F_top5.avg)
            logging.info('train %03d %e %f %f', step, H_objs.avg, H_top1.avg,
                         H_top5.avg)

    return F_top1.avg, F_top5.avg, H_top1.avg, H_top5.avg, F_objs.avg
def infer(valid_queue, model_1, model_2, criterion):
    objs_1 = utils.AvgrageMeter()
    objs_2 = utils.AvgrageMeter()

    top1_1 = utils.AvgrageMeter()
    top5_1 = utils.AvgrageMeter()

    top1_2 = utils.AvgrageMeter()
    top5_2 = utils.AvgrageMeter()

    model_1.eval()
    model_2.eval()

    for step, (input, target) in enumerate(valid_queue):
        input = input.cuda()
        target = target.cuda(non_blocking=True)

        with torch.no_grad():
            logits_1 = model_1(input)
            logits_2 = model_2(input)

            loss_1 = criterion(logits_1, target)
            loss_2 = criterion(logits_2, target)

        prec1_1, prec5_1 = utils.accuracy(logits_1, target, topk=(1, 5))
        prec1_2, prec5_2 = utils.accuracy(logits_2, target, topk=(1, 5))

        n = input.size(0)
        objs_1.update(loss_1.item(), n)
        objs_2.update(loss_2.item(), n)

        top1_1.update(prec1_1.item(), n)
        top5_1.update(prec5_1.item(), n)

        top1_2.update(prec1_2.item(), n)
        top5_2.update(prec5_2.item(), n)

        if step % args.report_freq == 0:
            logging.info('Valid %03d %e %e %f %f %f %f', step, objs_1.avg,
                         objs_2.avg, top1_1.avg, top5_1.avg, top1_2.avg,
                         top5_2.avg)

    return top1_1.avg, objs_1.avg, top1_2.avg, objs_2.avg
def infer(valid_queue, model, model1, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    objs1 = utils.AvgrageMeter()
    top1_1 = utils.AvgrageMeter()
    top5_1 = utils.AvgrageMeter()
    model.eval()
    model1.eval()

    with torch.no_grad():
        for step, (input, target) in enumerate(valid_queue):
            input = input.cuda()
            target = target.cuda(non_blocking=True)
            logits = model(input)
            loss = criterion(logits, target)
            logits1 = model1(input)
            loss1 = criterion(logits1, target)

            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            objs.update(loss.item(), n)
            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)
            prec1, prec5 = utils.accuracy(logits1, target, topk=(1, 5))
            n = input.size(0)
            objs1.update(loss1.item(), n)
            top1_1.update(prec1.item(), n)
            top5_1.update(prec5.item(), n)

            if step % args.report_freq == 0:
                logging.info('valid 1st %03d %e %f %f', step, objs.avg,
                             top1.avg, top5.avg)
                logging.info('valid 2nd %03d %e %f %f', step, objs1.avg,
                             top1_1.avg, top5_1.avg)

    return top1.avg, objs.avg, top1_1.avg, objs1.avg
def infer(valid_queue, model, criterion):

    F_objs = utils.AvgrageMeter()
    F_top1 = utils.AvgrageMeter()
    F_top5 = utils.AvgrageMeter()

    H_objs = utils.AvgrageMeter()
    H_top1 = utils.AvgrageMeter()
    H_top5 = utils.AvgrageMeter()

    model.eval()

    with torch.no_grad():
        for step, (input, target) in enumerate(valid_queue):
            input = input.cuda()
            target = target.cuda()

            F_out, H_out = model(input)
            loss_1 = criterion(F_out, target)
            loss_2 = criterion(H_out, target)

            n = input.size(0)
            prec1, prec5 = utils.accuracy(F_out, target, topk=(1, 5))
            F_objs.update(loss_1.item(), n)
            F_top1.update(prec1.item(), n)
            F_top5.update(prec5.item(), n)

            prec1, prec5 = utils.accuracy(H_out, target, topk=(1, 5))
            H_objs.update(loss_2.item(), n)
            H_top1.update(prec1.item(), n)
            H_top5.update(prec5.item(), n)

            if step % args.report_freq == 0:
                logging.info('valid %03d %e %f %f', step, F_objs.avg,
                             F_top1.avg, F_top5.avg)
                logging.info('valid %03d %e %f %f', step, H_objs.avg,
                             H_top1.avg, H_top5.avg)

    return F_top1.avg, F_top5.avg, H_top1.avg, H_top5.avg, F_objs.avg
Example #14
def train(train_queue, valid_queue, model, architect, criterion, optimizer, lr, low_flops, high_flops, backbone, tau):
  objs = utils.AvgrageMeter()
  top1 = utils.AvgrageMeter()
  top5 = utils.AvgrageMeter()

  hardware_pool = [i for i in range(low_flops, high_flops, 5)] 
  hardware_index = 0

  for step, (input, target) in enumerate(train_queue):
    model.train()
    n = input.size(0)

    input = input.cuda()
    target = target.cuda(non_blocking=True)

    # get a random minibatch from the search queue with replacement
    input_search, target_search = next(iter(valid_queue))
    input_search = input_search.cuda()
    target_search = target_search.cuda(non_blocking=True)

    target_hc = torch.tensor(hardware_pool[hardware_index]+3*(random.random()-0.5), dtype=torch.float32).view(-1, 1)
    target_hc = target_hc.cuda()
    logger.info("Target hc : {}".foramt(target_hc.item()))

    backbone = backbone.cuda()
    normalized_target_hc = min_max_normalize(high_flops, low_flops, target_hc)
    arch_param = generator(backbone, normalized_target_hc)
    arch_param = arch_param.reshape(-1, arch_param.size(-1))
    alphas_normal = F.gumbel_softmax(arch_param[0], tau=tau, dim=-1)
    alphas_reduce = F.gumbel_softmax(arch_param[1], tau=tau, dim=-1)

    gen_hc = lookup_table.get_model_macs(alphas_normal, alphas_reduce)
    logger.info("Generator hc : {}".format(gen_hc))

    hc_loss = cal_hc_loss(gen_hc.cuda(), target_hc.item(), ALPHA, LOSS_PENALTY)

    hardware_index += 1
    if hardware_index == len(hardware_pool):
        hardware_index = 0
        random.shuffle(hardware_pool)

    #architect.step(input, target, input_search, target_search, lr, optimizer, unrolled=args.unrolled)
    # generator update: combine the search loss with the hardware-constraint loss
    g_optimizer.zero_grad()
    g_loss = model._loss(input_search, target_search)
    loss = g_loss + hc_loss
    loss.backward()
    g_optimizer.step()

    # =========================================================================

    optimizer.zero_grad()
    logits = model(input)
    loss = criterion(logits, target)

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
    optimizer.step()

    prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
    objs.update(loss.item(), n)
    top1.update(prec1.item(), n)
    top5.update(prec5.item(), n)

    if step % args.report_freq == 0:
      logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)

  return top1.avg, objs.avg
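min_max_normalize(high_flops, low_flops, target_hc) rescales the FLOPs target before it is fed to the generator. A minimal sketch under the assumption that it is a plain min-max rescaling to [0, 1]:

def min_max_normalize(high, low, value):
    """Rescale value from [low, high] to [0, 1] (sketch of the assumed helper)."""
    return (value - low) / (high - low)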
def train(train_queue, valid_queue, external_queue, model, model1, architect,
          criterion, optimizer, optimizer1, lr, lr1):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()

    objs1 = utils.AvgrageMeter()
    top1_1 = utils.AvgrageMeter()
    top5_1 = utils.AvgrageMeter()

    valid_queue_iter = iter(valid_queue)
    external_queue_iter = iter(external_queue)
    for step, (input, target) in enumerate(train_queue):
        model.train()
        model1.train()
        n = input.size(0)
        input = input.cuda()
        target = target.cuda(non_blocking=True)

        # get a random minibatch from the search queue with replacement
        try:
            input_search, target_search = next(valid_queue_iter)
        except StopIteration:
            valid_queue_iter = iter(valid_queue)
            input_search, target_search = next(valid_queue_iter)
        try:
            input_external, target_external = next(external_queue_iter)
        except StopIteration:
            external_queue_iter = iter(external_queue)
            input_external, target_external = next(external_queue_iter)

        # input_external, target_external = next(iter(external_queue))
        # input_search, target_search = next(iter(valid_queue))
        input_search = input_search.cuda()
        target_search = target_search.cuda(non_blocking=True)

        input_external = input_external.cuda()
        target_external = target_external.cuda(non_blocking=True)
        # import ipdb; ipdb.set_trace()
        architect.step(input,
                       target,
                       input_external,
                       target_external,
                       input_search,
                       target_search,
                       lr,
                       lr1,
                       optimizer,
                       optimizer1,
                       unrolled=args.unrolled)

        optimizer.zero_grad()
        optimizer1.zero_grad()
        logits = model(input)
        logits1 = model1(input)
        loss = criterion(logits, target)
        loss1 = criterion(logits1, target)
        external_out = model(input_external)
        external_out1 = model1(input_external)
        if args.debug:
            with torch.no_grad():
                softlabel_other = F.softmax(external_out, 1)
            softlabel_other = softlabel_other.detach()
        else:
            softlabel_other = F.softmax(external_out, 1)

        loss_soft = softXEnt(external_out1, softlabel_other)
        if args.debug:
            with torch.no_grad():
                softlabel_other1 = F.softmax(external_out1, 1)
            softlabel_other1 = softlabel_other1.detach()
        else:
            softlabel_other1 = F.softmax(external_out1, 1)

        loss_soft1 = softXEnt(external_out, softlabel_other1)
        loss_all = loss + loss1 + args.weight_lambda * (loss_soft1 + loss_soft)

        loss_all.backward()

        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        nn.utils.clip_grad_norm_(model1.parameters(), args.grad_clip)
        optimizer.step()
        optimizer1.step()

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)

        prec1, prec5 = utils.accuracy(logits1, target, topk=(1, 5))
        objs1.update(loss1.item(), n)
        top1_1.update(prec1.item(), n)
        top5_1.update(prec5.item(), n)

        if step % args.report_freq == 0:
            logging.info('train 1st %03d %e %f %f', step, objs.avg, top1.avg,
                         top5.avg)
            logging.info('train 2nd %03d %e %f %f', step, objs1.avg,
                         top1_1.avg, top5_1.avg)

    return top1.avg, objs.avg, top1_1.avg, objs1.avg
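softXEnt(input, target) above is a cross-entropy against the soft labels produced by the other model. A common implementation, assumed here as a sketch:

import torch.nn.functional as F

def softXEnt(input, target):
    """Cross-entropy with soft targets: batch mean of
    -sum_c target_c * log_softmax(input)_c (sketch)."""
    logprobs = F.log_softmax(input, dim=1)
    return -(target * logprobs).sum(dim=1).mean()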
Example #16
def train(train_queue,
          valid_queue,
          model,
          network_params,
          criterion,
          optimizer,
          optimizer_a,
          lr,
          train_arch=True):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    global baseline
    for step, (input, target) in enumerate(train_queue):
        model.train()
        n = input.size(0)
        input = input.cuda()
        target = target.cuda(non_blocking=True)
        # if step % 10 == 0:  # every 10 batches, run one RL round, sampling 10 sub-networks each time
        if True:  # currently the RL update runs on every batch
            if train_arch:
                # In the original DARTS implementation this is input_search, target_search = next(iter(valid_queue)),
                # which slows down training when using PyTorch 0.4 and above.
                try:
                    input_search, target_search = next(valid_queue_iter)
                except (NameError, StopIteration):  # iterator not created yet, or exhausted
                    valid_queue_iter = iter(valid_queue)
                    input_search, target_search = next(valid_queue_iter)
                input_search = input_search.cuda()
                target_search = target_search.cuda(non_blocking=True)
                normal_grad_buffer = []
                reduce_grad_buffer = []
                reward_buffer = []
                for batch_idx in range(rl_batch_size):  # sample several candidate networks and evaluate them
                    # sample the submodel
                    get_cur_model(model)
                    # cur_sub_model.cuda()
                    # cur_sub_model.drop_path_prob = 0
                    # validat the sub_model
                    with torch.no_grad():
                        # logits, _ = cur_sub_model(input_search)
                        logits = model(input_search)
                        prec1, _ = utils.accuracy(logits,
                                                  target_search,
                                                  topk=(1, 5))
                    if model.module._arch_parameters[0].grad is not None:
                        model.module._arch_parameters[0].grad.data.zero_()
                    if model.module._arch_parameters[1].grad is not None:
                        model.module._arch_parameters[1].grad.data.zero_()
                    obj_term = 0
                    for i in range(14):
                        obj_term = obj_term + model.module.normal_log_prob[i]
                        obj_term = obj_term + model.module.reduce_log_prob[i]
                    loss_term = -obj_term
                    # backward
                    loss_term.backward()
                    # take out gradient dict
                    normal_grad_list = []
                    reduce_grad_list = []
                    normal_grad_buffer.append(
                        model.module._arch_parameters[0].grad.data.clone())
                    reduce_grad_buffer.append(
                        model.module._arch_parameters[1].grad.data.clone())
                    reward_buffer.append(prec1)
                avg_reward = sum(reward_buffer) / rl_batch_size
                if baseline == 0:
                    baseline = avg_reward
                else:
                    baseline += baseline_decay_weight * (avg_reward - baseline)

                # for idx in range(14):
                model.module._arch_parameters[0].grad.data.zero_()
                model.module._arch_parameters[1].grad.data.zero_()
                for j in range(rl_batch_size):
                    model.module._arch_parameters[0].grad.data += (
                        reward_buffer[j] - baseline) * normal_grad_buffer[j]
                    model.module._arch_parameters[1].grad.data += (
                        reward_buffer[j] - baseline) * reduce_grad_buffer[j]
                model.module._arch_parameters[0].grad.data /= rl_batch_size
                model.module._arch_parameters[1].grad.data /= rl_batch_size
                # apply gradients
                optimizer_a.step()
                logging.info(
                    'REINFORCE [step %d]\t\tMean Reward %.4f\tBaseline %.4f',
                    step, avg_reward, baseline)
                model.module.restore_super_net()
                # print(model.module._arch_parameters[0])
                # print(model.module._arch_parameters[1])
        if not train_arch:
            # if 0:
            optimizer.zero_grad()
            logits = model(input)
            loss = criterion(logits, target)

            loss.backward()
            nn.utils.clip_grad_norm_(network_params, args.grad_clip)
            optimizer.step()

            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            objs.update(loss.data.item(), n)
            top1.update(prec1.data.item(), n)
            top5.update(prec5.data.item(), n)

            if step % args.report_freq == 0:
                logging.info('TRAIN Step: %03d Objs: %e R1: %f R5: %f', step,
                             objs.avg, top1.avg, top5.avg)
        else:
            top1.avg = 0
            objs.avg = 0
    return top1.avg, objs.avg
Example #17
def train(train_queue, model, cnn_optimizer, grad_scalar, global_step,
          warmup_iters, writer, logging):
    alpha_i = utils.kl_balancer_coeff(num_scales=model.num_latent_scales,
                                      groups_per_scale=model.groups_per_scale,
                                      fun='square')
    nelbo = utils.AvgrageMeter()
    model.train()
    for step, x in enumerate(train_queue):
        x = x[0] if len(x) > 1 else x
        x = x.half().cuda()

        # change bit length
        x = utils.pre_process(x, args.num_x_bits)

        # warm-up lr
        if global_step < warmup_iters:
            lr = args.learning_rate * float(global_step) / warmup_iters
            for param_group in cnn_optimizer.param_groups:
                param_group['lr'] = lr

        # sync parameters, it may not be necessary
        if step % 100 == 0:
            utils.average_params(model.parameters(), args.distributed)

        cnn_optimizer.zero_grad()
        with autocast():
            logits, log_q, log_p, kl_all, kl_diag = model(x)

            output = model.decoder_output(logits)
            kl_coeff = utils.kl_coeff(
                global_step, args.kl_anneal_portion * args.num_total_iter,
                args.kl_const_portion * args.num_total_iter,
                args.kl_const_coeff)

            recon_loss = utils.reconstruction_loss(output,
                                                   x,
                                                   crop=model.crop_output)
            balanced_kl, kl_coeffs, kl_vals = utils.kl_balancer(
                kl_all, kl_coeff, kl_balance=True, alpha_i=alpha_i)

            nelbo_batch = recon_loss + balanced_kl
            loss = torch.mean(nelbo_batch)
            norm_loss = model.spectral_norm_parallel()
            bn_loss = model.batchnorm_loss()
            # get spectral regularization coefficient (lambda)
            if args.weight_decay_norm_anneal:
                assert args.weight_decay_norm_init > 0 and args.weight_decay_norm > 0, 'init and final wdn should be positive.'
                wdn_coeff = (1. - kl_coeff) * np.log(
                    args.weight_decay_norm_init) + kl_coeff * np.log(
                        args.weight_decay_norm)
                wdn_coeff = np.exp(wdn_coeff)
            else:
                wdn_coeff = args.weight_decay_norm

            loss += norm_loss * wdn_coeff + bn_loss * wdn_coeff

        grad_scalar.scale(loss).backward()
        utils.average_gradients(model.parameters(), args.distributed)
        grad_scalar.step(cnn_optimizer)
        grad_scalar.update()
        nelbo.update(loss.data, 1)

        if (global_step + 1) % 100 == 0:
            if (global_step + 1) % 1000 == 0:  # reduced frequency
                n = int(np.floor(np.sqrt(x.size(0))))
                x_img = x[:n * n]
                output_img = output.mean if isinstance(
                    output, torch.distributions.bernoulli.Bernoulli
                ) else output.sample()
                output_img = output_img[:n * n]
                x_tiled = utils.tile_image(x_img, n)
                output_tiled = utils.tile_image(output_img, n)
                in_out_tiled = torch.cat((x_tiled, output_tiled), dim=2)
                writer.add_image('reconstruction', in_out_tiled, global_step)

            # norm
            writer.add_scalar('train/norm_loss', norm_loss, global_step)
            writer.add_scalar('train/bn_loss', bn_loss, global_step)
            writer.add_scalar('train/norm_coeff', wdn_coeff, global_step)

            utils.average_tensor(nelbo.avg, args.distributed)
            logging.info('train %d %f', global_step, nelbo.avg)
            writer.add_scalar('train/nelbo_avg', nelbo.avg, global_step)
            writer.add_scalar(
                'train/lr',
                cnn_optimizer.state_dict()['param_groups'][0]['lr'],
                global_step)
            writer.add_scalar('train/nelbo_iter', loss, global_step)
            writer.add_scalar('train/kl_iter', torch.mean(sum(kl_all)),
                              global_step)
            writer.add_scalar(
                'train/recon_iter',
                torch.mean(
                    utils.reconstruction_loss(output,
                                              x,
                                              crop=model.crop_output)),
                global_step)
            writer.add_scalar('kl_coeff/coeff', kl_coeff, global_step)
            total_active = 0
            for i, kl_diag_i in enumerate(kl_diag):
                utils.average_tensor(kl_diag_i, args.distributed)
                num_active = torch.sum(kl_diag_i > 0.1).detach()
                total_active += num_active

                # kl_coeff
                writer.add_scalar('kl/active_%d' % i, num_active, global_step)
                writer.add_scalar('kl_coeff/layer_%d' % i, kl_coeffs[i],
                                  global_step)
                writer.add_scalar('kl_vals/layer_%d' % i, kl_vals[i],
                                  global_step)
            writer.add_scalar('kl/total_active', total_active, global_step)

        global_step += 1

    utils.average_tensor(nelbo.avg, args.distributed)
    return nelbo.avg, global_step
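utils.kl_coeff(...) anneals the KL weight from a small constant up to 1 over a portion of training. A sketch of such a schedule with the same signature as the call above, assuming a constant warm-up followed by linear annealing (the real helper may differ):

def kl_coeff(step, total_anneal_steps, constant_steps, min_kl_coeff):
    """Linearly anneal the KL weight from min_kl_coeff to 1.0 (sketch)."""
    if step < constant_steps:
        return min_kl_coeff
    # fraction of the annealing window that has elapsed so far
    frac = (step - constant_steps) / max(total_anneal_steps - constant_steps, 1)
    return max(min(frac, 1.0), min_kl_coeff)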
def train_and_evaluate_top_on_imagenet(archs, train_queue, valid_queue):
    res = []
    train_criterion = nn.CrossEntropyLoss().cuda()
    eval_criterion = nn.CrossEntropyLoss().cuda()
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    for i, arch in enumerate(archs):
        objs.reset()
        top1.reset()
        top5.reset()
        logging.info('Train and evaluate the {} arch'.format(i + 1))
        model = NASNetworkImageNet(args, 1000, args.child_layers,
                                   args.child_nodes, args.child_channels, 1.0,
                                   1.0, True, args.steps, arch)
        model = model.cuda()
        model.train()
        optimizer = torch.optim.SGD(
            model.parameters(),
            args.child_lr,
            momentum=0.9,
            weight_decay=args.child_l2_reg,
        )
        for step, (input, target) in enumerate(train_queue):
            input = input.cuda().requires_grad_()
            target = target.cuda()

            optimizer.zero_grad()
            # sample an arch to train
            logits, aux_logits = model(input, step)
            loss = train_criterion(logits, target)
            if aux_logits is not None:
                aux_loss = train_criterion(aux_logits, target)
                loss += 0.4 * aux_loss
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.child_grad_bound)
            optimizer.step()

            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            objs.update(loss.data, n)
            top1.update(prec1.data, n)
            top5.update(prec5.data, n)

            if (step + 1) % 100 == 0:
                logging.info('Train %03d loss %e top1 %f top5 %f', step + 1,
                             objs.avg, top1.avg, top5.avg)
            if step + 1 == 500:
                break

        objs.reset()
        top1.reset()
        top5.reset()
        with torch.no_grad():
            model.eval()
            for step, (input, target) in enumerate(valid_queue):
                input = input.cuda()
                target = target.cuda()

                logits, _ = model(input)
                loss = eval_criterion(logits, target)

                prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
                n = input.size(0)
                objs.update(loss.data, n)
                top1.update(prec1.data, n)
                top5.update(prec5.data, n)

                if (step + 1) % 100 == 0:
                    logging.info('valid %03d %e %f %f', step + 1, objs.avg,
                                 top1.avg, top5.avg)
        res.append(top1.avg)
    return res
Example #19
def train(train_queue,
          valid_queue,
          model,
          architect,
          criterion,
          optimizer,
          lr,
          epoch,
          grad_clip,
          report_lines,
          unrolled,
          criterion_weight=1.0,
          l1_weight=-1,
          l2_weight=-1):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()

    criterion_loss = torch.zeros(1)
    l1_loss = torch.zeros(1)
    l2_loss = torch.zeros(1)
    for step, (input, target) in enumerate(train_queue):
        model.train()
        n = input.size(0)
        input = input.cuda()
        target = target.cuda(non_blocking=True)

        # get a random minibatch from the search queue with replacement
        # input_search, target_search = next(iter(valid_queue))
        try:
            input_search, target_search = next(valid_queue_iter)
        except (NameError, StopIteration):  # iterator not created yet, or exhausted
            valid_queue_iter = iter(valid_queue)
            input_search, target_search = next(valid_queue_iter)
        input_search = input_search.cuda()
        target_search = target_search.cuda(non_blocking=True)

        if epoch >= 15:
            architect.step(input,
                           target,
                           input_search,
                           target_search,
                           lr,
                           optimizer,
                           unrolled=unrolled)

        optimizer.zero_grad()
        logits = model(input)
        criterion_loss = criterion(logits, target)
        loss = criterion_weight * criterion_loss
        if l1_weight >= 0:
            l1_loss = param_loss(model, nn.L1Loss(reduction='sum'))
            loss += l1_weight * l1_loss
        if l2_weight >= 0:
            l2_loss = param_loss(model, nn.MSELoss(reduction='sum'))
            loss += l2_weight * l2_loss

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        objs.update(loss.data.item(), n)
        top1.update(prec1.data.item(), n)
        top5.update(prec5.data.item(), n)

        if step % (len(train_queue) // report_lines) == 0:
            log.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)

    return top1.avg, objs.avg, l1_loss, l2_loss, criterion_loss
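param_loss(model, loss_fn) supplies the optional L1/L2 weight penalties above. A plausible sketch, assuming it sums the given elementwise loss of every parameter against zero:

import torch

def param_loss(model, loss_fn):
    """Sum loss_fn(param, 0) over all model parameters (sketch of the
    assumed helper behind the L1/L2 weight penalties above)."""
    return sum(loss_fn(p, torch.zeros_like(p)) for p in model.parameters())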
Example #20
    def train(self, epoch, logging):
        objs = utils.AvgrageMeter()
        top1 = utils.AvgrageMeter()
        top5 = utils.AvgrageMeter()
        grad = utils.AvgrageMeter()

        normal_resource_gradient = 0
        reduce_resource_gradient = 0
        normal_loss_gradient = 0
        reduce_loss_gradient = 0
        normal_total_gradient = 0
        reduce_total_gradient = 0

        loss_alpha = None

        count = 0
        for step, (input, target) in enumerate(self.train_queue):
            if self.args.alternate_update:
                if step % 2 == 0:
                    self.update_theta = True
                    self.update_alpha = False
                else:
                    self.update_theta = False
                    self.update_alpha = True

            n = input.size(0)
            input = input.to(self.device)
            target = target.to(self.device, non_blocking=True)
            if self.args.snas:
                logits, logits_aux, penalty, op_normal, op_reduce = self.model(
                    input)
                error_loss = self.criterion(logits, target)
                if self.args.auxiliary:
                    loss_aux = self.criterion(logits_aux, target)
                    error_loss += self.args.auxiliary_weight * loss_aux

            if self.args.dsnas:
                logits, error_loss, loss_alpha, penalty = self.model(
                    input, target, self.criterion)

            num_normal = self.model.num_normal
            num_reduce = self.model.num_reduce
            normal_arch_entropy = self.model._arch_entropy(
                self.model.normal_log_alpha)
            reduce_arch_entropy = self.model._arch_entropy(
                self.model.reduce_log_alpha)

            if self.args.resource_efficient:
                if self.args.method == 'policy_gradient':
                    resource_penalty = (penalty[2]) / 6 + self.args.ratio * (
                        penalty[7]) / 2
                    log_resource_penalty = (
                        penalty[35]) / 6 + self.args.ratio * (penalty[36]) / 2
                elif self.args.method == 'reparametrization':
                    resource_penalty = (penalty[26]) / 6 + self.args.ratio * (
                        penalty[25]) / 2
                    log_resource_penalty = (
                        penalty[37]) / 6 + self.args.ratio * (penalty[38]) / 2
                elif self.args.method == 'discrete':
                    resource_penalty = (penalty[28]) / 6 + self.args.ratio * (
                        penalty[27]) / 2
                    log_resource_penalty = (
                        penalty[39]) / 6 + self.args.ratio * (penalty[40]) / 2
                elif self.args.method == 'none':
                    # TODO
                    resource_penalty = torch.zeros(1).cuda()
                    log_resource_penalty = torch.zeros(1).cuda()
                else:
                    logging.info(
                        "invalid --method: choose one of 'policy_gradient', "
                        "'discrete', 'reparametrization', 'none'")
                    sys.exit(1)
            else:
                resource_penalty = torch.zeros(1).cuda()
                log_resource_penalty = torch.zeros(1).cuda()

            if self.args.log_penalty:
                resource_loss = self.model._resource_lambda * log_resource_penalty
            else:
                resource_loss = self.model._resource_lambda * resource_penalty

            if self.args.loss:
                if self.args.snas:
                    loss = resource_loss.clone() + error_loss.clone()
                elif self.args.dsnas:
                    loss = resource_loss.clone()
                else:
                    loss = resource_loss.clone() + -child_coef * (
                        torch.log(normal_one_hot_prob) +
                        torch.log(reduce_one_hot_prob)).sum()
            else:
                if self.args.snas or self.args.dsnas:
                    loss = error_loss.clone()

            if self.args.distributed:
                loss.div_(self.world_size)
                error_loss.div_(self.world_size)
                resource_loss.div_(self.world_size)
                if self.args.dsnas:
                    loss_alpha.div_(self.world_size)

            # logging gradient
            count += 1
            if self.args.resource_efficient:
                self.optimizer.zero_grad()
                self.arch_optimizer.zero_grad()
                resource_loss.backward(retain_graph=True)
                if not self.args.random_sample:
                    normal_resource_gradient += self.model.normal_log_alpha.grad
                    reduce_resource_gradient += self.model.reduce_log_alpha.grad
            if self.args.snas:
                self.optimizer.zero_grad()
                self.arch_optimizer.zero_grad()
                error_loss.backward(retain_graph=True)
                if not self.args.random_sample:
                    normal_loss_gradient += self.model.normal_log_alpha.grad
                    reduce_loss_gradient += self.model.reduce_log_alpha.grad
                self.optimizer.zero_grad()
                self.arch_optimizer.zero_grad()

            if self.args.snas or (not self.args.random_sample and not self.args.dsnas):
                loss.backward()
            if not self.args.random_sample:
                normal_total_gradient += self.model.normal_log_alpha.grad
                reduce_total_gradient += self.model.reduce_log_alpha.grad

            if self.args.distributed:
                reduce_tensorgradients(self.model.parameters(), sync=True)
                nn.utils.clip_grad_norm_([
                    param for name, param in self.model.named_parameters() if
                    name != 'normal_log_alpha' and name != 'reduce_log_alpha'
                ], self.args.grad_clip)
                arch_grad_norm = nn.utils.clip_grad_norm_([
                    param for name, param in self.model.named_parameters()
                    if name == 'normal_log_alpha' or name == 'reduce_log_alpha'
                ], 10.)
            else:
                nn.utils.clip_grad_norm_(self.model.parameters(),
                                         self.args.grad_clip)
                arch_grad_norm = nn.utils.clip_grad_norm_(
                    self.model.arch_parameters(), 10.)

            grad.update(arch_grad_norm)
            if not self.args.fix_weight and self.update_theta:
                self.optimizer.step()
            self.optimizer.zero_grad()
            if not self.args.random_sample and self.update_alpha:
                self.arch_optimizer.step()
            self.arch_optimizer.zero_grad()

            if self.rank == 0:
                self.logger.add_scalar(
                    "iter_train_loss", error_loss,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "normal_arch_entropy", normal_arch_entropy,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "reduce_arch_entropy", reduce_arch_entropy,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "total_arch_entropy",
                    normal_arch_entropy + reduce_arch_entropy,
                    step + len(self.train_queue.dataset) * epoch)
                if self.args.dsnas:
                    # per-edge rewards for the normal cell
                    for i in range(14):
                        self.logger.add_scalar(
                            "reward_normal_edge_%d" % i,
                            self.model.normal_edge_reward[i],
                            step + len(self.train_queue.dataset) * epoch)
                    # per-edge rewards for the reduce cell
                    for i in range(14):
                        self.logger.add_scalar(
                            "reward_reduce_edge_%d" % i,
                            self.model.reduce_edge_reward[i],
                            step + len(self.train_queue.dataset) * epoch)
                #policy size
                self.logger.add_scalar(
                    "iter_normal_size_policy", penalty[2] / num_normal,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_reduce_size_policy", penalty[7] / num_reduce,
                    step + len(self.train_queue.dataset) * epoch)
                # baseline: discrete_probability
                self.logger.add_scalar(
                    "iter_normal_size_baseline", penalty[3] / num_normal,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_normal_flops_baseline", penalty[5] / num_normal,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_normal_mac_baseline", penalty[6] / num_normal,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_reduce_size_baseline", penalty[8] / num_reduce,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_reduce_flops_baseline", penalty[9] / num_reduce,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_reduce_mac_baseline", penalty[10] / num_reduce,
                    step + len(self.train_queue.dataset) * epoch)
                # R - median(R)
                self.logger.add_scalar(
                    "iter_normal_size-avg", penalty[60] / num_normal,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_normal_flops-avg", penalty[61] / num_normal,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_normal_mac-avg", penalty[62] / num_normal,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_reduce_size-avg", penalty[63] / num_reduce,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_reduce_flops-avg", penalty[64] / num_reduce,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_reduce_mac-avg", penalty[65] / num_reduce,
                    step + len(self.train_queue.dataset) * epoch)
                # lnR - ln(median)
                self.logger.add_scalar(
                    "iter_normal_ln_size-ln_avg", penalty[66] / num_normal,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_normal_ln_flops-ln_avg", penalty[67] / num_normal,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_normal_ln_mac-ln_avg", penalty[68] / num_normal,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_reduce_ln_size-ln_avg", penalty[69] / num_reduce,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_reduce_ln_flops-ln_avg", penalty[70] / num_reduce,
                    step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar(
                    "iter_reduce_ln_mac-ln_avg", penalty[71] / num_reduce,
                    step + len(self.train_queue.dataset) * epoch)
                '''
                self.logger.add_scalar("iter_normal_size_normalized", penalty[17] / 6, step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar("iter_normal_flops_normalized", penalty[18] / 6, step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar("iter_normal_mac_normalized", penalty[19] / 6, step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar("iter_reduce_size_normalized", penalty[20] / 2, step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar("iter_reduce_flops_normalized", penalty[21] / 2, step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar("iter_reduce_mac_normalized", penalty[22] / 2, step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar("iter_normal_penalty_normalized", penalty[23] / 6,
                                  step + len(self.train_queue.dataset) * epoch)
                self.logger.add_scalar("iter_reduce_penalty_normalized", penalty[24] / 2,
                                  step + len(self.train_queue.dataset) * epoch)
                '''
                # Per-cell penalty terms logged to TensorBoard, each averaged
                # over the normal / reduce cells respectively.
                global_step = step + len(self.train_queue.dataset) * epoch
                penalty_scalars = [
                    # Monte_Carlo(R_i)
                    ("iter_normal_size_mc", 29, num_normal),
                    ("iter_normal_flops_mc", 30, num_normal),
                    ("iter_normal_mac_mc", 31, num_normal),
                    ("iter_reduce_size_mc", 32, num_reduce),
                    ("iter_reduce_flops_mc", 33, num_reduce),
                    ("iter_reduce_mac_mc", 34, num_reduce),
                    # log(|R_i|)
                    ("iter_normal_log_size", 41, num_normal),
                    ("iter_normal_log_flops", 42, num_normal),
                    ("iter_normal_log_mac", 43, num_normal),
                    ("iter_reduce_log_size", 44, num_reduce),
                    ("iter_reduce_log_flops", 45, num_reduce),
                    ("iter_reduce_log_mac", 46, num_reduce),
                    # log(P)R_i
                    ("iter_normal_logP_size", 47, num_normal),
                    ("iter_normal_logP_flops", 48, num_normal),
                    ("iter_normal_logP_mac", 49, num_normal),
                    ("iter_reduce_logP_size", 50, num_reduce),
                    ("iter_reduce_logP_flops", 51, num_reduce),
                    ("iter_reduce_logP_mac", 52, num_reduce),
                    # log(P)log(R_i)
                    ("iter_normal_logP_log_size", 53, num_normal),
                    ("iter_normal_logP_log_flops", 54, num_normal),
                    ("iter_normal_logP_log_mac", 55, num_normal),
                    ("iter_reduce_logP_log_size", 56, num_reduce),
                    ("iter_reduce_logP_log_flops", 57, num_reduce),
                    ("iter_reduce_logP_log_mac", 58, num_reduce),
                ]
                for tag, idx, denom in penalty_scalars:
                    self.logger.add_scalar(tag, penalty[idx] / denom, global_step)

            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))

            if self.args.distributed:
                loss = loss.detach()
                dist.all_reduce(error_loss)
                dist.all_reduce(prec1)
                dist.all_reduce(prec5)
                # average (not just sum) the logged metrics across ranks
                error_loss.div_(self.world_size)
                prec1.div_(self.world_size)
                prec5.div_(self.world_size)
                # dist_util.all_reduce([loss, prec1, prec5], 'mean')
            objs.update(error_loss.item(), n)
            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)

            if step % self.args.report_freq == 0 and self.rank == 0:
                logging.info('train %03d %e %f %f', step, objs.avg, top1.avg,
                             top5.avg)
                self.logger.add_scalar(
                    "iter_train_top1_acc", top1.avg,
                    step + len(self.train_queue.dataset) * epoch)

        if self.rank == 0:
            logging.info('-------resource gradient--------')
            logging.info(normal_resource_gradient / count)
            logging.info(reduce_resource_gradient / count)
            logging.info('-------loss gradient--------')
            logging.info(normal_loss_gradient / count)
            logging.info(reduce_loss_gradient / count)
            logging.info('-------total gradient--------')
            logging.info(normal_total_gradient / count)
            logging.info(reduce_total_gradient / count)

        return top1.avg, loss, error_loss, loss_alpha
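
Every snippet on this page accumulates loss and accuracy through utils.AvgrageMeter, which is never shown. A minimal sketch of a running-average meter with the same update(val, n) / avg interface (an assumption based purely on how it is called above) could look like this:

class AvgrageMeter(object):
    """Running weighted average, e.g. of a per-batch loss or accuracy."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.avg = 0.0
        self.sum = 0.0
        self.cnt = 0

    def update(self, val, n=1):
        # `val` is a per-batch mean; weight it by the batch size `n`
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt
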
Beispiel #21
0
def train(train_queue, model, margin, criterion, optimizer, epoch):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    batch_time = utils.AvgrageMeter()
    model.train()

    for step, (input, target) in enumerate(train_queue):
        target = target.cuda(non_blocking=True)
        input = input.cuda(non_blocking=True)
        b_start = time.time()
        optimizer.zero_grad()
        if args.auxiliary:
            # assumes the network exposes an auxiliary head as a second output;
            # otherwise logits_aux below would be undefined
            logits, logits_aux = model(input)
        else:
            logits = model(input)
        thetas = margin(logits, target)
        loss = criterion(thetas, target)
        if args.auxiliary:
            loss_aux = criterion(logits_aux, target)
            loss += args.auxiliary_weight * loss_aux

        loss.backward()
        # with amp.scale_loss(loss, optimizer) as scaled_loss:
        #     scaled_loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()
        batch_time.update(time.time() - b_start)
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        n = input.size(0)
        objs.update(loss.data.item(), n)
        top1.update(prec1.data.item(), n)
        top5.update(prec5.data.item(), n)

        if step % args.report_freq == 0:
            end_time = time.time()
            if step == 0:
                duration = 0
                start_time = time.time()
            else:
                duration = end_time - start_time
                start_time = time.time()
            logging.info(
                'TRAIN Step: %03d Objs: %e R1: %f R5: %f Duration: %ds BTime: %.3fs',
                step, objs.avg, top1.avg, top5.avg, duration, batch_time.avg)
        if step % 5000 == 0:
            valid_acc_top1 = infer(data_loaders, dataset, model, margin, epoch)
            global best_acc_top1
            is_best = False
            if valid_acc_top1 > best_acc_top1:
                best_acc_top1 = valid_acc_top1
                is_best = True

            state = {
                'epoch': epoch + 1,
                'model': model.module.state_dict(),
                'margin': margin.module.state_dict(),
                'best_acc_top1': best_acc_top1,
                'optimizer': optimizer.state_dict(),
            }

            if is_best:
                filename = os.path.join('./', 'best_model.pth.tar')
                torch.save(state, filename)
                torch.save(model.state_dict(), './model.pt')
                torch.save(margin.state_dict(), './margin.pt')

                filename = os.path.join('./', 'checkpoint.pth.tar')
                torch.save(state, filename)
            else:
                filename = os.path.join('./', 'checkpoint.pth.tar')
                torch.save(state, filename)

    return top1.avg, objs.avg
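
The prec1 / prec5 values used throughout come from utils.accuracy. A self-contained sketch of such a top-k accuracy helper (assuming it returns percentages, as the "R1: %f" logging above suggests) is:

import torch


def accuracy(output, target, topk=(1,)):
    """Top-k accuracy of `output` logits against integer `target` labels, in percent."""
    maxk = max(topk)
    batch_size = target.size(0)

    # indices of the maxk highest-scoring classes per sample, transposed to (maxk, batch)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
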
Beispiel #22
0
def train(train_queue, valid_queue, model, architect, criterion, optimizer,
          lr):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()

    model.sample()
    train_queue.dataset.weights_index = model.ops_weights_b
    train_queue.dataset.probabilities_index = model.probabilities_b
    for step, (input, target) in enumerate(train_queue):
        model.train()
        model.set_augmenting(True)
        n = input.size(0)

        # input = Variable(input, requires_grad=False).cuda()
        # target = Variable(target, requires_grad=False).cuda(async=True)
        # input = [Variable(img, requires_grad=False).cuda() for img in input]
        input = Variable(input, requires_grad=False).cuda()
        target = Variable(target, requires_grad=False).cuda(non_blocking=True)
        # trans_images_list = [ [Variable(trans_image, requires_grad=False).cuda()
        #                         for trans_image in trans_images]
        #                       for trans_images in trans_images_list]

        # get a random minibatch from the search queue with replacement
        input_search, target_search = next(iter(valid_queue))
        input_search = Variable(input_search, requires_grad=False).cuda()
        target_search = Variable(target_search,
                                 requires_grad=False).cuda(non_blocking=True)
        # input_search = Variable(input_search, requires_grad=False).cuda()
        # target_search = Variable(target_search, requires_grad=False).cuda(async=True)

        architect.step(input,
                       target,
                       input_search,
                       target_search,
                       lr,
                       optimizer,
                       unrolled=args.unrolled)

        optimizer.zero_grad()
        logits = model(input)
        loss = criterion(logits, target)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()
        prec1, prec5 = utils.accuracy(logits.detach(),
                                      target.detach(),
                                      topk=(1, 5))
        # objs.update(loss.data[0], n)
        # top1.update(prec1.data[0], n)
        # top5.update(prec5.data[0], n)
        objs.update(loss.detach().item(), n)
        top1.update(prec1.detach().item(), n)
        top5.update(prec5.detach().item(), n)

        if step % args.report_freq == 0:
            logging.info('train %03d %e %f %f', step, objs.avg, top1.avg,
                         top5.avg)
        model.sample()
        # train_queue.dataset.weights_index = model.sample_ops_weights_index
        # train_queue.dataset.probabilities_index = model.sample_probabilities_index
        train_queue.dataset.weights_index = model.ops_weights_b
        train_queue.dataset.probabilities_index = model.probabilities_b

    return top1.avg, objs.avg
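
Several of these search loops fetch a validation batch with next(iter(valid_queue)), which rebuilds the DataLoader iterator on every step. A small cycling helper (a sketch, not part of the original code) keeps one iterator alive and restarts it only when exhausted:

def cycle(loader):
    """Yield batches from a DataLoader indefinitely, restarting it when exhausted."""
    while True:
        for batch in loader:
            yield batch


# usage sketch inside the training loop:
# valid_iter = cycle(valid_queue)
# input_search, target_search = next(valid_iter)
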
Beispiel #23
0
def train(epoch, train_queue, valid_queue, model, architect, criterion,
          optimizer, metrics, scheduler, analyser):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    lr = scheduler.lr_vector
    layers_todo = metrics.layers_index_todo

    for step, (input, target) in enumerate(train_queue):
        # one mini-batch
        logging.info('train mini batch %03d', step)
        model.train()
        n = input.size(0)

        # input = Variable(input, requires_grad=False).cuda()
        # target = Variable(target, requires_grad=False).cuda(async=True)
        input = Variable(input, requires_grad=False).to(device)
        target = Variable(target, requires_grad=False).to(device)

        # get a random minibatch from the search queue with replacement
        input_search, target_search = next(iter(valid_queue))
        # input_search = Variable(input_search, requires_grad=False).cuda()
        # target_search = Variable(target_search, requires_grad=False).cuda(async=True)
        input_search = Variable(input_search, requires_grad=False).to(device)
        target_search = Variable(target_search, requires_grad=False).to(device)

        logging.info('update arch...')
        architect.step(input,
                       target,
                       input_search,
                       target_search,
                       lr,
                       layers_todo,
                       optimizer,
                       unrolled=args.unrolled)

        logging.info('update weights...')
        optimizer.zero_grad()
        """gdas"""
        logits = model.forward(input, gumbel=args.gumbel)
        loss = criterion(logits, target)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        # optimizer.step()
        ################################################################################
        # AdaS: update optimizer
        optimizer.step(layers_todo, scheduler.lr_vector)
        ################################################################################

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)

        if step % args.report_freq == 0:
            logging.info('train %03d %e %f %f', step, objs.avg, top1.avg,
                         top5.avg)

    if args.compute_hessian:
        _data_loader = deepcopy(train_queue)
        input, target = next(iter(_data_loader))

        # input = Variable(input, requires_grad=False).cuda()
        # target = Variable(target, requires_grad=False).cuda(async=True)
        input = Variable(input, requires_grad=False).to(device)
        target = Variable(target, requires_grad=False).to(device)

        # get gradient information
        # param_grads = [p.grad for p in model.parameters() if p.grad is not None]
        # param_grads = torch.cat([x.view(-1) for x in param_grads])
        # param_grads = param_grads.cpu().data.numpy()
        # grad_norm = np.linalg.norm(param_grads)

        # gradient_vector = torch.cat([x.view(-1) for x in gradient_vector])
        # grad_norm = LA.norm(gradient_vector.cpu())
        # logging.info('\nCurrent grad norm based on Train Dataset: %.4f',
        #             grad_norm)
        # logging.info('Compute Hessian start')
        H = analyser.compute_Hw(input,
                                target,
                                input_search,
                                target_search,
                                lr,
                                layers_todo,
                                optimizer,
                                unrolled=False)
        # g = analyser.compute_dw(input, target, input_search, target_search,
        #                         lr, layers_todo, optimizer, unrolled=False)
        # g = torch.cat([x.view(-1) for x in g])

        del _data_loader
        # logging.info('Compute Hessian finished')
        # HESSIAN_STATISTICS[f'hessian_epoch{epoch}'] = weights_normal[:, 0]
        hessian_file = "../save_data/hessian_adas_c100_{0}_epoch_{1}".format(
            args.file_name, epoch)
        np.save(hessian_file, H.cpu().data.numpy())
        # logging.info('Writing Hessian finished')

    return top1.avg, objs.avg
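
The example above saves the Hessian returned by analyser.compute_Hw for offline analysis. As an illustration only (not the analyser API), the dominant eigenvalue of a loss Hessian can be estimated with power iteration over Hessian-vector products, which autograd supports directly:

import torch


def dominant_hessian_eigenvalue(loss, params, iters=20):
    """Estimate the largest Hessian eigenvalue of `loss` w.r.t. `params`
    by power iteration on Hessian-vector products (illustrative sketch,
    not the analyser.compute_Hw interface used above)."""
    params = [p for p in params if p.requires_grad]
    grads = torch.autograd.grad(loss, params, create_graph=True)

    # random unit start vector
    v = [torch.randn_like(p) for p in params]
    norm = torch.sqrt(sum((vi ** 2).sum() for vi in v))
    v = [vi / norm for vi in v]

    eig = 0.0
    for _ in range(iters):
        # Hessian-vector product: d/dp (grad(loss) . v)
        gv = sum((g * vi).sum() for g, vi in zip(grads, v))
        hv = torch.autograd.grad(gv, params, retain_graph=True)
        eig = sum((h * vi).sum() for h, vi in zip(hv, v)).item()  # Rayleigh quotient
        norm = torch.sqrt(sum((h ** 2).sum() for h in hv))
        v = [h / (norm + 1e-12) for h in hv]
    return eig
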
Beispiel #24
0
                          train=False,
                          download=True,
                          transform=train_transform)

num_train = len(train_data)
indices = list(range(num_train))
split = int(np.floor(0.5 * num_train))

train_queue = torch.utils.data.DataLoader(
    train_data,
    batch_size=64,
    sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:]),
    pin_memory=True,
    num_workers=2)

top1 = utils.AvgrageMeter()
top5 = utils.AvgrageMeter()

for step, (input, target) in enumerate(train_queue):
    input = Variable(input).cuda()
    target = Variable(target).cuda()

    input_pert = ifgsm(model,
                       input,
                       target,
                       epsilon=args.eps,
                       niters=args.niters,
                       learning_rate=args.adv_rate)
    input_pert = input_pert.detach()
    logits = model(input_pert)
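
The ifgsm call above is not defined in this snippet. A minimal sketch of an iterative FGSM attack with the same (model, input, target, epsilon, niters, learning_rate) signature (an assumption; the original helper may differ, e.g. in how inputs are normalized) is:

import torch
import torch.nn.functional as F


def ifgsm(model, input, target, epsilon=8 / 255, niters=7, learning_rate=2 / 255):
    """Iterative FGSM: step along the sign of the input gradient and project
    back into an L-infinity ball of radius `epsilon` around the clean input."""
    x_adv = input.clone().detach()
    for _ in range(niters):
        x_adv.requires_grad_(True)
        loss = F.cross_entropy(model(x_adv), target)
        grad, = torch.autograd.grad(loss, x_adv)
        with torch.no_grad():
            x_adv = x_adv + learning_rate * grad.sign()
            # project back into the epsilon-ball; the [0, 1] clamp assumes
            # un-normalized image inputs and should be dropped otherwise
            x_adv = input + torch.clamp(x_adv - input, -epsilon, epsilon)
            x_adv = x_adv.clamp(0.0, 1.0)
    return x_adv.detach()
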
Beispiel #25
0
def infer(valid_queue, model, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.eval()
    preds = np.asarray([])
    targets = np.asarray([])

    for step, (input, target) in enumerate(valid_queue):
        input = input.cuda()
        target = target.cuda(non_blocking=True)
        with torch.no_grad():
            logits = model(input)
            loss = criterion(logits, target)

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        n = input.size(0)
        #objs.update(loss.data[0], n)
        #top1.update(prec1.data[0], n)
        #top5.update(prec5.data[0], n)
        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)

        # my change: collect hard predictions and targets for the sklearn report below
        output = logits
        _, predicted = torch.max(output.data, 1)
        preds = np.concatenate((preds, predicted.cpu().numpy().ravel()))
        targets = np.concatenate((targets, target.data.cpu().numpy().ravel()))

        if step % args.report_freq == 0:
            logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg,
                         top5.avg)

    print(preds.shape)
    print(targets.shape)
    print('np.unique(targets):', np.unique(targets))
    print('np.unique(preds): ', np.unique(preds))
    from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                                 classification_report, confusion_matrix)
    print('accuracy:', accuracy_score(targets, preds))
    cr = classification_report(targets, preds, output_dict=True)
    a1, a2, a3 = cr['macro avg']['f1-score'], cr['macro avg']['precision'], cr[
        'macro avg']['recall']
    topover = (a1 + a2 + a3) / 3
    print(classification_report(targets, preds))
    print('balanced accuracy:', balanced_accuracy_score(targets, preds))
    matrix = confusion_matrix(targets, preds)
    print('per-class accuracy:', matrix.diagonal() / matrix.sum(axis=1))
    print(matrix)

    return top1.avg, objs.avg
def main():
    args = parse_args()
    preparelog(args)
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    torch.backends.cudnn.benchmark = True
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    controller = Controller(args)
    controller.cuda()

    controller_optimizer = torch.optim.Adam(
        controller.parameters(),
        args.controller_lr,
        betas=(0.1, 0.999),
        eps=1e-3,
    )

    train_loader, valid_loader, test_loader = get_loaders(args)
    total_loss = utils.AvgrageMeter()
    total_reward = utils.AvgrageMeter()
    total_entropy = utils.AvgrageMeter()

    base_model = build_basemodel()
    baseline = model_evaluate(base_model, valid_loader)

    controller.train()
    for step in range(args.total_iter):
        controller_optimizer.zero_grad()
        model_para, log_prob, entropy = controller()

        model = model_transform(base_model, model_para)
        model_finetune(model, train_loader)
        with torch.no_grad():
            reward = model_evaluate(model, valid_loader)

        #if args.entropy_weight is not None:
        #    reward += args.entropy_weight*entropy

        log_prob = torch.sum(log_prob)
        # REINFORCE: maximize E[log_prob * (reward - baseline)], so minimize its negative
        loss = -log_prob * (reward - baseline)
        loss = loss.sum()
        loss.backward()
        controller_optimizer.step()

        total_loss.update(loss.item(), 1)
        total_reward.update(reward.item(), 1)
        total_entropy.update(entropy.item(), 1)

        if step % args.report_freq == 0:
            #logging.info('controller %03d %e %f %f', step, loss.item(), reward.item(), baseline.item())
            logging.info('controller %03d %e %f %f', step, total_loss.avg,
                         total_reward.avg, baseline.item())
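
The controller update above is plain REINFORCE with a fixed baseline. A compact sketch of the same update with a hypothetical moving-average baseline (the function name and the bl_decay parameter are illustrative, not part of the original script) is:

import torch


def reinforce_step(log_prob, reward, baseline, optimizer, bl_decay=0.95):
    """One policy-gradient step: maximize E[log_prob * (reward - baseline)]
    by minimizing its negative, then update a moving-average baseline."""
    advantage = reward - baseline
    loss = -(torch.sum(log_prob) * advantage)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # exponential moving average of the observed reward becomes the next baseline
    baseline = bl_decay * baseline + (1 - bl_decay) * float(reward)
    return loss.item(), baseline
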
Beispiel #27
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--fast-run', action='store_true', default=False)
    parser.add_argument('--local', action='store_true', default=False)
    parser.add_argument('-c',
                        '--continue',
                        dest='continue_path',
                        required=False)
    args = parser.parse_args()

    cudnn.benchmark = True
    cudnn.enabled = True

    net = model.Network()
    # logger.info(net)
    net = nn.DataParallel(net).cuda()

    # create session
    sess = Session(train_spec, net=net)

    # worklog = WorklogLogger(os.path.join(sess.log_dir, 'worklog.txt'))

    criterion = utils.Denseloss(dropout=5)
    criterion = criterion.cuda()

    all_parameters = net.parameters()

    optimizer = torch.optim.Adam([{
        'params': all_parameters,
        'weight_decay': train_spec.weight_decay,
        'lr': train_spec.learning_rate
    }], )

    adam_opt = torch.optim.Adam(net.parameters(),
                                lr=1e-3,
                                betas=(0.9, 0.999),
                                eps=1e-8,
                                weight_decay=0,
                                amsgrad=False)

    def adjust_lr(epoch, step):
        lr = train_spec.get_learning_rate(epoch, step)
        for params_group in adam_opt.param_groups:
            params_group['lr'] = lr
        return lr

    # Now start train
    clock = sess.clock
    clock.epoch = 0
    clock.step = 0 * 1024
    sess.start()

    # restore checkpoint
    checkpoint = torch.load(train_spec.imagenet_path)
    sess.net.load_state_dict(checkpoint['state_state'], strict=False)

    if args.continue_path and os.path.exists(args.continue_path):
        sess.load_checkpoint(args.continue_path)
    for ite in range(sess.clock.step):
        adam_opt.step()

    # log_output = log_rate_limited(min_interval=1)(worklog.put_line)

    CASI_dataset = CASIADataset('train')
    dataloader = DataLoader(dataset=CASI_dataset,
                            batch_size=train_spec.minibatch_size,
                            shuffle=False,
                            num_workers=8)

    # for epoch in train_ds.epoch_generator():
    for epoch in range(train_spec.stop_epoch):
        # if clock.epoch > train_spec.stop_epoch:
        #     break
        time_epoch_start = tstart = time.time()
        step = 0
        sess.net.train()
        adjust_lr(epoch, clock.step)
        objs = utils.AvgrageMeter()
        top1 = utils.AvgrageMeter()
        data_iter = iter(dataloader)
        for step in range(train_spec.minibatch_per_epoch):

            # reuse one iterator; iter(dataloader) every step would repeat the first batch
            try:
                minibatch = next(data_iter)
            except StopIteration:
                data_iter = iter(dataloader)
                minibatch = next(data_iter)
            # scheduler.step()
            adam_opt.step()
            # input_data = minibatch['depth']
            # target = minibatch['label']
            input_data = minibatch[0]
            target = minibatch[1]
            input_data = input_data.type(torch.FloatTensor)
            # target = torch.from_numpy(target).type(torch.LongTensor)
            target = target.type(torch.LongTensor)
            input_data = input_data.cuda()
            target = target.cuda(non_blocking=True)
            tdata = time.time() - tstart

            optimizer.zero_grad()
            dense_pred = sess.net(input_data)
            pred = dense_pred.mean(dim=1)
            loss = criterion(dense_pred, target)
            loss.backward()
            optimizer.step()

            cur_time = time.time()
            ttrain = cur_time - tstart
            time_passed = cur_time - time_epoch_start

            # time_expected = time_passed / (clock.minibatch + 1) * train_ds.minibatch_per_epoch
            time_expected = time_passed / (clock.minibatch +
                                           1) * train_spec.minibatch_per_epoch
            eta = time_expected - time_passed

            prec1, = utils.accuracy(pred, target, topk=(1, ))

            n = input_data.size(0)
            objs.update(loss.item(), n)  # accumulated loss
            top1.update(prec1.item(), n)

            for param_group in optimizer.param_groups:
                cur_lr = param_group['lr']
            outputs = [
                # "e:{},{}/{}".format(clock.epoch, clock.minibatch, train_ds.minibatch_per_epoch),
                "e:{},{}/{}".format(clock.epoch, clock.minibatch,
                                    train_spec.minibatch_per_epoch),
                "{:.2g} mb/s".format(1. / ttrain),
            ] + [
                "lr:{:.6f}, loss:{:.3f}, top1_acc:{:.2f}%".format(
                    cur_lr, objs.avg, top1.avg)
            ] + [
                'passed:{:.2f}'.format(time_passed),
                'eta:{:.2f}'.format(eta),
            ]
            if tdata / ttrain > .05:
                outputs += ["dp/tot: {:.2g}".format(tdata / ttrain)]
            print(outputs)
            # log_output(' '.join(outputs))
            clock.tick()
            tstart = time.time()
            # sess.save_checkpoint('epoch_{}_{}'.format(clock.epoch, clock.step))

            # sess.save_checkpoint('epoch_{}'.format(clock.epoch))

        clock.tock()

        if clock.epoch % train_spec.dump_epoch_interval == 0:
            sess.save_checkpoint('epoch_{}'.format(clock.epoch))
        sess.save_checkpoint('latest')
Beispiel #28
0
    def __init__(self,
                 save_path,
                 seed,
                 batch_size,
                 grad_clip,
                 epochs,
                 resume_iter=None,
                 init_channels=16):
        args = {}
        args['data'] = '/data/mzhang3/randomNAS_own/data'
        args['epochs'] = epochs
        args['learning_rate'] = 0.025
        args['batch_size'] = batch_size
        args['learning_rate_min'] = 0.001
        args['momentum'] = 0.9
        args['weight_decay'] = 3e-4
        args['init_channels'] = init_channels
        args['layers'] = 8
        args['drop_path_prob'] = 0.3
        args['grad_clip'] = grad_clip
        args['train_portion'] = 0.5
        args['seed'] = seed
        args['log_interval'] = 50
        args['save'] = save_path
        args['gpu'] = 0
        args['cuda'] = True
        args['cutout'] = False
        args['cutout_length'] = 16
        args['report_freq'] = 50
        args = AttrDict(args)
        self.args = args
        self.seed = seed

        np.random.seed(args.seed)
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = False
        cudnn.enabled = True
        cudnn.deterministic = True
        torch.cuda.manual_seed_all(args.seed)

        train_transform, valid_transform = utils._data_transforms_cifar10(args)
        train_data = dset.CIFAR10(root=args.data,
                                  train=True,
                                  download=False,
                                  transform=train_transform)

        num_train = len(train_data)
        indices = list(range(num_train))
        split = int(np.floor(args.train_portion * num_train))

        self.train_queue = torch.utils.data.DataLoader(
            train_data,
            batch_size=args.batch_size,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(
                indices[:split]),
            pin_memory=True,
            num_workers=0,
            # worker_init_fn expects a callable; np.random.seed(args.seed)
            # would be evaluated immediately and pass None instead
            worker_init_fn=lambda worker_id: np.random.seed(args.seed + worker_id))

        self.valid_queue = torch.utils.data.DataLoader(
            train_data,
            batch_size=32,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(
                indices[split:num_train]),
            pin_memory=True,
            num_workers=0,
            worker_init_fn=lambda worker_id: np.random.seed(args.seed + worker_id))

        self.train_iter = iter(self.train_queue)
        self.valid_iter = iter(self.valid_queue)

        self.steps = 0
        self.epochs = 0
        self.total_loss = 0
        self.start_time = time.time()
        criterion = nn.CrossEntropyLoss()
        criterion = criterion.cuda()
        self.criterion = criterion

        model = Network(args.init_channels, 10, args.layers, self.criterion)

        model = model.cuda()
        self.model = model

        #   try:
        #    self.load()
        #      logging.info('loaded previously saved weights')
        #  except Exception as e:
        #      print(e)

        logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

        optimizer = torch.optim.SGD(self.model.parameters(),
                                    args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        self.optimizer = optimizer

        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, float(args.epochs), eta_min=args.learning_rate_min)

        if resume_iter is not None:
            self.steps = resume_iter
            self.epochs = int(resume_iter / len(self.train_queue))
            logging.info("Resuming from epoch %d" % self.epochs)
            self.objs = utils.AvgrageMeter()
            self.top1 = utils.AvgrageMeter()
            self.top5 = utils.AvgrageMeter()
            for i in range(self.epochs):
                self.scheduler.step()

        size = 0
        for p in model.parameters():
            size += p.nelement()
        logging.info('param size: {}'.format(size))

        total_params = sum(x.data.nelement() for x in model.parameters())
        logging.info('Args: {}'.format(args))
        logging.info('Model total parameters: {}'.format(total_params))
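
The args = AttrDict(args) line above assumes a small helper that exposes dictionary keys as attributes (args.epochs, args.batch_size, ...). A minimal sketch of such a class, under that assumption, is:

class AttrDict(dict):
    """Dictionary whose keys can also be read and written as attributes."""

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value
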
Beispiel #29
0
def train(train_queue, valid_queue, model, architect, criterion, optimizer,
          lr):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()

    for step, (input, target) in enumerate(train_queue):
        model.train()
        n = input.size(0)

        input = input.cuda()
        target = target.cuda(non_blocking=True)

        input_pert = adv_attacks[args.adv_train](model,
                                                 target,
                                                 input,
                                                 niters=args.niters_,
                                                 epsilon=args.eps_,
                                                 learning_rate=args.adv_rate_)

        # get a random minibatch from the search queue with replacement
        input_search, target_search = next(iter(valid_queue))
        input_search = input_search.cuda()
        target_search = target_search.cuda(non_blocking=True)
        input_search = ifgsm(model,
                             input_search,
                             target_search,
                             niters=args.niters,
                             epsilon=args.eps,
                             learning_rate=args.adv_rate)

        input_comb = torch.cat([input, input_pert]).cuda()
        target_comb = torch.cat([target, target]).cuda()
        architect.step(input_comb,
                       target_comb,
                       input_search,
                       target_search,
                       lr,
                       optimizer,
                       unrolled=args.unrolled)

        optimizer.zero_grad()
        logits = model(input)
        loss = criterion(logits, target)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)

        if step % args.report_freq == 0:
            logging.info('train %03d %e %f %f', step, objs.avg, top1.avg,
                         top5.avg)

        del input
        del target

    return top1.avg, objs.avg
def main():
	global_step = tf.train.get_or_create_global_step()

	images, labels = read_data(args.data)
	train_dataset = tf.data.Dataset.from_tensor_slices((images["train"],labels["train"]))
	train_dataset=train_dataset.map(_pre_process).shuffle(5000).batch(args.batch_size)
	train_iter=train_dataset.make_initializable_iterator()
	x_train,y_train=train_iter.get_next()

	test_dataset = tf.data.Dataset.from_tensor_slices((images["test"],labels["test"]))
	test_dataset=test_dataset.shuffle(5000).batch(args.batch_size)
	test_iter=test_dataset.make_initializable_iterator()
	x_test,y_test=test_iter.get_next()

	genotype = eval("genotypes.%s" % args.arch)
	train_logits,aux_logits=Model(x_train,y_train,True,args.init_channels,CLASS_NUM,args.layers,args.auxiliary,genotype)
	train_loss=tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_train, logits=train_logits))

	w_regularization_loss = tf.add_n(utils.get_var(tf.losses.get_regularization_losses(), 'lw')[1])
	train_loss+=1e4*args.weight_decay*w_regularization_loss
	# tf.summary.scalar('train_loss', train_loss)

	if args.auxiliary:
		loss_aux = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_train, logits=aux_logits))
		train_loss += args.auxiliary_weight*loss_aux

	lr=tf.train.cosine_decay(args.learning_rate,global_step,50000/args.batch_size*args.epochs)
	accuracy=tf.reduce_mean(tf.cast(tf.nn.in_top_k(train_logits, y_train, 1), tf.float32))	

	test_logits,_=Model(x_test,y_test,False,args.init_channels,CLASS_NUM,args.layers,args.auxiliary,genotype)
	test_accuracy=tf.reduce_mean(tf.cast(tf.nn.in_top_k(test_logits, y_test, 1), tf.float32))
	test_accuracy_top5=tf.reduce_mean(tf.cast(tf.nn.in_top_k(test_logits, y_test, 5), tf.float32))
	tf.summary.scalar('test_accuracy_top1', test_accuracy)


	with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
		opt=tf.train.MomentumOptimizer(lr,args.momentum)
		opt=opt.minimize(train_loss,global_step)

	merged = tf.summary.merge_all()


	config = tf.ConfigProto()
	os.environ["CUDA_VISIBLE_DEVICES"] =  str(args.gpu)
	config.gpu_options.allow_growth = True
	sess=tf.Session(config=config)

	writer = tf.summary.FileWriter(output_dir+TIMESTAMP,sess.graph)
	saver = tf.train.Saver(max_to_keep=1)
	sess.run(tf.global_variables_initializer())
	test_batch=0
	for e in range(args.epochs):
		objs = utils.AvgrageMeter()
		top1 = utils.AvgrageMeter()
		sess.run(train_iter.initializer)
		while True:
			try:
				_, loss, acc, current_lr, gs = sess.run([opt, train_loss, accuracy, lr, global_step])
				objs.update(loss, args.batch_size)
				top1.update(acc, args.batch_size)
				if gs % args.report_freq == 0:
					print("epochs {} steps {} current lr is {:.3f}  loss is {}  train_acc is {}".format(e, gs, current_lr, objs.avg, top1.avg))
			except tf.errors.OutOfRangeError:
				print('-'*80)
				print("end of an train epoch")
				break
		if e % 5 ==0:
			test_top1 = utils.AvgrageMeter()
			sess.run(test_iter.initializer)
			while True:
				try:
					test_batch+=1
					summary,test_acc=sess.run([merged,test_accuracy])
					test_top1.update(test_acc, args.batch_size)
					if test_batch % 100 == 0:
						writer.add_summary(summary, test_batch)
				except tf.errors.OutOfRangeError:
					print("******************* epochs {}   test_acc is {}".format(e,test_top1.avg))
					saver.save(sess, output_dir+"model",test_batch)
					print('-'*80)
					print("end of an test epoch")
					break
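
To evaluate a model trained with the TensorFlow script above at a later time, the checkpoints written by saver.save can be restored into the same graph. A short sketch, assuming the graph-construction code from main() has been run again and that output_dir ends with a path separator:

	# rebuild the same graph as in main(), then restore the latest checkpoint
	saver = tf.train.Saver()
	with tf.Session(config=config) as sess:
		ckpt = tf.train.latest_checkpoint(output_dir)
		saver.restore(sess, ckpt)
		sess.run(test_iter.initializer)
		print(sess.run(test_accuracy))
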