def eval_one_epoch(net,
                   batch_generator,
                   DEVICE=torch.device('cuda:0'),
                   AttackMethod=None):
    net.eval()
    pbar = tqdm(batch_generator)
    clean_accuracy = AvgMeter()
    adv_accuracy = AvgMeter()

    pbar.set_description('Evaluating')
    for (data, label) in pbar:
        data = data.to(DEVICE)
        label = label.to(DEVICE)

        with torch.no_grad():
            pred = net(data)
            acc = torch_accuracy(pred, label, (1, ))
            clean_accuracy.update(acc[0].item())

        if AttackMethod is not None:
            adv_inp = AttackMethod.attack(net, data, label)

            with torch.no_grad():
                pred = net(adv_inp)
                acc = torch_accuracy(pred, label, (1, ))
                adv_accuracy.update(acc[0].item())

        pbar_dic = OrderedDict()
        pbar_dic['CleanAcc'] = '{:.2f}'.format(clean_accuracy.mean)
        pbar_dic['AdvAcc'] = '{:.2f}'.format(adv_accuracy.mean)

        pbar.set_postfix(pbar_dic)

    adv_acc = adv_accuracy.mean if AttackMethod is not None else 0
    return clean_accuracy.mean, adv_acc
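
Every example on this page relies on an AvgMeter helper that is not shown. The snippet below is only a minimal sketch of the interface these examples assume (update() with an optional count, plus mean/avg attributes); the actual class in each repository may differ.

class AvgMeter:
    """Minimal sketch of the running-average meter assumed by these examples."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        # Accumulate `value`, optionally weighted by a batch size `n`.
        self.sum += value * n
        self.count += n

    @property
    def mean(self):
        return self.sum / self.count if self.count > 0 else 0.0

    avg = mean  # some examples read `.avg` instead of `.mean`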
Example #2
def my_eval_one_epoch(net,
                      batch_generator,
                      DEVICE=torch.device('cuda:0'),
                      AttackMethod=None):
    net.eval()
    pbar = tqdm(batch_generator)
    clean_accuracy = AvgMeter()
    adv_accuracy = AvgMeter()
    correct_indices = None
    natural_indices = None

    pbar.set_description('Evaluating')
    for (data, label) in pbar:
        data = data.to(DEVICE)
        label = label.to(DEVICE)

        with torch.no_grad():
            pred = net(data)
            predictions = np.argmax(pred.cpu().numpy(), axis=1)
            correct_labels = label.cpu().numpy()
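            # Note: np.where returns positions within the current batch, so these are
            # batch-relative indices, not global dataset indices.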
            if natural_indices is None:
                natural_indices = np.where(predictions == correct_labels)[0]
            else:
                natural_indices = np.append(
                    natural_indices,
                    np.where(predictions == correct_labels)[0])

            acc = torch_accuracy(pred, label, (1, ))
            clean_accuracy.update(acc[0].item())

        if AttackMethod is not None:
            adv_inp = AttackMethod.attack(net, data, label)

            with torch.no_grad():
                pred = net(adv_inp)
                predictions = np.argmax(pred.cpu().numpy(), axis=1)
                correct_labels = label.cpu().numpy()
                if correct_indices is None:
                    correct_indices = np.where(
                        predictions == correct_labels)[0]
                else:
                    correct_indices = np.append(
                        correct_indices,
                        np.where(predictions == correct_labels)[0])

                acc = my_torch_accuracy(pred, label, (1, ))
                adv_accuracy.update(acc[0].item())

        pbar_dic = OrderedDict()
        pbar_dic['CleanAcc'] = '{:.2f}'.format(clean_accuracy.mean)
        pbar_dic['AdvAcc'] = '{:.2f}'.format(adv_accuracy.mean)

        pbar.set_postfix(pbar_dic)

    adv_acc = adv_accuracy.mean if AttackMethod is not None else 0

    print('Natural Samples', natural_indices.shape)
    print('Adversarial Samples', correct_indices.shape)

    return clean_accuracy.mean, adv_acc
Example #3
    def train(self):
        for curr_epoch in range(self.start_epoch, self.end_epoch):
            train_loss_record = AvgMeter()
            self._train_per_epoch(curr_epoch, train_loss_record)

            # Adjust the learning rate once per epoch
            if not self.arg_dict["sche_usebatch"]:
                self.sche.step()

            # Save a checkpoint after every epoch; the saved parameters correspond to epoch curr_epoch + 1
            save_checkpoint(
                model=self.net,
                optimizer=self.opti,
                scheduler=self.sche,
                amp=self.amp,
                exp_name=self.exp_name,
                current_epoch=curr_epoch + 1,
                full_net_path=self.path_dict["final_full_net"],
                state_net_path=self.path_dict["final_state_net"],
            )  # save parameters

        if self.arg_dict["use_amp"]:
            # https://github.com/NVIDIA/apex/issues/567
            with self.amp.disable_casts():
                construct_print(
                    "When evaluating, we wish to evaluate in pure fp32.")
                self.test()
        else:
            self.test()
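
save_checkpoint here is a project-specific helper. Purely as a hedged sketch of what it plausibly does with the keyword arguments shown above (the real implementation may store more metadata):

import torch

def save_checkpoint_sketch(model, optimizer, scheduler, exp_name, current_epoch,
                           full_net_path, state_net_path, amp=None):
    # Hypothetical minimal version: one resumable checkpoint plus a weights-only file.
    state = {
        "exp_name": exp_name,
        "epoch": current_epoch,
        "net_state": model.state_dict(),
        "opti_state": optimizer.state_dict(),
        "sche_state": scheduler.state_dict(),
    }
    if amp is not None:
        state["amp_state"] = amp.state_dict()
    torch.save(state, full_net_path)                # full training state, for resuming
    torch.save(model.state_dict(), state_net_path)  # weights only, for inference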
Example #4
    def test(self, save_pre):
        if self.only_test:
            self.resume_checkpoint(load_path=self.pth_path, mode='onlynet')
        self.net.eval()

        loader = self.te_loader

        pres = [AvgMeter() for _ in range(256)]
        recs = [AvgMeter() for _ in range(256)]
        meanfs = AvgMeter()
        maes = AvgMeter()

        tqdm_iter = tqdm(enumerate(loader), total=len(loader), leave=False)
        for test_batch_id, test_data in tqdm_iter:
            tqdm_iter.set_description(
                f"{self.model_name}: te=>{test_batch_id + 1}")
            with torch.no_grad():
                in_imgs, in_names, in_mask_paths = test_data
                in_imgs = in_imgs.to(self.dev, non_blocking=True)
                outputs = self.net(in_imgs)

            outputs_np = outputs.cpu().detach()

            for item_id, out_item in enumerate(outputs_np):
                gimg_path = osp.join(in_mask_paths[item_id])
                gt_img = Image.open(gimg_path).convert("L")
                out_img = self.to_pil(out_item).resize(gt_img.size)

                if save_pre:
                    oimg_path = osp.join(self.save_path,
                                         in_names[item_id] + ".png")
                    out_img.save(oimg_path)

                gt_img = np.asarray(gt_img)
                out_img = np.array(out_img)
                ps, rs, mae, meanf = cal_pr_mae_meanf(out_img, gt_img)
                for pidx, pdata in enumerate(zip(ps, rs)):
                    p, r = pdata
                    pres[pidx].update(p)
                    recs[pidx].update(r)
                maes.update(mae)
                meanfs.update(meanf)
        maxf = cal_maxf([pre.avg for pre in pres], [rec.avg for rec in recs])
        results = {"MAXF": maxf, "MEANF": meanfs.avg, "MAE": maes.avg}
        return results
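
cal_pr_mae_meanf and cal_maxf are the salient-object-detection metric helpers of this repository and are not listed here. As a rough sketch of the last step only, the max-F score is commonly derived from the 256 averaged precision/recall pairs as below (assuming the usual beta^2 = 0.3):

import numpy as np

def cal_maxf_sketch(ps, rs, beta2=0.3):
    # ps, rs: per-threshold averaged precision and recall (length 256 above).
    ps = np.asarray(ps, dtype=np.float64)
    rs = np.asarray(rs, dtype=np.float64)
    fs = (1 + beta2) * ps * rs / np.maximum(beta2 * ps + rs, 1e-8)
    return fs.max()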
Example #5
def eval_one_epoch(net, batch_generator, DEVICE=torch.device('cuda:0')):
    net.eval()
    pbar = tqdm(batch_generator)
    clean_accuracy = AvgMeter()

    pbar.set_description('Evaluating')
    for (data, label) in pbar:
        data = data.to(DEVICE)
        label = label.to(DEVICE)

        with torch.no_grad():
            pred = net(data)
            acc = torch_accuracy(pred, label, (1, ))
            clean_accuracy.update(acc[0].item())

        pbar_dic = OrderedDict()
        pbar_dic['CleanAcc'] = '{:.2f}'.format(clean_accuracy.mean)

        pbar.set_postfix(pbar_dic)

    return clean_accuracy.mean
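
torch_accuracy is another shared utility these snippets assume. A plausible top-k implementation that matches the (1, ) and (1, 5) call sites on this page could look like the following sketch:

import torch

def torch_accuracy_sketch(output, target, topk=(1,)):
    # Returns one top-k accuracy (in percent, as a tensor) per requested k.
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    correct = pred.eq(target.view(-1, 1).expand_as(pred))
    accs = []
    for k in topk:
        correct_k = correct[:, :k].float().sum()
        accs.append(correct_k * (100.0 / batch_size))
    return accs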
Example #6
def train(args,
          model,
          device,
          train_loader,
          optimizer,
          epoch,
          descrip_str='Training'):
    model.train()
    pbar = tqdm(train_loader)
    pbar.set_description(descrip_str)

    CleanAccMeter = AvgMeter()
    TradesAccMeter = AvgMeter()
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()

        # calculate robust loss
        loss, cleanloss, klloss, cleanacc, tradesacc = trades_loss(
            model=model,
            x_natural=data,
            y=target,
            optimizer=optimizer,
            device=device,
            step_size=args.step_size,
            epsilon=args.epsilon,
            perturb_steps=args.num_steps,
            beta=args.beta,
        )
        loss.backward()
        optimizer.step()

        CleanAccMeter.update(cleanacc)
        TradesAccMeter.update(tradesacc)

        pbar_dic = OrderedDict()
        pbar_dic['cleanloss'] = '{:.3f}'.format(cleanloss)
        pbar_dic['klloss'] = '{:.3f}'.format(klloss)
        pbar_dic['CleanAcc'] = '{:.2f}'.format(CleanAccMeter.mean)
        pbar_dic['TradesAcc'] = '{:.2f}'.format(TradesAccMeter.mean)
        pbar.set_postfix(pbar_dic)
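
trades_loss above is a variant that also returns the clean and KL components plus both batch accuracies. The sketch below follows the standard TRADES formulation (Zhang et al., 2019); the extra return values and the unused device argument are assumptions made to mirror the call site, and the project's actual helper may differ.

import torch
import torch.nn.functional as F

def trades_loss_sketch(model, x_natural, y, optimizer, device,
                       step_size=0.003, epsilon=0.031, perturb_steps=10, beta=6.0):
    # Craft the adversarial counterpart of x_natural by maximizing the KL term.
    model.eval()
    x_adv = x_natural.detach() + 0.001 * torch.randn_like(x_natural)
    for _ in range(perturb_steps):
        x_adv.requires_grad_()
        with torch.enable_grad():
            kl = F.kl_div(F.log_softmax(model(x_adv), dim=1),
                          F.softmax(model(x_natural), dim=1), reduction='batchmean')
        grad = torch.autograd.grad(kl, [x_adv])[0]
        x_adv = x_adv.detach() + step_size * torch.sign(grad.detach())
        x_adv = torch.min(torch.max(x_adv, x_natural - epsilon), x_natural + epsilon)
        x_adv = torch.clamp(x_adv, 0.0, 1.0)
    model.train()
    optimizer.zero_grad()
    # Natural cross-entropy plus the beta-weighted robust KL term.
    logits_nat = model(x_natural)
    logits_adv = model(x_adv)
    clean_loss = F.cross_entropy(logits_nat, y)
    kl_loss = F.kl_div(F.log_softmax(logits_adv, dim=1),
                       F.softmax(logits_nat, dim=1), reduction='batchmean')
    loss = clean_loss + beta * kl_loss
    clean_acc = 100.0 * (logits_nat.argmax(dim=1) == y).float().mean().item()
    trades_acc = 100.0 * (logits_adv.argmax(dim=1) == y).float().mean().item()
    return loss, clean_loss.item(), kl_loss.item(), clean_acc, trades_acc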
Example #7
def train_main(local_rank,
               cfg: BaseConfigByEpoch,
               net=None,
               train_dataloader=None,
               val_dataloader=None,
               show_variables=False,
               convbuilder=None,
               init_hdf5=None,
               no_l2_keywords='depth',
               gradient_mask=None,
               use_nesterov=False,
               tensorflow_style_init=False,
               load_weights_keyword=None,
               keyword_to_lr_mult=None,
               auto_continue=False,
               lasso_keyword_to_strength=None,
               save_hdf5_epochs=10000):

    if no_l2_keywords is None:
        no_l2_keywords = []
    if type(no_l2_keywords) is not list:
        no_l2_keywords = [no_l2_keywords]

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine(local_rank=local_rank) as engine:
        engine.setup_log(name='train',
                         log_dir=cfg.output_dir,
                         file_name='log.txt')

        # ----------------------------- build model ------------------------------
        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)
        if net is None:
            net_fn = get_model_fn(cfg.dataset_name, cfg.network_type)
            model = net_fn(cfg, convbuilder)
        else:
            model = net
        model = model.cuda()
        # ----------------------------- model done ------------------------------

        # ---------------------------- prepare data -------------------------
        if train_dataloader is None:
            train_data = create_dataset(cfg.dataset_name,
                                        cfg.dataset_subset,
                                        cfg.global_batch_size,
                                        distributed=engine.distributed)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_data = create_dataset(cfg.dataset_name,
                                      'val',
                                      global_batch_size=100,
                                      distributed=False)
        engine.echo('NOTE: Data prepared')
        engine.echo(
            'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
            .format(cfg.global_batch_size, torch.cuda.device_count(),
                    torch.cuda.memory_allocated()))
        # ----------------------------- data done --------------------------------

        # ------------------------ prepare optimizer, scheduler, criterion -------
        optimizer = get_optimizer(engine,
                                  cfg,
                                  model,
                                  no_l2_keywords=no_l2_keywords,
                                  use_nesterov=use_nesterov,
                                  keyword_to_lr_mult=keyword_to_lr_mult)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()
        # --------------------------------- done -------------------------------

        engine.register_state(scheduler=scheduler,
                              model=model,
                              optimizer=optimizer)

        if engine.distributed:
            torch.cuda.set_device(local_rank)
            engine.echo('Distributed training, device {}'.format(local_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[local_rank],
                broadcast_buffers=False,
            )
        else:
            assert torch.cuda.device_count() == 1
            engine.echo('Single GPU training')

        if tensorflow_style_init:
            init_as_tensorflow(model)
        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights)
        if init_hdf5:
            engine.load_hdf5(init_hdf5,
                             load_weights_keyword=load_weights_keyword)
        if auto_continue:
            assert cfg.init_weights is None
            engine.load_checkpoint(get_last_checkpoint(cfg.output_dir))
        if show_variables:
            engine.show_variables()

        # ------------ do training ---------------------------- #
        engine.log("\n\nStart training with pytorch version {}".format(
            torch.__version__))

        iteration = engine.state.iteration
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch
        last_epoch_done_iters = iteration % iters_per_epoch

        if done_epochs == 0 and last_epoch_done_iters == 0:
            engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

        recorded_train_time = 0
        recorded_train_examples = 0

        collected_train_loss_sum = 0
        collected_train_loss_count = 0

        if gradient_mask is not None:
            gradient_mask_tensor = {}
            for name, value in gradient_mask.items():
                gradient_mask_tensor[name] = torch.Tensor(value).cuda()
        else:
            gradient_mask_tensor = None

        for epoch in range(done_epochs, cfg.max_epochs):

            if engine.distributed and hasattr(train_data, 'train_sampler'):
                train_data.train_sampler.set_epoch(epoch)

            if epoch == done_epochs:
                pbar = tqdm(range(iters_per_epoch - last_epoch_done_iters))
            else:
                pbar = tqdm(range(iters_per_epoch))

            if epoch == 0 and local_rank == 0:
                val_during_train(epoch=epoch,
                                 iteration=iteration,
                                 tb_tags=tb_tags,
                                 engine=engine,
                                 model=model,
                                 val_data=val_data,
                                 criterion=criterion,
                                 descrip_str='Init',
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE,
                                 tb_writer=tb_writer)

            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            for _ in pbar:

                start_time = time.time()
                data, label = load_cuda_data(train_data,
                                             dataset_name=cfg.dataset_name)

                # load_cuda_data(train_dataloader, cfg.dataset_name)
                data_time = time.time() - start_time

                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)

                train_net_time_start = time.time()
                acc, acc5, loss = train_one_step(
                    model,
                    data,
                    label,
                    optimizer,
                    criterion,
                    if_accum_grad,
                    gradient_mask_tensor=gradient_mask_tensor,
                    lasso_keyword_to_strength=lasso_keyword_to_strength)
                train_net_time_end = time.time()

                if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                    recorded_train_examples += cfg.global_batch_size
                    recorded_train_time += train_net_time_end - train_net_time_start

                scheduler.step()

                for module in model.modules():
                    if hasattr(module, 'set_cur_iter'):
                        module.set_cur_iter(iteration)

                if iteration % cfg.tb_iter_period == 0 and engine.world_rank == 0:
                    for tag, value in zip(tb_tags,
                                          [acc.item(), acc5.item(), loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                if epoch >= cfg.max_epochs - COLLECT_TRAIN_LOSS_EPOCHS:
                    collected_train_loss_sum += loss.item()
                    collected_train_loss_count += 1

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                iteration += 1

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and
                                                    engine.world_rank == 0):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                if iteration >= max_iters:
                    break

            #   do something after an epoch?
            engine.update_iteration(iteration)
            engine.save_latest_ckpt(cfg.output_dir)

            if (epoch + 1) % save_hdf5_epochs == 0:
                engine.save_hdf5(
                    os.path.join(cfg.output_dir,
                                 'epoch-{}.hdf5'.format(epoch)))

            if local_rank == 0 and \
                    cfg.val_epoch_period > 0 and (epoch >= cfg.max_epochs - 10 or epoch % cfg.val_epoch_period == 0):
                val_during_train(epoch=epoch,
                                 iteration=iteration,
                                 tb_tags=tb_tags,
                                 engine=engine,
                                 model=model,
                                 val_data=val_data,
                                 criterion=criterion,
                                 descrip_str=discrip_str,
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE,
                                 tb_writer=tb_writer)

            if iteration >= max_iters:
                break

        #   do something after the training
        if recorded_train_time > 0:
            exp_per_sec = recorded_train_examples / recorded_train_time
        else:
            exp_per_sec = 0
        engine.log(
            'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}'
            .format(int(TRAIN_SPEED_START * max_iters),
                    int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size,
                    recorded_train_examples, recorded_train_time, exp_per_sec))
        if cfg.save_weights:
            engine.save_checkpoint(cfg.save_weights)
            print('NOTE: training finished, saved to {}'.format(
                cfg.save_weights))
        engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
        if collected_train_loss_count > 0:
            engine.log(
                'TRAIN LOSS collected over last {} epochs: {:.6f}'.format(
                    COLLECT_TRAIN_LOSS_EPOCHS,
                    collected_train_loss_sum / collected_train_loss_count))
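
train_one_step, load_cuda_data, val_during_train and the Engine utilities come from the surrounding repository and are not reproduced on this page. For orientation only, a sketch of the train_one_step variant called above (gradient accumulation plus an optional per-parameter gradient mask) might look like this; the lasso term is omitted and the accuracy computation is inlined:

import torch

def train_one_step_sketch(model, data, label, optimizer, criterion, if_accum_grad,
                          gradient_mask_tensor=None):
    pred = model(data)
    loss = criterion(pred, label)
    loss.backward()
    # Optionally zero out masked gradient entries (e.g. for frozen sub-structures).
    if gradient_mask_tensor is not None:
        for name, param in model.named_parameters():
            if name in gradient_mask_tensor and param.grad is not None:
                param.grad.mul_(gradient_mask_tensor[name])
    # Only step the optimizer when this iteration is not merely accumulating gradients.
    if not if_accum_grad:
        optimizer.step()
        optimizer.zero_grad()
    with torch.no_grad():
        _, top5 = pred.topk(5, dim=1)
        hits = top5.eq(label.view(-1, 1))
        acc = hits[:, :1].float().sum() * (100.0 / label.size(0))
        acc5 = hits.float().sum() * (100.0 / label.size(0))
    return acc, acc5, loss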
Example #8
    def train(self):
        for curr_epoch in range(self.start_epoch, self.end_epoch):
            train_loss_record = AvgMeter()
            for train_batch_id, train_data in enumerate(self.tr_loader):
                curr_iter = curr_epoch * len(self.tr_loader) + train_batch_id

                self.opti.zero_grad()
                train_inputs, train_masks, *train_other_data = train_data
                train_inputs = train_inputs.to(self.dev, non_blocking=True)
                train_masks = train_masks.to(self.dev, non_blocking=True)
                train_preds = self.net(train_inputs)

                train_loss, loss_item_list = self.total_loss(
                    train_preds, train_masks)
                train_loss.backward()
                self.opti.step()

                if self.args["sche_usebatch"]:
                    if self.args["lr_type"] == "poly":
                        self.sche.step(curr_iter + 1)
                    else:
                        raise NotImplementedError

                # Only use item() to fetch the value when accumulating it
                train_iter_loss = train_loss.item()
                train_batch_size = train_inputs.size(0)
                train_loss_record.update(train_iter_loss, train_batch_size)

                # Write to TensorBoard
                if (self.args["tb_update"] > 0
                        and (curr_iter + 1) % self.args["tb_update"] == 0):
                    self.tb.add_scalar("data/trloss_avg",
                                       train_loss_record.avg, curr_iter)
                    self.tb.add_scalar("data/trloss_iter", train_iter_loss,
                                       curr_iter)
                    self.tb.add_scalar("data/trlr",
                                       self.opti.param_groups[0]["lr"],
                                       curr_iter)
                    tr_tb_mask = make_grid(train_masks,
                                           nrow=train_batch_size,
                                           padding=5)
                    self.tb.add_image("trmasks", tr_tb_mask, curr_iter)
                    tr_tb_out_1 = make_grid(train_preds,
                                            nrow=train_batch_size,
                                            padding=5)
                    self.tb.add_image("trsodout", tr_tb_out_1, curr_iter)

                # Log the data for each iteration
                if (self.args["print_freq"] > 0
                        and (curr_iter + 1) % self.args["print_freq"] == 0):
                    log = (
                        f"[I:{curr_iter}/{self.iter_num}][E:{curr_epoch}:{self.end_epoch}]>"
                        f"[{self.model_name}]"
                        f"[Lr:{self.opti.param_groups[0]['lr']:.7f}]"
                        f"[Avg:{train_loss_record.avg:.5f}|Cur:{train_iter_loss:.5f}|"
                        f"{loss_item_list}]")
                    print(log)
                    make_log(self.path["tr_log"], log)

            # Adjust the learning rate once per epoch
            if not self.args["sche_usebatch"]:
                if self.args["lr_type"] == "poly":
                    self.sche.step(curr_epoch + 1)
                else:
                    raise NotImplementedError

            # Save a checkpoint after every epoch; the saved parameters correspond to epoch curr_epoch + 1
            self.save_checkpoint(
                curr_epoch + 1,
                full_net_path=self.path['final_full_net'],
                state_net_path=self.path['final_state_net'])  # save parameters

        total_results = {}
        for data_name, data_path in self.te_data_list.items():
            construct_print(f"Testing with testset: {data_name}")
            self.te_loader, self.te_length = create_loader(data_path=data_path,
                                                           mode='test',
                                                           get_length=True)
            self.save_path = os.path.join(self.path["save"], data_name)
            if not os.path.exists(self.save_path):
                construct_print(
                    f"{self.save_path} do not exist. Let's create it.")
                os.makedirs(self.save_path)
            results = self.test(save_pre=self.save_pre)
            msg = (
                f"Results on the testset({data_name}:'{data_path}'): {results}"
            )
            construct_print(msg)
            make_log(self.path["te_log"], msg)

            total_results[data_name.upper()] = results
        # save result into xlsx file.
        write_xlsx(self.model_name, total_results)
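
construct_print, make_log, create_loader and write_xlsx are small utilities of this project. make_log, for example, is presumably just an append-to-file logger; a short sketch under that assumption:

def make_log_sketch(path, msg):
    # Assumed behaviour: append one log line to the given text file.
    with open(path, "a", encoding="utf-8") as f:
        f.write(msg + "\n")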
Example #9
    def __call__(self, teacher_net: TriNet, student_net: TriNet):

        opt = Adam(student_net.parameters(), lr=self.lr(self._gen), weight_decay=1e-5)

        milestones = list(range(self.args.first_milestone, self.args.num_epochs,
                                self.args.step_milestone))

        scheduler = lr_scheduler.MultiStepLR(opt, milestones=milestones, gamma=self.args.gamma)

        for e in range(self.args.num_epochs):

            if e % self.args.eval_epoch_interval == 0 and e > 0:
                self.evaluate(student_net)

            avm = AvgMeter(['kl', 'triplet', 'class', 'similarity', 'loss'])

            student_net.student_mode()
            teacher_net.teacher_mode()

            for x, y, cams in self.train_loader:

                x, y = x.to(self.device), y.to(self.device)
                x_ = torch.stack([x[i, torch.randperm(x.shape[1])] for i in range(x.shape[0])])

                x_teacher, x_student = x, x_[:, :self.args.num_student_images]

                with torch.no_grad():
                    teacher_emb, teacher_logits = teacher_net(x_teacher, return_logits=True)

                opt.zero_grad()

                student_emb, student_logits = student_net(x_student, return_logits=True)

                kl_div_batch = self.distill_loss(teacher_logits, student_logits)
                similarity_loss_batch = self.similarity_loss(teacher_emb, student_emb)
                triplet_loss_batch = self.triplet_loss(student_emb, y)
                class_loss_batch = self.class_loss(student_logits, y)

                loss = (triplet_loss_batch + class_loss_batch) + \
                       self.args.lambda_coeff * (similarity_loss_batch) + \
                       self.args.kl_coeff * (kl_div_batch)

                avm.add([kl_div_batch.item(), triplet_loss_batch.item(),
                         class_loss_batch.item(), similarity_loss_batch.item(),
                         loss.item()])

                loss.backward()
                opt.step()

            scheduler.step()

            if self._epoch % self.args.print_epoch_interval == 0:
                stats = avm()
                str_ = f"Epoch: {self._epoch}"
                for (l, m) in stats:
                    str_ += f" - {l} {m:.2f}"
                    self.saver.dump_metric_tb(m, self._epoch, 'losses', f"avg_{l}")
                self.saver.dump_metric_tb(opt.defaults['lr'], self._epoch, 'lr', 'lr')
                print(str_)

            self._epoch += 1

        self._gen += 1

        return student_net
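
Note that this example builds AvgMeter with a list of metric names, fills it with add() and then calls it to obtain (name, mean) pairs, so it is a different meter from the scalar one sketched earlier. A sketch of that assumed interface:

class MultiAvgMeterSketch:
    # Hypothetical sketch of the list-based AvgMeter used in this example.
    def __init__(self, names):
        self.names = list(names)
        self.sums = [0.0] * len(self.names)
        self.count = 0

    def add(self, values):
        # `values` holds one float per metric name for the current batch.
        for i, v in enumerate(values):
            self.sums[i] += v
        self.count += 1

    def __call__(self):
        # Returns (name, running mean) pairs, matching `for (l, m) in stats` above.
        return [(n, s / max(self.count, 1)) for n, s in zip(self.names, self.sums)]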
Example #10
def ding_train(cfg: BaseConfigByEpoch,
               net=None,
               train_dataloader=None,
               val_dataloader=None,
               show_variables=False,
               convbuilder=None):

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine(cfg) as engine:

        is_main_process = (engine.world_rank == 0)  #TODO correct?

        logger = engine.setup_log(name='train',
                                  log_dir=cfg.output_dir,
                                  file_name='log.txt')

        # -- typical model components model, opt,  scheduler,  dataloder --#
        if net is None:
            net = get_model_fn(cfg.dataset_name, cfg.network_type)

        if convbuilder is None:
            convbuilder = ConvBuilder()

        model = net(cfg, convbuilder).cuda()

        if train_dataloader is None:
            train_dataloader = create_dataset(cfg.dataset_name,
                                              cfg.dataset_subset,
                                              cfg.global_batch_size)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_dataloader = create_dataset(cfg.dataset_name,
                                            'val',
                                            batch_size=100)  #TODO 100?

        print('NOTE: Data prepared')
        print(
            'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
            .format(cfg.global_batch_size, torch.cuda.device_count(),
                    torch.cuda.memory_allocated()))

        # device = torch.device(cfg.device)
        # model.to(device)
        # model.cuda()

        optimizer = get_optimizer(cfg, model)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()

        # model, optimizer = amp.initialize(model, optimizer, opt_level="O0")

        engine.register_state(scheduler=scheduler,
                              model=model,
                              optimizer=optimizer)

        if engine.distributed:
            print('Distributed training, engine.world_rank={}'.format(
                engine.world_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[engine.world_rank],
                broadcast_buffers=False,
            )
            # model = DistributedDataParallel(model, delay_allreduce=True)

        if engine.continue_state_object:
            engine.restore_checkpoint()
        else:
            if cfg.init_weights:
                engine.load_checkpoint(cfg.init_weights, is_restore=False)

        if show_variables:
            engine.show_variables()

        # ------------ do training ---------------------------- #
        logger.info("\n\nStart training with pytorch version {}".format(
            torch.__version__))

        iteration = engine.state.iteration
        # done_epochs = iteration // num_train_examples_per_epoch(cfg.dataset_name)
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch

        for epoch in range(done_epochs, cfg.max_epochs):

            pbar = tqdm(range(iters_per_epoch))
            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            if cfg.val_epoch_period > 0 and epoch % cfg.val_epoch_period == 0:
                model.eval()
                val_iters = 500 if cfg.dataset_name == 'imagenet' else 100  # use batch_size=100 for val on ImageNet and CIFAR
                eval_dict = run_eval(val_dataloader,
                                     val_iters,
                                     model,
                                     criterion,
                                     discrip_str,
                                     dataset_name=cfg.dataset_name)
                val_top1_value = eval_dict['top1'].item()
                val_top5_value = eval_dict['top5'].item()
                val_loss_value = eval_dict['loss'].item()
                for tag, value in zip(tb_tags,
                                      [val_top1_value, val_top5_value, val_loss_value]):
                    tb_writer.add_scalars(tag, {'Val': value}, iteration)
                engine.log(
                    'validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}'
                    .format(epoch, val_top1_value, val_top5_value,
                            val_loss_value))
                model.train()

            for _ in pbar:

                scheduler.step()

                start_time = time.time()
                data, label = load_cuda_data(train_dataloader,
                                             cfg.dataset_name)
                data_time = time.time() - start_time

                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)
                acc, acc5, loss = train_one_step(model, data, label, optimizer,
                                                 criterion, if_accum_grad)

                if iteration % cfg.tb_iter_period == 0 and is_main_process:
                    for tag, value in zip(tb_tags,
                                          [acc.item(), acc5.item(), loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed
                                                    and is_main_process):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                iteration += 1
                if iteration >= max_iters:
                    break

            #   do something after an epoch?
            if iteration >= max_iters:
                break
        #   do something after the training
        engine.save_checkpoint(cfg.save_weights)
        print('NOTE: training finished, saved to {}'.format(cfg.save_weights))
Example #11
def run_eval(ds_val, max_iters, net, criterion, discrip_str, dataset_name):
    pbar = tqdm(range(max_iters))
    top1 = AvgMeter()
    top5 = AvgMeter()
    losses = AvgMeter()
    pbar.set_description('Validation' + discrip_str)
    total_net_time = 0
    with torch.no_grad():
        for iter_idx, i in enumerate(pbar):
            start_time = time.time()
            data, label = load_cuda_data(ds_val, dataset_name=dataset_name)
            data_time = time.time() - start_time

            net_time_start = time.time()
            pred = net(data)
            net_time_end = time.time()

            if iter_idx >= SPEED_TEST_SAMPLE_IGNORE_RATIO * max_iters:
                total_net_time += net_time_end - net_time_start

            loss = criterion(pred, label)
            acc, acc5 = torch_accuracy(pred, label, (1, 5))

            top1.update(acc.item())
            top5.update(acc5.item())
            losses.update(loss.item())
            pbar_dic = OrderedDict()
            pbar_dic['data-time'] = '{:.2f}'.format(data_time)
            pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
            pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
            pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
            pbar.set_postfix(pbar_dic)

    metric_dic = {'top1': torch.tensor(top1.mean),
                  'top5': torch.tensor(top5.mean),
                  'loss': torch.tensor(losses.mean)}
    reduced_metric_dic = reduce_loss_dict(metric_dic)
    # reduced_metric_dic = my_reduce_dic(metric_dic)
    return reduced_metric_dic, total_net_time
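
reduce_loss_dict is assumed to average the metric tensors across distributed workers, in the style popularized by maskrcnn-benchmark. A sketch under that assumption (with a single process it simply returns the dict unchanged):

import torch
import torch.distributed as dist

def reduce_loss_dict_sketch(metric_dic):
    if not (dist.is_available() and dist.is_initialized()) or dist.get_world_size() == 1:
        return metric_dic
    with torch.no_grad():
        names = sorted(metric_dic.keys())
        values = torch.stack([metric_dic[k] for k in names], dim=0)
        dist.reduce(values, dst=0)
        if dist.get_rank() == 0:
            values /= dist.get_world_size()  # only rank 0 holds the averaged values
        return dict(zip(names, values))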
Example #12
def csgd_train_main(local_rank,
                    cfg: BaseConfigByEpoch,
                    target_deps,
                    succeeding_strategy,
                    pacesetter_dict,
                    centri_strength,
                    pruned_weights,
                    net=None,
                    train_dataloader=None,
                    val_dataloader=None,
                    show_variables=False,
                    convbuilder=None,
                    init_hdf5=None,
                    no_l2_keywords='depth',
                    use_nesterov=False,
                    load_weights_keyword=None,
                    keyword_to_lr_mult=None,
                    auto_continue=False,
                    save_hdf5_epochs=10000):

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    clusters_save_path = os.path.join(cfg.output_dir, 'clusters.npy')

    with Engine(local_rank=local_rank) as engine:
        engine.setup_log(name='train',
                         log_dir=cfg.output_dir,
                         file_name='log.txt')

        # ----------------------------- build model ------------------------------
        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)
        if net is None:
            net_fn = get_model_fn(cfg.dataset_name, cfg.network_type)
            model = net_fn(cfg, convbuilder)
        else:
            model = net
        model = model.cuda()
        # ----------------------------- model done ------------------------------

        # ---------------------------- prepare data -------------------------
        if train_dataloader is None:
            train_data = create_dataset(cfg.dataset_name,
                                        cfg.dataset_subset,
                                        cfg.global_batch_size,
                                        distributed=engine.distributed)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_data = create_dataset(cfg.dataset_name,
                                      'val',
                                      global_batch_size=100,
                                      distributed=False)
        engine.echo('NOTE: Data prepared')
        engine.echo(
            'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
            .format(cfg.global_batch_size, torch.cuda.device_count(),
                    torch.cuda.memory_allocated()))
        # ----------------------------- data done --------------------------------

        # ------------------------ prepare optimizer, scheduler, criterion -------
        if no_l2_keywords is None:
            no_l2_keywords = []
        if type(no_l2_keywords) is not list:
            no_l2_keywords = [no_l2_keywords]
        # For each target parameter, cancel its weight decay in the optimizer, because weight decay will later be encoded in the decay matrix
        conv_idx = 0
        for k, v in model.named_parameters():
            if v.dim() != 4:
                continue
            print('prune layer {} from {} to {} filters'.format(
                conv_idx, cfg.deps[conv_idx], target_deps[conv_idx]))
            if target_deps[conv_idx] < cfg.deps[conv_idx]:
                no_l2_keywords.append(k.replace(KERNEL_KEYWORD, 'conv'))
                no_l2_keywords.append(k.replace(KERNEL_KEYWORD, 'bn'))
            conv_idx += 1
        print('no l2: ', no_l2_keywords)
        optimizer = get_optimizer(engine,
                                  cfg,
                                  model,
                                  no_l2_keywords=no_l2_keywords,
                                  use_nesterov=use_nesterov,
                                  keyword_to_lr_mult=keyword_to_lr_mult)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()
        # --------------------------------- done -------------------------------

        engine.register_state(scheduler=scheduler,
                              model=model,
                              optimizer=optimizer)

        if engine.distributed:
            torch.cuda.set_device(local_rank)
            engine.echo('Distributed training, device {}'.format(local_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[local_rank],
                broadcast_buffers=False,
            )
        else:
            assert torch.cuda.device_count() == 1
            engine.echo('Single GPU training')

        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights)
        if init_hdf5:
            engine.load_hdf5(init_hdf5,
                             load_weights_keyword=load_weights_keyword)
        if auto_continue:
            assert cfg.init_weights is None
            engine.load_checkpoint(get_last_checkpoint(cfg.output_dir))
        if show_variables:
            engine.show_variables()

        #   ===================================== prepare the clusters and matrices for C-SGD ==========
        kernel_namedvalue_list = engine.get_all_conv_kernel_namedvalue_as_list()

        if os.path.exists(clusters_save_path):
            layer_idx_to_clusters = np.load(clusters_save_path,
                                            allow_pickle=True).item()
        else:
            if local_rank == 0:
                layer_idx_to_clusters = get_layer_idx_to_clusters(
                    kernel_namedvalue_list=kernel_namedvalue_list,
                    target_deps=target_deps,
                    pacesetter_dict=pacesetter_dict)
                if pacesetter_dict is not None:
                    for follower_idx, pacesetter_idx in pacesetter_dict.items():
                        if pacesetter_idx in layer_idx_to_clusters:
                            layer_idx_to_clusters[
                                follower_idx] = layer_idx_to_clusters[
                                    pacesetter_idx]

                np.save(clusters_save_path, layer_idx_to_clusters)
            else:
                while not os.path.exists(clusters_save_path):
                    time.sleep(10)
                    print('sleep, waiting for process 0 to calculate clusters')
                layer_idx_to_clusters = np.load(clusters_save_path,
                                                allow_pickle=True).item()

        param_name_to_merge_matrix = generate_merge_matrix_for_kernel(
            deps=cfg.deps,
            layer_idx_to_clusters=layer_idx_to_clusters,
            kernel_namedvalue_list=kernel_namedvalue_list)
        add_vecs_to_merge_mat_dicts(param_name_to_merge_matrix)
        param_name_to_decay_matrix = generate_decay_matrix_for_kernel_and_vecs(
            deps=cfg.deps,
            layer_idx_to_clusters=layer_idx_to_clusters,
            kernel_namedvalue_list=kernel_namedvalue_list,
            weight_decay=cfg.weight_decay,
            weight_decay_bias=cfg.weight_decay_bias,
            centri_strength=centri_strength)
        print(param_name_to_decay_matrix.keys())
        print(param_name_to_merge_matrix.keys())

        conv_idx = 0
        param_to_clusters = {}
        for k, v in model.named_parameters():
            if v.dim() != 4:
                continue
            if conv_idx in layer_idx_to_clusters:
                for clsts in layer_idx_to_clusters[conv_idx]:
                    if len(clsts) > 1:
                        param_to_clusters[v] = layer_idx_to_clusters[conv_idx]
                        break
            conv_idx += 1
        #   ============================================================================================

        # ------------ do training ---------------------------- #
        engine.log("\n\nStart training with pytorch version {}".format(
            torch.__version__))

        iteration = engine.state.iteration
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch
        last_epoch_done_iters = iteration % iters_per_epoch

        if done_epochs == 0 and last_epoch_done_iters == 0:
            engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

        recorded_train_time = 0
        recorded_train_examples = 0

        collected_train_loss_sum = 0
        collected_train_loss_count = 0

        for epoch in range(done_epochs, cfg.max_epochs):

            if engine.distributed and hasattr(train_data, 'train_sampler'):
                train_data.train_sampler.set_epoch(epoch)

            if epoch == done_epochs:
                pbar = tqdm(range(iters_per_epoch - last_epoch_done_iters))
            else:
                pbar = tqdm(range(iters_per_epoch))

            if epoch == 0 and local_rank == 0:
                val_during_train(epoch=epoch,
                                 iteration=iteration,
                                 tb_tags=tb_tags,
                                 engine=engine,
                                 model=model,
                                 val_data=val_data,
                                 criterion=criterion,
                                 descrip_str='Init',
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE,
                                 tb_writer=tb_writer)

            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            for _ in pbar:

                start_time = time.time()
                data, label = load_cuda_data(train_data,
                                             dataset_name=cfg.dataset_name)

                # load_cuda_data(train_dataloader, cfg.dataset_name)
                data_time = time.time() - start_time

                train_net_time_start = time.time()
                acc, acc5, loss = train_one_step(
                    model,
                    data,
                    label,
                    optimizer,
                    criterion,
                    param_name_to_merge_matrix=param_name_to_merge_matrix,
                    param_name_to_decay_matrix=param_name_to_decay_matrix)
                train_net_time_end = time.time()

                if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                    recorded_train_examples += cfg.global_batch_size
                    recorded_train_time += train_net_time_end - train_net_time_start

                scheduler.step()

                for module in model.modules():
                    if hasattr(module, 'set_cur_iter'):
                        module.set_cur_iter(iteration)

                if iteration % cfg.tb_iter_period == 0 and engine.world_rank == 0:
                    for tag, value in zip(tb_tags,
                                          [acc.item(), acc5.item(), loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)
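                    # deviation_sum measures how far the kernels within each cluster still
                    # deviate from their cluster mean, i.e. how tightly the centripetal
                    # (C-SGD) constraint has pulled them together.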
                    deviation_sum = 0
                    for param, clusters in param_to_clusters.items():
                        pvalue = param.detach().cpu().numpy()
                        for cl in clusters:
                            if len(cl) == 1:
                                continue
                            selected = pvalue[cl, :, :, :]
                            mean_kernel = np.mean(selected,
                                                  axis=0,
                                                  keepdims=True)
                            diff = selected - mean_kernel
                            deviation_sum += np.sum(diff**2)
                    tb_writer.add_scalars('deviation_sum',
                                          {'Train': deviation_sum}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                if epoch >= cfg.max_epochs - COLLECT_TRAIN_LOSS_EPOCHS:
                    collected_train_loss_sum += loss.item()
                    collected_train_loss_count += 1

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                iteration += 1

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and
                                                    engine.world_rank == 0):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                if iteration >= max_iters:
                    break

            #   do something after an epoch?
            engine.update_iteration(iteration)
            engine.save_latest_ckpt(cfg.output_dir)

            if (epoch + 1) % save_hdf5_epochs == 0:
                engine.save_hdf5(
                    os.path.join(cfg.output_dir,
                                 'epoch-{}.hdf5'.format(epoch)))

            if local_rank == 0 and \
                    cfg.val_epoch_period > 0 and (epoch >= cfg.max_epochs - 10 or epoch % cfg.val_epoch_period == 0):
                val_during_train(epoch=epoch,
                                 iteration=iteration,
                                 tb_tags=tb_tags,
                                 engine=engine,
                                 model=model,
                                 val_data=val_data,
                                 criterion=criterion,
                                 descrip_str=discrip_str,
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE,
                                 tb_writer=tb_writer)

            if iteration >= max_iters:
                break

        #   do something after the training
        if recorded_train_time > 0:
            exp_per_sec = recorded_train_examples / recorded_train_time
        else:
            exp_per_sec = 0
        engine.log(
            'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}'
            .format(int(TRAIN_SPEED_START * max_iters),
                    int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size,
                    recorded_train_examples, recorded_train_time, exp_per_sec))
        if cfg.save_weights:
            engine.save_checkpoint(cfg.save_weights)
            print('NOTE: training finished, saved to {}'.format(
                cfg.save_weights))
        engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
        if collected_train_loss_count > 0:
            engine.log(
                'TRAIN LOSS collected over last {} epochs: {:.6f}'.format(
                    COLLECT_TRAIN_LOSS_EPOCHS,
                    collected_train_loss_sum / collected_train_loss_count))

    if local_rank == 0:
        csgd_prune_and_save(engine=engine,
                            layer_idx_to_clusters=layer_idx_to_clusters,
                            save_file=pruned_weights,
                            succeeding_strategy=succeeding_strategy,
                            new_deps=target_deps)
Example #13
def aofp_train_main(local_rank,
                    target_layers,
                    succ_strategy,
                    warmup_iterations,
                    aofp_batches_per_half,
                    flops_func,
                    cfg: BaseConfigByEpoch,
                    net=None,
                    train_dataloader=None,
                    val_dataloader=None,
                    show_variables=False,
                    convbuilder=None,
                    init_hdf5=None,
                    no_l2_keywords='depth',
                    gradient_mask=None,
                    use_nesterov=False,
                    tensorflow_style_init=False,
                    keyword_to_lr_mult=None,
                    auto_continue=False,
                    lasso_keyword_to_strength=None,
                    save_hdf5_epochs=10000,
                    remain_flops_ratio=0):

    if no_l2_keywords is None:
        no_l2_keywords = []
    if type(no_l2_keywords) is not list:
        no_l2_keywords = [no_l2_keywords]

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine(local_rank=local_rank) as engine:
        engine.setup_log(name='train',
                         log_dir=cfg.output_dir,
                         file_name='log.txt')

        # ----------------------------- build model ------------------------------
        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)
        if net is None:
            net_fn = get_model_fn(cfg.dataset_name, cfg.network_type)
            model = net_fn(cfg, convbuilder)
        else:
            model = net
        model = model.cuda()
        # ----------------------------- model done ------------------------------

        # ---------------------------- prepare data -------------------------
        if train_dataloader is None:
            train_data = create_dataset(cfg.dataset_name,
                                        cfg.dataset_subset,
                                        cfg.global_batch_size,
                                        distributed=engine.distributed)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_data = create_dataset(cfg.dataset_name,
                                      'val',
                                      global_batch_size=100,
                                      distributed=False)
        engine.echo('NOTE: Data prepared')
        engine.echo(
            'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
            .format(cfg.global_batch_size, torch.cuda.device_count(),
                    torch.cuda.memory_allocated()))
        # ----------------------------- data done --------------------------------

        # ------------------------ prepare optimizer, scheduler, criterion -------
        optimizer = get_optimizer(engine,
                                  cfg,
                                  model,
                                  no_l2_keywords=no_l2_keywords,
                                  use_nesterov=use_nesterov,
                                  keyword_to_lr_mult=keyword_to_lr_mult)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()
        # --------------------------------- done -------------------------------

        engine.register_state(scheduler=scheduler,
                              model=model,
                              optimizer=optimizer)

        if engine.distributed:
            torch.cuda.set_device(local_rank)
            engine.echo('Distributed training, device {}'.format(local_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[local_rank],
                broadcast_buffers=False,
            )
        else:
            assert torch.cuda.device_count() == 1
            engine.echo('Single GPU training')

        if tensorflow_style_init:
            init_as_tensorflow(model)
        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights)
        if init_hdf5:
            engine.load_part('base_path.', init_hdf5)
        if auto_continue:
            assert cfg.init_weights is None
            engine.load_checkpoint(get_last_checkpoint(cfg.output_dir))
        if show_variables:
            engine.show_variables()

        # ------------ do training ---------------------------- #
        engine.log("\n\nStart training with pytorch version {}".format(
            torch.__version__))

        iteration = engine.state.iteration
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch
        last_epoch_done_iters = iteration % iters_per_epoch

        if done_epochs == 0 and last_epoch_done_iters == 0:
            engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

        recorded_train_time = 0
        recorded_train_examples = 0

        collected_train_loss_sum = 0
        collected_train_loss_count = 0

        if gradient_mask is not None:
            gradient_mask_tensor = {}
            for name, value in gradient_mask.items():
                gradient_mask_tensor[name] = torch.Tensor(value).cuda()
        else:
            gradient_mask_tensor = None

        #########################   aofp
        _init_interval = aofp_batches_per_half // len(target_layers)
        layer_to_start_iter = {
            i: (_init_interval * i + warmup_iterations)
            for i in target_layers
        }
        print(
            'the initial layer_to_start_iter = {}'.format(layer_to_start_iter))
        #   0.  get all the AOFPLayers
        layer_idx_to_module = {}
        for submodule in model.modules():
            if hasattr(submodule, 'score_mask') or hasattr(
                    submodule, 't_value'):
                layer_idx_to_module[submodule.conv_idx] = submodule
        print(layer_idx_to_module)
        ######################################

        for epoch in range(done_epochs, cfg.max_epochs):

            if engine.distributed and hasattr(train_data, 'train_sampler'):
                train_data.train_sampler.set_epoch(epoch)

            if epoch == done_epochs:
                pbar = tqdm(range(iters_per_epoch - last_epoch_done_iters))
            else:
                pbar = tqdm(range(iters_per_epoch))

            if epoch == 0 and local_rank == 0:
                val_during_train(epoch=epoch,
                                 iteration=iteration,
                                 tb_tags=tb_tags,
                                 engine=engine,
                                 model=model,
                                 val_data=val_data,
                                 criterion=criterion,
                                 descrip_str='Init',
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE,
                                 tb_writer=tb_writer)

            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            for _ in pbar:

                start_time = time.time()
                data, label = load_cuda_data(train_data,
                                             dataset_name=cfg.dataset_name)

                # load_cuda_data(train_dataloader, cfg.dataset_name)
                data_time = time.time() - start_time

                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)
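                # when True, this iteration only accumulates gradients and the
                # optimizer step is presumably skipped inside train_one_step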

                train_net_time_start = time.time()

                ############    aofp
                #   1.  see if it is time to start on every layer
                #   2.  forward and accumulate
                #   3.  if a half on some layer is finished, do something
                #   ----    fetch its accumulated t vectors, analyze the first 'granu' elements
                #   ----    if good enough, set the base mask, reset the search space
                #   ----    elif granu == 1, do nothing
                #   ----    else, granu /= 2, reset the search space
                for layer_idx, start_iter in layer_to_start_iter.items():
                    if start_iter == iteration:
                        layer_idx_to_module[layer_idx].start_aofp(iteration)
                acc, acc5, loss = train_one_step(
                    model,
                    data,
                    label,
                    optimizer,
                    criterion,
                    if_accum_grad,
                    gradient_mask_tensor=gradient_mask_tensor,
                    lasso_keyword_to_strength=lasso_keyword_to_strength)
                for layer_idx, aofp_layer in layer_idx_to_module.items():
                    #   accumulate
                    if layer_idx not in succ_strategy:
                        continue
                    follow_layer_idx = succ_strategy[layer_idx]
                    if follow_layer_idx not in layer_idx_to_module:
                        continue
                    t_value = layer_idx_to_module[follow_layer_idx].t_value
                    aofp_layer.accumulate_t_value(t_value)
                    if aofp_layer.finished_a_half(iteration):
                        aofp_layer.halve_or_stop(iteration)
                ###################################

                train_net_time_end = time.time()

                if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                    recorded_train_examples += cfg.global_batch_size
                    recorded_train_time += train_net_time_end - train_net_time_start

                scheduler.step()

                for module in model.modules():
                    if hasattr(module, 'set_cur_iter'):
                        module.set_cur_iter(iteration)

                if iteration % cfg.tb_iter_period == 0 and engine.world_rank == 0:
                    for tag, value in zip(
                            tb_tags,
                        [acc.item(), acc5.item(),
                         loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                if epoch >= cfg.max_epochs - COLLECT_TRAIN_LOSS_EPOCHS:
                    collected_train_loss_sum += loss.item()
                    collected_train_loss_count += 1

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                iteration += 1

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and
                                                    engine.world_rank == 0):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                if iteration >= max_iters:
                    break

            #   do something after an epoch?
            engine.update_iteration(iteration)
            engine.save_latest_ckpt(cfg.output_dir)

            if (epoch + 1) % save_hdf5_epochs == 0:
                engine.save_hdf5(
                    os.path.join(cfg.output_dir,
                                 'epoch-{}.hdf5'.format(epoch)))

            if local_rank == 0 and \
                    cfg.val_epoch_period > 0 and (epoch >= cfg.max_epochs - 10 or epoch % cfg.val_epoch_period == 0):
                val_during_train(epoch=epoch,
                                 iteration=iteration,
                                 tb_tags=tb_tags,
                                 engine=engine,
                                 model=model,
                                 val_data=val_data,
                                 criterion=criterion,
                                 descrip_str=discrip_str,
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE,
                                 tb_writer=tb_writer)

            cur_deps = np.array(cfg.deps)
            for submodule in model.modules():
                if hasattr(submodule, 'base_mask'):
                    cur_deps[submodule.conv_idx] = np.sum(
                        submodule.base_mask.cpu().numpy() == 1)
            origin_flops = flops_func(cfg.deps)
            cur_flops = flops_func(cur_deps)
            remain_ratio = cur_flops / origin_flops
            if local_rank == 0:
                print('##########################')
                print('origin deps ', cfg.deps)
                print('cur deps ', cur_deps)
                print('remain flops ratio = ', remain_ratio, 'the target is ',
                      remain_flops_ratio)
                print('##########################')
            if remain_ratio < remain_flops_ratio:
                break
            if iteration >= max_iters:
                break

        #   do something after the training
        if recorded_train_time > 0:
            exp_per_sec = recorded_train_examples / recorded_train_time
        else:
            exp_per_sec = 0
        engine.log(
            'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}'
            .format(int(TRAIN_SPEED_START * max_iters),
                    int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size,
                    recorded_train_examples, recorded_train_time, exp_per_sec))
        if cfg.save_weights:
            engine.save_checkpoint(cfg.save_weights)
            print('NOTE: training finished, saved to {}'.format(
                cfg.save_weights))
        engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
        if collected_train_loss_count > 0:
            engine.log(
                'TRAIN LOSS collected over last {} epochs: {:.6f}'.format(
                    COLLECT_TRAIN_LOSS_EPOCHS,
                    collected_train_loss_sum / collected_train_loss_count))

        final_deps = aofp_prune(model,
                                origin_deps=cfg.deps,
                                succ_strategy=succ_strategy,
                                save_path=os.path.join(cfg.output_dir,
                                                       'finish_pruned.hdf5'))
        origin_flops = flops_func(cfg.deps)
        cur_flops = flops_func(final_deps)
        engine.log(
            '##################################################################'
        )
        engine.log(cfg.network_type)
        engine.log('origin width: {} , flops {} '.format(
            cfg.deps, origin_flops))
        engine.log('final width: {}, flops {} '.format(final_deps, cur_flops))
        engine.log('flops reduction: {}'.format(1 - cur_flops / origin_flops))
        return final_deps
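# AvgMeter is used as a running-average helper throughout these examples. Below is a
# minimal hypothetical sketch of the interface relied on above (update / mean; some of
# the trainers further down read .avg instead); not necessarily the repo's actual code.
class AvgMeterSketch:
    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        # accumulate a batch-level value, optionally weighted by the batch size n
        self.sum += value * n
        self.count += n

    @property
    def mean(self):
        return self.sum / self.count if self.count > 0 else 0.0

    @property
    def avg(self):
        # alias used by some of the trainers below
        return self.mean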
Example No. 14
def ding_train(cfg:BaseConfigByEpoch, net=None, train_dataloader=None, val_dataloader=None, show_variables=False, convbuilder=None, beginning_msg=None,
               init_hdf5=None, no_l2_keywords=None, gradient_mask=None, use_nesterov=False):

    # LOCAL_RANK = 0
    #
    # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    # is_distributed = num_gpus > 1
    #
    # if is_distributed:
    #     torch.cuda.set_device(LOCAL_RANK)
    #     torch.distributed.init_process_group(
    #         backend="nccl", init_method="env://"
    #     )
    #     synchronize()
    #
    # torch.backends.cudnn.benchmark = True

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine() as engine:

        is_main_process = (engine.world_rank == 0) #TODO correct?

        logger = engine.setup_log(
            name='train', log_dir=cfg.output_dir, file_name='log.txt')

        # -- typical model components model, opt,  scheduler,  dataloder --#
        if net is None:
            net = get_model_fn(cfg.dataset_name, cfg.network_type)

        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)

        model = net(cfg, convbuilder).cuda()

        if train_dataloader is None:
            train_dataloader = create_dataset(cfg.dataset_name, cfg.dataset_subset, cfg.global_batch_size)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_dataloader = create_dataset(cfg.dataset_name, 'val', batch_size=100)    #TODO 100?

        print('NOTE: Data prepared')
        print('NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'.format(cfg.global_batch_size, torch.cuda.device_count(), torch.cuda.memory_allocated()))

        # device = torch.device(cfg.device)
        # model.to(device)
        # model.cuda()

        if no_l2_keywords is None:
            no_l2_keywords = []
        optimizer = get_optimizer(cfg, model, no_l2_keywords=no_l2_keywords, use_nesterov=use_nesterov)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()

        # model, optimizer = amp.initialize(model, optimizer, opt_level="O0")

        engine.register_state(
            scheduler=scheduler, model=model, optimizer=optimizer)

        if engine.distributed:
            print('Distributed training, engine.world_rank={}'.format(engine.world_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[engine.world_rank],
                broadcast_buffers=False, )
            # model = DistributedDataParallel(model, delay_allreduce=True)
        elif torch.cuda.device_count() > 1:
            print('Single machine multiple GPU training')
            model = torch.nn.parallel.DataParallel(model)

        # for k, v in model.named_parameters():
        #     if v.dim() in [2, 4]:
        #         torch.nn.init.xavier_normal_(v)
        #         print('init {} as xavier_normal'.format(k))
        #     if 'bias' in k and 'bn' not in k.lower():
        #         torch.nn.init.zeros_(v)
        #         print('init {} as zero'.format(k))

        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights, is_restore=True)

        if init_hdf5:
            engine.load_hdf5(init_hdf5)


        if show_variables:
            engine.show_variables()

        # ------------ do training ---------------------------- #
        if beginning_msg:
            engine.log(beginning_msg)
        logger.info("\n\nStart training with pytorch version {}".format(torch.__version__))

        iteration = engine.state.iteration
        # done_epochs = iteration // num_train_examples_per_epoch(cfg.dataset_name)
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch

        engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

        # summary(model=model, input_size=(224, 224) if cfg.dataset_name == 'imagenet' else (32, 32), batch_size=cfg.global_batch_size)

        recorded_train_time = 0
        recorded_train_examples = 0

        if gradient_mask is not None:
            gradient_mask_tensor = {}
            for name, value in gradient_mask.items():
                gradient_mask_tensor[name] = torch.Tensor(value).cuda()
        else:
            gradient_mask_tensor = None

        for epoch in range(done_epochs, cfg.max_epochs):

            pbar = tqdm(range(iters_per_epoch))
            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)


            if cfg.val_epoch_period > 0 and epoch % cfg.val_epoch_period == 0:
                model.eval()
                val_iters = 500 if cfg.dataset_name == 'imagenet' else 100  # use batch_size=100 for val on ImageNet and CIFAR
                eval_dict, _ = run_eval(val_dataloader, val_iters, model, criterion, discrip_str, dataset_name=cfg.dataset_name)
                val_top1_value = eval_dict['top1'].item()
                val_top5_value = eval_dict['top5'].item()
                val_loss_value = eval_dict['loss'].item()
                for tag, value in zip(tb_tags, [val_top1_value, val_top5_value, val_loss_value]):
                    tb_writer.add_scalars(tag, {'Val': value}, iteration)
                engine.log('validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}'.format(epoch, val_top1_value, val_top5_value, val_loss_value))
                model.train()

            for _ in pbar:

                start_time = time.time()
                data, label = load_cuda_data(train_dataloader, cfg.dataset_name)
                data_time = time.time() - start_time

                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)

                train_net_time_start = time.time()
                acc, acc5, loss = train_one_step(model, data, label, optimizer, criterion, if_accum_grad, gradient_mask_tensor=gradient_mask_tensor)
                train_net_time_end = time.time()

                if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                    recorded_train_examples += cfg.global_batch_size
                    recorded_train_time += train_net_time_end - train_net_time_start

                scheduler.step()

                if iteration % cfg.tb_iter_period == 0 and is_main_process:
                    for tag, value in zip(tb_tags, [acc.item(), acc5.item(), loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)


                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and is_main_process):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                iteration += 1
                if iteration >= max_iters:
                    break

            #   do something after an epoch?
            if iteration >= max_iters:
                break
        #   do something after the training
        if recorded_train_time > 0:
            exp_per_sec = recorded_train_examples / recorded_train_time
        else:
            exp_per_sec = 0
        engine.log(
            'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}'
            .format(int(TRAIN_SPEED_START * max_iters), int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size,
                    recorded_train_examples, recorded_train_time, exp_per_sec))
        if cfg.save_weights:
            engine.save_checkpoint(cfg.save_weights)
            print('NOTE: training finished, saved to {}'.format(cfg.save_weights))
        engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
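# train_one_step is called above with several optional arguments (gradient accumulation
# flag, gradient masks). This is a hypothetical sketch of what such a helper is assumed
# to do; the actual implementation lives elsewhere in this repo.
def train_one_step_sketch(model, data, label, optimizer, criterion,
                          if_accum_grad=False, gradient_mask_tensor=None):
    pred = model(data)
    loss = criterion(pred, label)
    loss.backward()
    if gradient_mask_tensor is not None:
        # zero out (or scale) masked gradient entries before the update
        for name, param in model.named_parameters():
            if name in gradient_mask_tensor and param.grad is not None:
                param.grad.mul_(gradient_mask_tensor[name])
    if not if_accum_grad:
        # only step when this iteration is not merely accumulating gradients
        optimizer.step()
        optimizer.zero_grad()
    acc, acc5 = torch_accuracy(pred, label, (1, 5))  # same helper as used in run_eval
    return acc, acc5, loss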
Example No. 15
    def _test_process(self, save_pre):
        loader = self.te_loader

        # pres = [AvgMeter() for _ in range(256)]
        # recs = [AvgMeter() for _ in range(256)]
        pres = list()
        recs = list()

        meanfs = AvgMeter()
        maes = AvgMeter()

        # Measures from Saliency toolbox
        measures = [
            'Wgt-F', 'E-measure', 'S-measure', 'Mod-Max-F', 'Mod-Adp-F',
            'Mod-Wgt-F'
        ]
        beta = np.sqrt(
            0.3)  # default beta parameter used in the adaptive F-measure
        gt_threshold = 0.5  # The threshold that is used to binarize ground truth maps.

        values = dict()  # initialize measure value dictionary
        pr = dict()  # initialize precision recall dictionary
        prm = dict()  # initialize precision recall dictionary for Mod-Max-F
        for idx in measures:
            values[idx] = list()
            if idx == 'Max-F':
                pr['Precision'] = list()
                pr['Recall'] = list()
            if idx == 'Mod-Max-F':
                prm['Precision'] = list()
                prm['Recall'] = list()

        tqdm_iter = tqdm(enumerate(loader), total=len(loader), leave=False)
        for test_batch_id, test_data in tqdm_iter:
            tqdm_iter.set_description(
                f"{self.exp_name}: te=>{test_batch_id + 1}")
            in_imgs, in_mask_paths, in_names = test_data

            generate_out_imgs = False
            if self.arg_dict["resume_mode"] == "measure":
                # Check if prediction masks have already been created
                for item_id, in_fname in enumerate(in_names):
                    oimg_path = os.path.join(self.save_path, in_fname + ".png")
                    if not os.path.exists(oimg_path):
                        # Out image doesn't exist yet
                        generate_out_imgs = True
                        break
            else:
                generate_out_imgs = True

            if generate_out_imgs:
                with torch.no_grad():
                    in_imgs = in_imgs.to(self.dev, non_blocking=True)
                    outputs = self.net(in_imgs)

                outputs_np = outputs.sigmoid().cpu().detach()

            for item_id, in_fname in enumerate(in_names):
                oimg_path = os.path.join(self.save_path, in_fname + ".png")
                gimg_path = os.path.join(in_mask_paths[item_id])
                gt_img = Image.open(gimg_path).convert("L")

                if self.arg_dict["resume_mode"] == "measure" and not generate_out_imgs:
                    out_img = Image.open(oimg_path).convert("L")
                else:
                    out_item = outputs_np[item_id]
                    out_img = self.to_pil(out_item).resize(
                        gt_img.size, resample=Image.NEAREST)

                if save_pre and generate_out_imgs:
                    out_img.save(oimg_path)

                gt_img = np.array(gt_img)
                out_img = np.array(out_img)

                # Gather the images again following the Saliency Toolbox's import conventions.
                # These images will be grayscale floats between 0 and 1.
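                # Normalize the prediction map to [0, 1]; if it is constant, fall back
                # to dividing by 255 to avoid a zero division.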
                sm = out_img.astype(np.float32)
                if sm.max() == sm.min():
                    sm = sm / 255
                else:
                    sm = (sm - sm.min()) / (sm.max() - sm.min())
                gt = np.zeros_like(gt_img, dtype=np.float32)
                gt[gt_img > 256 * gt_threshold] = 1

                ps, rs, mae, meanf = cal_pr_mae_meanf(out_img, gt_img)
                pres.append(ps)
                recs.append(rs)
                # for pidx, pdata in enumerate(zip(ps, rs)):
                #     p, r = pdata
                #     pres[pidx].update(p)
                #     recs[pidx].update(r)
                maes.update(mae)
                meanfs.update(meanf)

                # Compute other measures using the Saliency Toolbox
                if 'MAE2' in measures:
                    values['MAE2'].append(mean_square_error(gt, sm))
                if 'E-measure' in measures:
                    values['E-measure'].append(e_measure(gt, sm))
                if 'S-measure' in measures:
                    values['S-measure'].append(s_measure(gt, sm))
                if 'Adp-F' in measures:
                    values['Adp-F'].append(
                        adaptive_fmeasure(gt, sm, beta, allowBlackMask=False))
                if 'Mod-Adp-F' in measures:
                    values['Mod-Adp-F'].append(
                        adaptive_fmeasure(gt, sm, beta, allowBlackMask=True))
                if 'Wgt-F' in measures:
                    values['Wgt-F'].append(
                        weighted_fmeasure(gt, sm, allowBlackMask=False))
                if 'Mod-Wgt-F' in measures:
                    values['Mod-Wgt-F'].append(
                        weighted_fmeasure(gt, sm, allowBlackMask=True))
                if 'Max-F' in measures:
                    prec, recall = prec_recall(
                        gt, sm, 256,
                        allowBlackMask=False)  # 256 thresholds between 0 and 1

                    # Check if precision recall curve exists
                    if len(prec) != 0 and len(recall) != 0:
                        pr['Precision'].append(prec)
                        pr['Recall'].append(recall)
                if 'Mod-Max-F' in measures:
                    prec, recall = prec_recall(
                        gt, sm, 256,
                        allowBlackMask=True)  # 256 thresholds between 0 and 1

                    # Check if precision recall curve exists
                    if len(prec) != 0 and len(recall) != 0:
                        prm['Precision'].append(prec)
                        prm['Recall'].append(recall)

        # Compute total measures over all images
        if 'MAE2' in measures:
            values['MAE2'] = np.mean(values['MAE2'])

        if 'E-measure' in measures:
            values['E-measure'] = np.mean(values['E-measure'])

        if 'S-measure' in measures:
            values['S-measure'] = np.mean(values['S-measure'])

        if 'Adp-F' in measures:
            values['Adp-F'] = np.mean(values['Adp-F'])
        if 'Mod-Adp-F' in measures:
            values['Mod-Adp-F'] = np.mean(values['Mod-Adp-F'])

        if 'Wgt-F' in measures:
            values['Wgt-F'] = np.mean(values['Wgt-F'])
        if 'Mod-Wgt-F' in measures:
            values['Mod-Wgt-F'] = np.mean(values['Mod-Wgt-F'])

        if 'Max-F' in measures:
            if len(pr['Precision']) > 0:
                pr['Precision'] = np.mean(np.hstack(pr['Precision'][:]), 1)
                pr['Recall'] = np.mean(np.hstack(pr['Recall'][:]), 1)
                f_measures = (1 + beta**2) * pr['Precision'] * pr['Recall'] / (
                    beta**2 * pr['Precision'] + pr['Recall'])

                # Remove any NaN values to allow calculation
                f_measures[np.isnan(f_measures)] = 0
                values['Max-F'] = np.max(f_measures)
            else:
                # There were likely no images found in the directory, so pr['Precision']
                # is an empty set
                values['Max-F'] = 0
        if 'Mod-Max-F' in measures:
            if len(prm['Precision']) > 0:
                prm['Precision'] = np.mean(np.hstack(prm['Precision'][:]), 1)
                prm['Recall'] = np.mean(np.hstack(prm['Recall'][:]), 1)
                f_measures = (1 +
                              beta**2) * prm['Precision'] * prm['Recall'] / (
                                  beta**2 * prm['Precision'] + prm['Recall'])

                # Remove any NaN values to allow calculation
                f_measures[np.isnan(f_measures)] = 0
                values['Mod-Max-F'] = np.max(f_measures)
            else:
                # There were likely no images found in the directory, so prm['Precision']
                # is an empty set
                values['Mod-Max-F'] = 0

        # maxf = cal_maxf([pre.avg for pre in pres], [rec.avg for rec in recs])

        # Calculate MAXF using original algorithm pr, re curves
        pres = np.mean(np.hstack(pres[:]), 1)
        recs = np.mean(np.hstack(recs[:]), 1)
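        # F-beta score per threshold: F = (1 + beta^2) * P * R / (beta^2 * P + R)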
        f_measures = (1 + beta**2) * pres * recs / (beta**2 * pres + recs)
        # Remove any NaN values to allow calculation
        f_measures[np.isnan(f_measures)] = 0
        maxf = np.max(f_measures)

        results = {
            "MAXF": maxf,
            "MEANF": meanfs.avg,
            "MAE": maes.avg,
            **values
        }
        return results
Example No. 16
def csgd_train_and_prune(cfg: BaseConfigByEpoch,
                         target_deps,
                         centri_strength,
                         pacesetter_dict,
                         succeeding_strategy,
                         pruned_weights,
                         net=None,
                         train_dataloader=None,
                         val_dataloader=None,
                         show_variables=False,
                         convbuilder=None,
                         beginning_msg=None,
                         init_hdf5=None,
                         no_l2_keywords=None,
                         use_nesterov=False,
                         tensorflow_style_init=False):

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    clusters_save_path = os.path.join(cfg.output_dir, 'clusters.npy')

    with Engine() as engine:

        is_main_process = (engine.world_rank == 0)  #TODO correct?

        logger = engine.setup_log(name='train',
                                  log_dir=cfg.output_dir,
                                  file_name='log.txt')

        # -- typical model components model, opt,  scheduler,  dataloder --#
        if net is None:
            net = get_model_fn(cfg.dataset_name, cfg.network_type)

        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)

        model = net(cfg, convbuilder).cuda()

        if train_dataloader is None:
            train_dataloader = create_dataset(cfg.dataset_name,
                                              cfg.dataset_subset,
                                              cfg.global_batch_size)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_dataloader = create_dataset(cfg.dataset_name,
                                            'val',
                                            batch_size=100)  #TODO 100?

        print('NOTE: Data prepared')
        print(
            'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
            .format(cfg.global_batch_size, torch.cuda.device_count(),
                    torch.cuda.memory_allocated()))

        optimizer = get_optimizer(cfg, model, use_nesterov=use_nesterov)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()

        # model, optimizer = amp.initialize(model, optimizer, opt_level="O0")

        engine.register_state(scheduler=scheduler,
                              model=model,
                              optimizer=optimizer,
                              cfg=cfg)

        if engine.distributed:
            print('Distributed training, engine.world_rank={}'.format(
                engine.world_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[engine.world_rank],
                broadcast_buffers=False,
            )
            # model = DistributedDataParallel(model, delay_allreduce=True)
        elif torch.cuda.device_count() > 1:
            print('Single machine multiple GPU training')
            model = torch.nn.parallel.DataParallel(model)

        if tensorflow_style_init:
            for k, v in model.named_parameters():
                if v.dim() in [2, 4]:
                    torch.nn.init.xavier_uniform_(v)
                    print('init {} as xavier_uniform'.format(k))
                if 'bias' in k and 'bn' not in k.lower():
                    torch.nn.init.zeros_(v)
                    print('init {} as zero'.format(k))

        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights)

        if init_hdf5:
            engine.load_hdf5(init_hdf5)

        kernel_namedvalue_list = engine.get_all_conv_kernel_namedvalue_as_list(
        )
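        # kernel_namedvalue_list holds every conv kernel (name + value). Below, the
        # filters of each layer are clustered into target_deps[idx] groups, and a
        # follower layer copies its pacesetter's clustering so that coupled layers
        # are pruned consistently.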

        if os.path.exists(clusters_save_path):
            # allow_pickle is required to restore the saved dict with recent numpy
            layer_idx_to_clusters = np.load(clusters_save_path,
                                            allow_pickle=True).item()
        else:
            layer_idx_to_clusters = get_layer_idx_to_clusters(
                kernel_namedvalue_list=kernel_namedvalue_list,
                target_deps=target_deps,
                pacesetter_dict=pacesetter_dict)
            if pacesetter_dict is not None:
                for follower_idx, pacesetter_idx in pacesetter_dict.items():
                    if pacesetter_idx in layer_idx_to_clusters:
                        layer_idx_to_clusters[
                            follower_idx] = layer_idx_to_clusters[
                                pacesetter_idx]

            np.save(clusters_save_path, layer_idx_to_clusters)

        csgd_save_file = os.path.join(cfg.output_dir, 'finish.hdf5')

        if os.path.exists(csgd_save_file):
            engine.load_hdf5(csgd_save_file)
        else:
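            # The merge matrix is assumed to average gradients within each cluster, and
            # the decay matrix to apply weight decay plus a centripetal term that pulls
            # clustered kernels toward each other (see the sketch after this example).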
            param_name_to_merge_matrix = generate_merge_matrix_for_kernel(
                deps=cfg.deps,
                layer_idx_to_clusters=layer_idx_to_clusters,
                kernel_namedvalue_list=kernel_namedvalue_list)
            param_name_to_decay_matrix = generate_decay_matrix_for_kernel_and_vecs(
                deps=cfg.deps,
                layer_idx_to_clusters=layer_idx_to_clusters,
                kernel_namedvalue_list=kernel_namedvalue_list,
                weight_decay=cfg.weight_decay,
                centri_strength=centri_strength)
            # if pacesetter_dict is not None:
            #     for follower_idx, pacesetter_idx in pacesetter_dict.items():
            #         follower_kernel_name = kernel_namedvalue_list[follower_idx].name
            #         pacesetter_kernel_name = kernel_namedvalue_list[follower_idx].name
            #         if pacesetter_kernel_name in param_name_to_merge_matrix:
            #             param_name_to_merge_matrix[follower_kernel_name] = param_name_to_merge_matrix[
            #                 pacesetter_kernel_name]
            #             param_name_to_decay_matrix[follower_kernel_name] = param_name_to_decay_matrix[
            #                 pacesetter_kernel_name]

            add_vecs_to_mat_dicts(param_name_to_merge_matrix)

            if show_variables:
                engine.show_variables()

            if beginning_msg:
                engine.log(beginning_msg)

            logger.info("\n\nStart training with pytorch version {}".format(
                torch.__version__))

            iteration = engine.state.iteration
            # done_epochs = iteration // num_train_examples_per_epoch(cfg.dataset_name)
            iters_per_epoch = num_iters_per_epoch(cfg)
            max_iters = iters_per_epoch * cfg.max_epochs
            tb_writer = SummaryWriter(cfg.tb_dir)
            tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

            model.train()

            done_epochs = iteration // iters_per_epoch

            engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

            recorded_train_time = 0
            recorded_train_examples = 0

            for epoch in range(done_epochs, cfg.max_epochs):

                pbar = tqdm(range(iters_per_epoch))
                top1 = AvgMeter()
                top5 = AvgMeter()
                losses = AvgMeter()
                discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
                pbar.set_description('Train' + discrip_str)

                if cfg.val_epoch_period > 0 and epoch % cfg.val_epoch_period == 0:
                    model.eval()
                    val_iters = 500 if cfg.dataset_name == 'imagenet' else 100  # use batch_size=100 for val on ImageNet and CIFAR
                    eval_dict, _ = run_eval(val_dataloader,
                                            val_iters,
                                            model,
                                            criterion,
                                            discrip_str,
                                            dataset_name=cfg.dataset_name)
                    val_top1_value = eval_dict['top1'].item()
                    val_top5_value = eval_dict['top5'].item()
                    val_loss_value = eval_dict['loss'].item()
                    for tag, value in zip(
                            tb_tags,
                        [val_top1_value, val_top5_value, val_loss_value]):
                        tb_writer.add_scalars(tag, {'Val': value}, iteration)
                    engine.log(
                        'validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}'
                        .format(epoch, val_top1_value, val_top5_value,
                                val_loss_value))
                    model.train()

                for _ in pbar:

                    start_time = time.time()
                    data, label = load_cuda_data(train_dataloader,
                                                 cfg.dataset_name)
                    data_time = time.time() - start_time

                    train_net_time_start = time.time()
                    acc, acc5, loss = train_one_step(
                        model,
                        data,
                        label,
                        optimizer,
                        criterion,
                        param_name_to_merge_matrix=param_name_to_merge_matrix,
                        param_name_to_decay_matrix=param_name_to_decay_matrix)
                    train_net_time_end = time.time()

                    if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                        recorded_train_examples += cfg.global_batch_size
                        recorded_train_time += train_net_time_end - train_net_time_start

                    scheduler.step()

                    if iteration % cfg.tb_iter_period == 0 and is_main_process:
                        for tag, value in zip(
                                tb_tags,
                            [acc.item(), acc5.item(),
                             loss.item()]):
                            tb_writer.add_scalars(tag, {'Train': value},
                                                  iteration)

                    top1.update(acc.item())
                    top5.update(acc5.item())
                    losses.update(loss.item())

                    pbar_dic = OrderedDict()
                    pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                    pbar_dic['cur_iter'] = iteration
                    pbar_dic['lr'] = scheduler.get_lr()[0]
                    pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                    pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                    pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                    pbar.set_postfix(pbar_dic)

                    if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                        engine.update_iteration(iteration)
                        if (not engine.distributed) or (engine.distributed
                                                        and is_main_process):
                            engine.save_and_link_checkpoint(cfg.output_dir)

                    iteration += 1
                    if iteration >= max_iters:
                        break

                #   do something after an epoch?
                if iteration >= max_iters:
                    break
            #   do something after the training
            if recorded_train_time > 0:
                exp_per_sec = recorded_train_examples / recorded_train_time
            else:
                exp_per_sec = 0
            engine.log(
                'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}'
                .format(int(TRAIN_SPEED_START * max_iters),
                        int(TRAIN_SPEED_END * max_iters),
                        cfg.global_batch_size, recorded_train_examples,
                        recorded_train_time, exp_per_sec))
            if cfg.save_weights:
                engine.save_checkpoint(cfg.save_weights)
                print('NOTE: training finished, saved to {}'.format(
                    cfg.save_weights))
            engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))

        csgd_prune_and_save(engine=engine,
                            layer_idx_to_clusters=layer_idx_to_clusters,
                            save_file=pruned_weights,
                            succeeding_strategy=succeeding_strategy,
                            new_deps=target_deps)
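# A hypothetical sketch of how the merge / decay matrices built above are assumed to
# act on each conv kernel during training (the real logic sits inside train_one_step):
# gradients of filters in the same cluster are averaged, and a centripetal decay term
# pulls clustered filters toward each other so they converge to identical values and
# the redundant ones can be removed by csgd_prune_and_save.
def apply_csgd_gradient_sketch(kernel, grad, merge_matrix, decay_matrix):
    # kernel, grad: conv weight and its gradient, shape (num_filters, ...);
    # merge_matrix, decay_matrix: (num_filters, num_filters)
    num_filters = kernel.size(0)
    flat_kernel = kernel.reshape(num_filters, -1)
    flat_grad = grad.reshape(num_filters, -1)
    csgd_grad = merge_matrix.matmul(flat_grad) + decay_matrix.matmul(flat_kernel)
    return csgd_grad.reshape_as(grad)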
Example No. 17
def train_epoch_prefetch_generator(
    curr_epoch,
    end_epoch,
    loss_funcs,
    model,
    optimizer,
    scheduler,
    tr_loader,
    local_rank,
):
    model.train()
    train_loss_record = AvgMeter()

    for train_batch_id, (train_inputs, train_masks, train_names) in enumerate(
            BackgroundGenerator(tr_loader, max_prefetch=2)):
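        # BackgroundGenerator (from prefetch_generator) pre-loads up to max_prefetch
        # batches in a background thread so data loading overlaps with GPU compute.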
        curr_iter = curr_epoch * len(tr_loader) + train_batch_id
        if user_config["sche_usebatch"]:
            scheduler.step(optimizer, curr_epoch=curr_iter)

        train_inputs = train_inputs.cuda(non_blocking=True)
        train_masks = train_masks.cuda(non_blocking=True)
        train_preds = model(train_inputs)

        train_loss, loss_item_list = get_total_loss(train_preds, train_masks,
                                                    loss_funcs)

        optimizer.zero_grad()
        if user_config["use_amp"]:
            with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            train_loss.backward()
        optimizer.step()

        if user_config["is_distributed"]:
            reduced_loss = allreduce_tensor(train_loss)
        else:
            reduced_loss = train_loss
        train_iter_loss = reduced_loss.item()
        train_loss_record.update(train_iter_loss, train_inputs.size(0))

        if local_rank == 0:
            lr_str = ",".join([
                f"{param_groups['lr']:.7f}"
                for param_groups in optimizer.param_groups
            ])
            log = (
                f"[I:{train_batch_id}/{len(tr_loader)}/{curr_iter}/{total_iter_num}][E:{curr_epoch}:{end_epoch}]>["
                f"{exp_name}]"
                f"[Lr:{lr_str}][Avg:{train_loss_record.avg:.5f}|Cur:{train_iter_loss:.5f}|"
                f"{loss_item_list}]\n"
                f"{train_names}")
            if user_config["print_freq"] > 0 and (
                    curr_iter + 1) % user_config["print_freq"] == 0:
                print(log)
            if (user_config["record_freq"] > 0
                    and (curr_iter + 1) % user_config["record_freq"] == 0):
                tb_recorder.record_curve("trloss_avg", train_loss_record.avg,
                                         curr_iter)
                tb_recorder.record_curve("trloss_iter", train_iter_loss,
                                         curr_iter)
                tb_recorder.record_curve("lr", optimizer.param_groups,
                                         curr_iter)
                tb_recorder.record_image("trmasks", train_masks, curr_iter)
                tb_recorder.record_image("trsodout", train_preds.sigmoid(),
                                         curr_iter)
                tb_recorder.record_image("trsodin", train_inputs, curr_iter)
                write_data_to_file(log, path_config["tr_log"])
Example No. 18
    def train(self):
        for curr_epoch in range(self.start_epoch, self.end_epoch):
            train_loss_record = AvgMeter()
            for train_batch_id, train_data in enumerate(self.tr_loader):
                curr_iter = curr_epoch * len(self.tr_loader) + train_batch_id

                self.opti.zero_grad()
                train_inputs, train_masks, *train_other_data = train_data
                train_inputs = train_inputs.to(self.dev, non_blocking=True)
                train_masks = train_masks.to(self.dev, non_blocking=True)
                train_preds = self.net(train_inputs)

                train_loss, loss_item_list = get_total_loss(
                    train_preds, train_masks, self.loss_funcs)
                train_loss.backward()
                self.opti.step()

                if self.args["sche_usebatch"]:
                    self.sche.step()

                # Only fetch the value with item() when accumulating statistics
                train_iter_loss = train_loss.item()
                train_batch_size = train_inputs.size(0)
                train_loss_record.update(train_iter_loss, train_batch_size)

                # Write to TensorBoard
                if self.args["tb_update"] > 0 and (
                        curr_iter + 1) % self.args["tb_update"] == 0:
                    self.tb.add_scalar("data/trloss_avg",
                                       train_loss_record.avg, curr_iter)
                    self.tb.add_scalar("data/trloss_iter", train_iter_loss,
                                       curr_iter)
                    for idx, param_groups in enumerate(self.opti.param_groups):
                        self.tb.add_scalar(f"data/lr_{idx}",
                                           param_groups["lr"], curr_iter)
                    tr_tb_mask = make_grid(train_masks,
                                           nrow=train_batch_size,
                                           padding=5)
                    self.tb.add_image("trmasks", tr_tb_mask, curr_iter)
                    tr_tb_out_1 = make_grid(train_preds,
                                            nrow=train_batch_size,
                                            padding=5)
                    self.tb.add_image("trsodout", tr_tb_out_1, curr_iter)

                # Log the data of every iteration
                if self.args["print_freq"] > 0 and (
                        curr_iter + 1) % self.args["print_freq"] == 0:
                    lr_str = ",".join([
                        f"{param_groups['lr']:.7f}"
                        for param_groups in self.opti.param_groups
                    ])
                    log = (
                        f"[I:{curr_iter}/{self.iter_num}][E:{curr_epoch}:{self.end_epoch}]>"
                        f"[{self.exp_name}]"
                        f"[Lr:{lr_str}]"
                        f"[Avg:{train_loss_record.avg:.5f}|Cur:{train_iter_loss:.5f}|"
                        f"{loss_item_list}]")
                    print(log)
                    make_log(self.path["tr_log"], log)

            # Adjust the learning rate on an epoch basis
            if not self.args["sche_usebatch"]:
                self.sche.step()

            # Save a checkpoint every epoch; the stored parameters correspond to epoch curr_epoch + 1
            save_checkpoint(
                model=self.net,
                optimizer=self.opti,
                scheduler=self.sche,
                exp_name=self.exp_name,
                current_epoch=curr_epoch + 1,
                full_net_path=self.path["final_full_net"],
                state_net_path=self.path["final_state_net"],
            )  # save the parameters

        total_results = self.test()
        # save result into xlsx file.
        write_xlsx(self.exp_name, total_results)
Example No. 19
def main():
    conf = Conf()
    args = parse(conf)
    device = conf.get_device()

    conf.suppress_random(set_determinism=args.set_determinism)
    saver = Saver(conf.log_path, args.exp_name)

    train_loader, query_loader, gallery_loader, queryimg_loader, galleryimg_loader = \
        get_dataloaders(args.dataset_name, conf.nas_path, device, args)

    num_pids = train_loader.dataset.get_num_pids()

    net = nn.DataParallel(get_model(args, num_pids))
    net = net.to(device)

    saver.write_logs(net.module, vars(args))

    opt = Adam(net.parameters(), lr=1e-4, weight_decay=args.wd)
    milestones = list(
        range(args.first_milestone, args.num_epochs, args.step_milestone))
    scheduler = lr_scheduler.MultiStepLR(opt,
                                         milestones=milestones,
                                         gamma=args.gamma)

    triplet_loss = OnlineTripletLoss('soft', True, reduction='mean').to(device)
    class_loss = nn.CrossEntropyLoss(reduction='mean').to(device)

    print("EXP_NAME: ", args.exp_name)

    for e in range(args.num_epochs):

        if e % args.eval_epoch_interval == 0 and e > 0:
            ev = Evaluator(net, query_loader, gallery_loader, queryimg_loader,
                           galleryimg_loader, DATA_CONFS[args.dataset_name],
                           device)
            ev.eval(saver, e, args.verbose)

        if e % args.save_epoch_interval == 0 and e > 0:
            saver.save_net(net.module, f'chk_{e // args.save_epoch_interval}')

        avm = AvgMeter(['triplet', 'class'])

        for it, (x, y, cams) in enumerate(train_loader):
            net.train()

            x, y = x.to(device), y.to(device)

            opt.zero_grad()
            embeddings, f_class = net(x, return_logits=True)

            triplet_loss_batch = triplet_loss(embeddings, y)
            class_loss_batch = class_loss(f_class, y)
            loss = triplet_loss_batch + class_loss_batch

            avm.add([triplet_loss_batch.item(), class_loss_batch.item()])

            loss.backward()
            opt.step()

        if e % args.print_epoch_interval == 0:
            stats = avm()
            str_ = f"Epoch: {e}"
            for (l, m) in stats:
                str_ += f" - {l} {m:.2f}"
                saver.dump_metric_tb(m, e, 'losses', f"avg_{l}")
            saver.dump_metric_tb(opt.param_groups[0]['lr'], e, 'lr', 'lr')
            print(str_)

        scheduler.step()

    ev = Evaluator(net, query_loader, gallery_loader, queryimg_loader,
                   galleryimg_loader, DATA_CONFS[args.dataset_name], device)
    ev.eval(saver, e, args.verbose)

    saver.save_net(net.module, 'chk_end')
    saver.writer.close()
Example No. 20
def run_eval(val_data, max_iters, net, criterion, discrip_str, dataset_name):
    pbar = tqdm(
        range(max_iters)
    )  # tqdm is a fast, extensible Python progress bar: wrapping any iterator with tqdm(iterator) adds a progress indicator to long loops
    top1 = AvgMeter()  # instantiate the running-average meters
    top5 = AvgMeter()
    losses = AvgMeter()
    pbar.set_description('Validation' + discrip_str)  # text shown on the left of the progress bar
    total_net_time = 0
    with torch.no_grad():
        for iter_idx, i in enumerate(pbar):
            start_time = time.time()

            data, label = load_cuda_data(val_data, dataset_name=dataset_name)
            data_time = time.time() - start_time

            net_time_start = time.time()
            pred = net(data)
            net_time_end = time.time()

            if iter_idx >= SPEED_TEST_SAMPLE_IGNORE_RATIO * max_iters:
                total_net_time += net_time_end - net_time_start

            loss = criterion(pred, label)
            acc, acc5 = torch_accuracy(pred, label, (1, 5))

            top1.update(acc.item())
            top5.update(acc5.item())
            losses.update(loss.item())
            pbar_dic = OrderedDict()
            pbar_dic['data-time'] = '{:.2f}'.format(data_time)
            pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
            pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
            pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
            pbar.set_postfix(pbar_dic)  # text shown on the right of the progress bar

    metric_dic = {
        'top1': torch.tensor(top1.mean),
        'top5': torch.tensor(top5.mean),
        'loss': torch.tensor(losses.mean)
    }
    # reduced_metirc_dic = reduce_loss_dict(metric_dic)
    reduced_metirc_dic = metric_dic  #TODO note this
    return reduced_metirc_dic, total_net_time  # {top1, top5, loss} and the total net forward time
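# torch_accuracy used above is assumed to return top-k accuracies (in percent) for the
# requested tuple of k values; a hypothetical sketch, not necessarily the repo's helper.
def torch_accuracy_sketch(output, target, topk=(1,)):
    maxk = max(topk)
    batch_size = target.size(0)
    # indices of the maxk highest-scoring classes per sample, transposed to (maxk, batch)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    accs = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        accs.append(correct_k.mul_(100.0 / batch_size))
    return accs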
Example No. 21
    def train(self):
        for curr_epoch in range(self.start_epoch, self.end_epoch):
            train_loss_record = AvgMeter()

            if self.args["lr_type"] == "poly":
                self.change_lr(curr_epoch)
            else:
                raise NotImplementedError

            for train_batch_id, train_data in enumerate(self.tr_loader):
                curr_iter = curr_epoch * len(self.tr_loader) + train_batch_id

                self.opti.zero_grad()
                train_inputs, train_masks, *train_other_data = train_data
                train_inputs = train_inputs.to(self.dev, non_blocking=True)
                train_masks = train_masks.to(self.dev, non_blocking=True)
                if self.data_mode == "RGBD":
                    # train_other_data is a list
                    train_depths = train_other_data[-1]
                    train_depths = train_depths.to(self.dev, non_blocking=True)
                    train_preds = self.net(train_inputs, train_depths)
                elif self.data_mode == "RGB":
                    train_preds = self.net(train_inputs)
                else:
                    raise NotImplementedError

                train_loss, loss_item_list = self.total_loss(
                    train_preds, train_masks)
                train_loss.backward()
                self.opti.step()

                # Only fetch the value with item() when accumulating statistics
                train_iter_loss = train_loss.item()
                train_batch_size = train_inputs.size(0)
                train_loss_record.update(train_iter_loss, train_batch_size)

                # Log the data of every iteration
                if self.args["print_freq"] > 0 and (
                        curr_iter + 1) % self.args["print_freq"] == 0:
                    log = (
                        f"[I:{curr_iter}/{self.iter_num}][E:{curr_epoch}:{self.end_epoch}]>"
                        f"[{self.model_name}]"
                        f"[Lr:{self.opti.param_groups[0]['lr']:.7f}]"
                        f"[Avg:{train_loss_record.avg:.5f}|Cur:{train_iter_loss:.5f}|"
                        f"{loss_item_list}]")
                    print(log)
                    make_log(self.path["tr_log"], log)

            # Save a checkpoint every epoch; the stored parameters correspond to epoch curr_epoch + 1
            self.save_checkpoint(
                curr_epoch + 1,
                full_net_path=self.path["final_full_net"],
                state_net_path=self.path["final_state_net"],
            )

        # Run the final test; print the validation results first
        print(f" ==>> Training finished <<== ")

        for data_name, data_path in self.te_data_list.items():
            print(f" ==>> Testing on dataset {data_name} <<== ")
            self.te_loader, self.te_length = create_loader(
                data_path=data_path,
                mode="test",
                get_length=True,
                data_mode=self.data_mode,
            )
            self.save_path = os.path.join(self.path["save"], data_name)
            if not os.path.exists(self.save_path):
                print(f" ==>> {self.save_path} does not exist, creating it <<==")
                os.makedirs(self.save_path)
            results = self.test(save_pre=self.save_pre)
            fixed_pre_results = {k: f"{v:.3f}" for k, v in results.items()}
            msg = f" ==>> Results on test set {data_name}: '{data_path}'\n >> {fixed_pre_results}"
            print(msg)
            make_log(self.path["te_log"], msg)
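# The "poly" policy dispatched to self.change_lr above is assumed to be the standard
# polynomial decay; a hypothetical sketch, not this repo's exact code.
def poly_lr_sketch(optimizer, base_lr, curr_epoch, end_epoch, power=0.9):
    lr = base_lr * (1 - curr_epoch / end_epoch) ** power
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr
    return lr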