Example #1
 def on_epoch_end(self, epoch, logs=None):
     logs = logs or {}
     self.epochs_since_last_save += 1
     if self.epochs_since_last_save >= self.period:
         self.epochs_since_last_save = 0
         if self.save_best_only:
             current = logs.get(self.monitor)
             if current is None:
                 logging.warning(
                     'Can save best model only with %s available, '
                     'skipping.', self.monitor)
             else:
                 if self.monitor_op(current, self.best):
                     if self.verbose > 0:
                         print(
                             'Epoch %05d: %s improved from %0.5f to %0.5f,'
                             ' saving model to %s' %
                             (epoch, self.monitor, self.best, current,
                              self.filepath))
                     self.best = current
                     save_model(self.model, self.optimizer, self.filepath)
                 else:
                     if self.verbose > 0:
                         print('Epoch %05d: %s did not improve' %
                               (epoch, self.monitor))
         else:
             if self.verbose > 0:
                 print('Epoch %05d: saving model to %s' %
                       (epoch, self.filepath))
             # save regardless of verbosity
             save_model(self.model, self.optimizer, self.filepath)
Example #2
    def train_loop(self, train_loader, valid_loader, logging, writer=None):
        best_error = float('inf')
        train_error_metric = train_obj = train_main_obj = train_ece = train_kl = None

        for epoch in range(self.args.epochs):
            if epoch >= 1 and self.scheduler is not None:
                self.scheduler.step()

            if self.scheduler is not None:
                lr = self.scheduler.get_last_lr()[0]
            else:
                lr = self.args.learning_rate

            if writer is not None:
                writer.add_scalar('Train/learning_rate', lr, epoch)
                writer.add_scalar('Train/gamma', self.gamma_scheduler[epoch],
                                  epoch)
            logging.info(
                '### Epoch: [%d/%d], Learning rate: %e, Gamma: %e ###',
                epoch, self.args.epochs, lr, self.gamma_scheduler[epoch])

            train_obj, train_main_obj, train_kl, train_error_metric, train_ece = self.train(
                epoch, train_loader, self.optimizer, logging, writer)

            logging.info(
                '#### Train | Error: %f, Train loss: %f, Train main objective: %f, Train KL: %f, Train ECE %f ####',
                train_error_metric, train_obj, train_main_obj, train_kl,
                train_ece)

            if writer is not None:
                self._scalar_logging(train_obj, train_main_obj, train_kl,
                                     train_error_metric, train_ece, "Train/",
                                     epoch, writer)

            # validation
            val_obj, val_main_obj, val_kl, val_error_metric, val_ece = self.infer(
                epoch, valid_loader, logging, writer, "Valid")
            logging.info(
                '#### Valid | Error: %f, Valid loss: %f, Valid main objective: %f, Valid KL: %f, Valid ECE %f ####',
                val_error_metric, val_obj, val_main_obj, val_kl, val_ece)

            if writer is not None:
                self._scalar_logging(val_obj, val_main_obj, val_kl,
                                     val_error_metric, val_ece, "Valid/",
                                     epoch, writer)

            if val_error_metric <= best_error or self.args.save_last:
                special_infor = ""
                # Avoid correlation between the samples
                if hasattr(
                        self.args, 'burnin_epochs'
                ) and epoch >= self.args.burnin_epochs and epoch % 2 == 0:
                    special_infor = "_" + str(epoch)
                utils.save_model(self.model, self.args, special_infor)
                best_error = min(best_error, val_error_metric)  # only track improvements
                logging.info(
                    '### Epoch: [%d/%d], Saving model! Current best error: %f ###',
                    epoch, self.args.epochs, best_error)

        return best_error, self.train_time, self.val_time
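The "Avoid correlation between the samples" comment above treats the post-burn-in checkpoints as thinned posterior samples. A minimal sketch of how such checkpoints might be combined at prediction time, assuming each file holds a plain state dict (the paths, file format, and helper name are assumptions, not the source's API):

import torch

def posterior_predictive(model, checkpoint_paths, x):
    # Average softmax outputs over thinned post-burn-in checkpoints
    probs = []
    for path in checkpoint_paths:
        model.load_state_dict(torch.load(path))  # assumed checkpoint format
        model.eval()
        with torch.no_grad():
            probs.append(torch.softmax(model(x), dim=-1))
    return torch.stack(probs).mean(dim=0)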
Example #3
 def fit(self, X, y):
     """
     Classify given data
     Input:
         X: (N, M) matrix of N training data samples
         y: (N) vector of N training data labels
     """
     self.__clf.fit(X, y)
     if self.__pickle is not None:
         utils.save_model(self.__clf, self.__pickle)
Example #4
def hmm_train_eval(train_data, test_data, word2id, tag2id, remove_0=False):
    train_word_lists, train_tag_lists = train_data
    test_word_lists, test_tag_lists = test_data
    model = HMM(len(tag2id), len(word2id))
    model.train(train_word_lists, train_tag_lists, word2id, tag2id)

    save_model(model, '../models/st_models/deepNER/hmm.pkl')

    pred_tag_lists = model.test(test_word_lists, word2id, tag2id)
    metrics = Metrics(test_tag_lists, pred_tag_lists)
    metrics.report_scores(dtype='HMM')
Example #5
def crf_train_eval(train_data, test_data, remove_0=False):
    print('Training and evaluating the CRF model')
    train_word_lists, train_tag_lists = train_data
    test_word_lists, test_tag_lists = test_data
    model = CRFModel()
    model.train(train_word_lists, train_tag_lists)
    save_model(model, '../models/st_models/deepNER/crf.pkl')

    pred_tag_lists = model.test(test_word_lists)
    metrics = Metrics(test_tag_lists, pred_tag_lists)
    metrics.report_scores(dtype='CRF')
Example #6
            # create network
            torch.manual_seed(seed)
            n_in, n_out = get_data_dimensions(dataset)
            net = Net(n_in,
                      n_hidden,
                      n_out,
                      n_layer,
                      act=act,
                      noise_type=noise_type,
                      noise_level=noise_level,
                      init_val=init_val).to(device, dtype)

            save_model(net,
                       experiment_name,
                       0,
                       noise_type,
                       noise_level,
                       model_dir=model_dir)
        else:
            print("starting from epoch {}".format(start_epoch))
            net = recreate_model(model_to_load, dataset=dataset, act=act)

        # optimiser parameters
        optimiser = get_optimiser(net.parameters(), op, learning_rate,
                                  momentum)

        # training criterion
        criterion = torch.nn.CrossEntropyLoss()

        # train network
        train(net,
Example #7
 def save_to_mlflow(self, is_remote=False):
     save_model(self, log_to_mlflow=True, is_remote=is_remote)
Example #8
def train_vcae(n_epochs,
               model,
               train_iterator,
               val_iterator,
               optimizer,
               device,
               criterion,
               save_best=True,
               verbose=True,
               is_nf=False,
               nf=None):
    model_name = 'NormalizingFlow' + model.__class__.__name__ if is_nf else model.__class__.__name__
    writer, experiment_name, best_model_path = setup_experiment(model_name,
                                                                log_dir="./tb")

    mb = master_bar(range(n_epochs))

    train_losses, val_losses = [], []
    best_val_loss = float('+inf')

    for epoch in mb:
        train_loss = run_epoch(model,
                               train_iterator,
                               optimizer,
                               criterion,
                               mb,
                               phase='train',
                               epoch=epoch,
                               writer=writer,
                               is_nf=is_nf,
                               nf=nf,
                               device=device)

        val_loss = run_epoch(model,
                             val_iterator,
                             None,
                             criterion,
                             mb,
                             phase='val',
                             epoch=epoch,
                             writer=writer,
                             is_nf=is_nf,
                             nf=nf,
                             device=device)

        # save logs
        dict_saver = {}
        dict_saver.update({'train_loss_mean': train_loss})
        dict_saver.update({'test_loss_mean': val_loss})
        file_to_save_path = ''.join(
            [LOG_PATH, FILE_NAME, experiment_name, FILE_EXCITON])
        save_to_file(file_to_save_path, dict_saver)

        # save the best model
        if save_best and (val_loss < best_val_loss):
            best_val_loss = val_loss
            save_model(nf if is_nf else model, best_model_path)

        if verbose:
            # append to a list for real-time plotting
            train_losses.append(train_loss)
            val_losses.append(val_loss)

            # start plotting for notebook
            mb.main_bar.comment = f'EPOCHS, best_loss:{best_val_loss}'
            mb.child.comment = f"train_loss:{round(train_loss, 3)}, val_loss:{round(val_loss, 3)}"
            plot_loss_update(epoch, n_epochs, mb, train_losses, val_losses)

    return best_model_path
Example #9
def train(args,
          model,
          train_loader,
          eval_loader,
          num_epochs,
          output,
          opt=None,
          s_epoch=0):
    device = args.device
    lr_default = args.lr
    lr_decay_step = 2
    lr_decay_rate = .25
    lr_decay_epochs = range(10, 20, lr_decay_step)
    gradual_warmup_steps = [
        0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default
    ]
    saving_epoch = 0
    grad_clip = args.clip_norm
    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \
        if opt is None else opt

    # Initial loss function
    criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')

    logger = utils.Logger(os.path.join(output, 'log.txt'))
    logger.write(args.__repr__())
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \
        (lr_default, lr_decay_step, lr_decay_rate, grad_clip))

    # Create trainer
    trainer = Trainer(args, model, criterion, optim)
    update_freq = int(args.update_freq)
    wall_time_start = time.time()
    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        num_updates = 0
        t = time.time()
        N = len(train_loader.dataset)
        num_batches = int(N / args.batch_size + 1)
        if epoch < len(gradual_warmup_steps):
            trainer.optimizer.param_groups[0]['lr'] = gradual_warmup_steps[
                epoch]
            logger.write('gradual warmup lr: %.8f' %
                         trainer.optimizer.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            trainer.optimizer.param_groups[0]['lr'] *= lr_decay_rate
            logger.write('decreased lr: %.8f' %
                         trainer.optimizer.param_groups[0]['lr'])
        else:
            logger.write('lr: %.8f' % trainer.optimizer.param_groups[0]['lr'])
        for i, (v, b, q, a, ans_mc, ans_gt) in enumerate(train_loader):
            v = v.to(device)
            b = b.to(device)
            q = q.to(device)
            a = a.to(device)
            ans_mc = ans_mc.to(device)

            # Clone each sample to 4 samples
            v = v.unsqueeze(1).expand(v.size(0), 4, v.size(1),
                                      v.size(2)).contiguous().view(
                                          v.size(0) * 4, v.size(1), v.size(2))
            q = q.unsqueeze(1).expand(q.size(0), 4,
                                      q.size(1)).contiguous().view(
                                          q.size(0) * 4, q.size(1))
            ans_mc = ans_mc.view(
                ans_mc.size(0) * ans_mc.size(1), ans_mc.size(2))
            a = a.view(ans_mc.size(0), 1)
            labels = torch.cat([a, 1 - a], 1)
            labels = labels.to(device)

            sample = [v, b, q, labels, ans_mc]
            if i < num_batches - 1 and (i + 1) % update_freq > 0:
                trainer.train_step(sample, update_params=False)
            else:
                loss, grad_norm, batch_score = trainer.train_step(
                    sample, update_params=True)
                total_norm += grad_norm
                count_norm += 1
                total_loss += loss.item()
                train_score += batch_score
                num_updates += 1
                if num_updates % int(args.print_interval / update_freq) == 0:
                    print(
                        "Iter: {}, Loss {:.4f}, Norm: {:.4f}, Total norm: {:.4f}, Num updates: {}, Wall time: {:.2f},"
                        " ETA: {}".format(i + 1,
                                          total_loss / ((num_updates + 1)),
                                          grad_norm, total_norm, num_updates,
                                          time.time() - wall_time_start,
                                          utils.time_since(t,
                                                           i / num_batches)))

        total_loss /= num_updates
        train_score = 100 * train_score / (num_updates * args.batch_size)
        if eval_loader is not None:
            print("Evaluating...")
            trainer.model.train(False)
            eval_score, bound = evaluate(model, eval_loader, args)
            trainer.model.train(True)

        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' %
                     (total_loss, total_norm / count_norm, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f (%.2f)' %
                         (100 * eval_score, 100 * bound))

        # Save per epoch
        if epoch >= saving_epoch:
            model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
            utils.save_model(model_path, model, epoch, trainer.optimizer)
            # Save best epoch
            if eval_loader is not None and eval_score > best_eval_score:
                model_path = os.path.join(output, 'model_epoch_best.pth')
                utils.save_model(model_path, model, epoch, trainer.optimizer)
                best_eval_score = eval_score
Example #10
def main(optin):
    if not os.path.exists('checkpoint/' + optin.exp):
        os.makedirs('checkpoint/' + optin.exp)

    model = PRN(optin.node_count, optin.coeff).cuda()
    #model = torch.nn.DataParallel(model).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=optin.lr)
    criterion = torch.nn.BCELoss().cuda()

    print(model)
    print(">>> total params: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))

    save_options(optin, os.path.join('checkpoint/' + optin.exp),
                 model.__str__(), criterion.__str__(), optimizer.__str__())

    print('---------Loading Coco Training Set--------')
    coco_train = COCO(
        os.path.join('data/annotations/person_keypoints_train2017.json'))
    trainloader = DataLoader(dataset=CocoDataset(coco_train, optin),
                             batch_size=optin.batch_size,
                             num_workers=optin.num_workers,
                             shuffle=True)

    bar = Bar('-->', fill='>', max=len(trainloader))

    cudnn.benchmark = True
    for epoch in range(optin.number_of_epoch):
        print('-------------Training Epoch {}-------------'.format(epoch))
        print('Total Step:', len(trainloader), '| Total Epoch:',
              optin.number_of_epoch)
        lr = adjust_lr(optimizer, epoch, optin.lr_gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))
        for idx, (input, label) in tqdm(enumerate(trainloader)):

            input = input.cuda().float()
            label = label.cuda().float()

            outputs = model(input)

            optimizer.zero_grad()
            loss = criterion(outputs, label)
            loss.backward()
            optimizer.step()

            if idx % 200 == 0:
                bar.suffix = 'Epoch: {epoch} Total: {ttl} | ETA: {eta:} | loss:{loss}' \
                .format(ttl=bar.elapsed_td, eta=bar.eta_td, loss=loss.data, epoch=epoch)
                bar.next()

        Evaluation(model, optin)

        save_model(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            checkpoint='checkpoint/' + optin.exp)

        model.train()
Example #11
    bleus_2_val.append(bleu_2_val)
    bleus_3_val.append(bleu_3_val)
    bleus_4_val.append(bleu_4_val)
    meteors_val.append(meteor_score_val)

    # save evaluation scores of the validation
    save_list_to_file(meteors_val, save_model_path, 'meteor_val_list.json')
    save_list_to_file(bleus_1_val, save_model_path, 'bleus1_val_list.json')
    save_list_to_file(bleus_2_val, save_model_path, 'bleus2_val_list.json')
    save_list_to_file(bleus_3_val, save_model_path, 'bleus3_val_list.json')
    save_list_to_file(bleus_4_val, save_model_path, 'bleus4_val_list.json')

    # save model if model achieves better evaluation scores on the validation set
    meteor_best = save_model(reference_value=meteor_best,
                             candidate_value=meteor_score_val,
                             model=model,
                             path=save_model_path,
                             model_name='model_val_meteor')
    meteor_train_best = save_model(reference_value=meteor_train_best,
                                   candidate_value=meteor_score_train,
                                   model=model,
                                   path=save_model_path,
                                   model_name='model_train_meteor')
    save_model(reference_value=b1_best,
               candidate_value=bleu_1_val,
               model=model,
               path=save_model_path,
               model_name='model_bleu_1')
    save_model(reference_value=b2_best,
               candidate_value=bleu_2_val,
               model=model,
Example #12
    def train(self, model, tr_loader, va_loader=None, adv_train=False):
        args = self.args
        logger = self.logger

        opt = torch.optim.SGD(model.parameters(),
                              args.learning_rate,
                              weight_decay=args.weight_decay,
                              momentum=args.momentum)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            opt, milestones=[40000, 60000], gamma=0.1)
        _iter = 0

        begin_time = time()

        for epoch in range(1, args.max_epoch + 1):
            for data, label in tr_loader:
                data, label = tensor2cuda(data), tensor2cuda(label)

                if adv_train:
                    # When training, the adversarial example is created from a random
                    # close point to the original data point. If in evaluation mode,
                    # just start from the original data point.
                    adv_data = self.attack.perturb(data, label, 'mean', True)
                    output = model(adv_data, _eval=False)
                else:
                    output = model(data, _eval=False)

                loss = F.cross_entropy(output, label)

                opt.zero_grad()
                loss.backward()
                opt.step()

                if _iter % args.n_eval_step == 0:
                    t1 = time()

                    if adv_train:
                        with torch.no_grad():
                            stand_output = model(data, _eval=True)
                        pred = torch.max(stand_output, dim=1)[1]

                        # print(pred)
                        std_acc = evaluate(pred.cpu().numpy(),
                                           label.cpu().numpy()) * 100

                        pred = torch.max(output, dim=1)[1]
                        # print(pred)
                        adv_acc = evaluate(pred.cpu().numpy(),
                                           label.cpu().numpy()) * 100

                    else:

                        adv_data = self.attack.perturb(data, label, 'mean',
                                                       False)

                        with torch.no_grad():
                            adv_output = model(adv_data, _eval=True)
                        pred = torch.max(adv_output, dim=1)[1]
                        # print(label)
                        # print(pred)
                        adv_acc = evaluate(pred.cpu().numpy(),
                                           label.cpu().numpy()) * 100

                        pred = torch.max(output, dim=1)[1]
                        # print(pred)
                        std_acc = evaluate(pred.cpu().numpy(),
                                           label.cpu().numpy()) * 100

                    t2 = time()

                    logger.info(
                        f'epoch: {epoch}, iter: {_iter}, lr={opt.param_groups[0]["lr"]}, '
                        f'spent {time()-begin_time:.2f} s, tr_loss: {loss.item():.3f}'
                    )

                    logger.info(
                        f'standard acc: {std_acc:.3f}%, robustness acc: {adv_acc:.3f}%'
                    )

                    # begin_time = time()

                    # if va_loader is not None:
                    #     va_acc, va_adv_acc = self.test(model, va_loader, True)
                    #     va_acc, va_adv_acc = va_acc * 100.0, va_adv_acc * 100.0

                    #     logger.info('\n' + '='*30 + ' evaluation ' + '='*30)
                    #     logger.info('test acc: %.3f %%, test adv acc: %.3f %%, spent: %.3f' % (
                    #         va_acc, va_adv_acc, time() - begin_time))
                    #     logger.info('='*28 + ' end of evaluation ' + '='*28 + '\n')

                    begin_time = time()

                if _iter % args.n_store_image_step == 0:
                    tv.utils.save_image(
                        torch.cat([data.cpu(), adv_data.cpu()], dim=0),
                        os.path.join(args.log_folder, f'images_{_iter}.jpg'),
                        nrow=16)

                if _iter % args.n_checkpoint_step == 0:
                    file_name = os.path.join(args.model_folder,
                                             f'checkpoint_{_iter}.pth')
                    save_model(model, file_name)

                _iter += 1
                # the scheduler steps once per training iteration
                scheduler.step()

            if va_loader is not None:
                t1 = time()
                va_acc, va_adv_acc = self.test(model, va_loader, True, False)
                va_acc, va_adv_acc = va_acc * 100.0, va_adv_acc * 100.0

                t2 = time()
                logger.info('\n'+'='*20 +f' evaluation at epoch: {epoch} iteration: {_iter} ' \
                    +'='*20)
                logger.info(
                    f'test acc: {va_acc:.3f}%, test adv acc: {va_adv_acc:.3f}%, spent: {t2-t1:.3f} s'
                )
                logger.info('=' * 28 + ' end of evaluation ' + '=' * 28 + '\n')
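The comment in the adversarial branch above notes that, during training, the adversarial example starts from a random point close to the original input. self.attack.perturb is not shown here; a minimal PGD-style sketch of that random-start idea, assuming inputs in [0, 1] and an L-infinity ball (every name and default below is illustrative, not the source's attack):

import torch
import torch.nn.functional as F

def perturb_with_random_start(model, data, label, eps=8 / 255, alpha=2 / 255, steps=10):
    # Start from a uniformly random point inside the eps-ball around data
    x = (data + torch.empty_like(data).uniform_(-eps, eps)).clamp(0.0, 1.0)
    for _ in range(steps):
        x = x.detach().requires_grad_(True)
        loss = F.cross_entropy(model(x), label)
        grad = torch.autograd.grad(loss, x)[0]
        # Ascend the loss, then project back into the eps-ball and valid range
        x = x.detach() + alpha * grad.sign()
        x = torch.min(torch.max(x, data - eps), data + eps).clamp(0.0, 1.0)
    return x.detach()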
Example #13
def main():

    arg = args()

    if not os.path.exists(arg.exp_name):
        os.makedirs(arg.exp_name)

    assert arg.exp_name.split(
        '/')[0] == 'o', "'o' is the directory of experiment, --exp_name o/..."
    output_dir = arg.exp_name

    if arg.local_rank == 0:
        save_scripts_in_exp_dir(output_dir)

    logger = logging_set(output_dir, arg.local_rank)
    logger.info(arg)
    logger.info(
        '\n================ experiment name:[{}] ===================\n'.format(
            arg.exp_name))
    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    np.random.seed(0)
    torch.manual_seed(0)

    config = edict(yaml.safe_load(open(arg.cfg, 'r')))

    if arg.search:
        assert arg.search in [
            'None', 'sync', 'random', 'second_order_gradient',
            'first_order_gradient'
        ]
        config.train.arch_search_strategy = arg.search

    if arg.batchsize:
        logger.info("update batchsize to {}".format(arg.batchsize))
        config.train.batchsize = arg.batchsize

    config.num_workers = arg.num_workers

    print(
        'GPU memory : \ntotal | used\n',
        os.popen(
            'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader'
        ).read())

    logger.info(
        '------------------------------ configuration ---------------------------'
    )
    logger.info(
        '\n==> available {} GPUs , use numbers are {} device is {}\n'.format(
            torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"],
            torch.cuda.current_device()))
    # torch.cuda._initialized = True
    logger.info(pprint.pformat(config))
    logger.info(
        '------------------------------- -------- ----------------------------'
    )

    best = 0

    criterion = MSELoss()

    Arch = bulid_up_network(config, criterion)

    if config.train.arch_search_strategy == 'random':

        logger.info("==>random seed is {}".format(config.train.random_seed))
        np.random.seed(config.train.random_seed)
        torch.manual_seed(config.train.random_seed)

        Arch.arch_parameters_random_search()

    if arg.param_flop:
        Arch._print_info()

    if len(arg.gpu) > 1:
        use_multi_gpu = True

        if arg.distributed:
            torch.distributed.init_process_group(backend="nccl")
            #torch.distributed.init_process_group(backend="nccl",init_method='env://')
            local_rank = torch.distributed.get_rank()
            torch.cuda.set_device(local_rank)
            device = torch.device("cuda", local_rank)
            Arch.to(device)

            Arch = torch.nn.parallel.DistributedDataParallel(
                Arch,
                device_ids=[local_rank],
                output_device=local_rank,
                find_unused_parameters=True)
            logger.info("local rank = {}".format(local_rank))
        else:
            Arch = torch.nn.DataParallel(Arch).cuda()
    else:
        use_multi_gpu = False
        Arch = Arch.cuda()

    Search = Search_Arch(Arch.module,
                         config) if use_multi_gpu else Search_Arch(
                             Arch, config)  # Arch.module for nn.DataParallel

    search_strategy = config.train.arch_search_strategy

    if not arg.distributed:
        train_queue, arch_queue, valid_queue = Dataloaders(
            search_strategy, config, arg)
    else:
        train_queue, \
        arch_queue, \
        valid_queue, \
        train_sampler_dist, = Dataloaders(search_strategy,config,arg)
    #Note: if the search strategy is `None` or `SYNC`, the arch_queue is None!

    logger.info(
        "\nNeural Architecture Search strategy is {}".format(search_strategy))
    assert search_strategy in [
        'first_order_gradient', 'random', 'None', 'second_order_gradient',
        'sync'
    ]

    if search_strategy == 'sync':
        # arch_parameters is also registered to model's parameters
        # so the weight-optimizer will also update the arch_parameters
        logger.info(
            "sync: The arch_parameters are also optimized by the weight-optimizer synchronously"
        )
        optimizer = torch.optim.Adam(
            Arch.parameters(),
            lr=config.train.w_lr_cosine_begin,
        )

    else:
        # if search strategy is None,random,second_order_gradient and so on
        # the arch_parameters will be filtered by the weight-optimizer
        optimizer = torch.optim.Adam(
            filter_arch_parameters(Arch),
            lr=config.train.w_lr_cosine_begin,
        )
    #scheduler = torch.optim.lr_scheduler.StepLR(optimizer,  step_size = config.train.lr_step_size,
    #                                                       gamma = config.train.lr_decay_gamma )
    if config.train.scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.train.LR_STEP, config.train.LR_FACTOR)
    elif config.train.scheduler_name == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=config.train.epoch_end,
            eta_min=config.train.w_lr_cosine_end)

    # best_result

    logger.info(
        "\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= training +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=="
    )
    begin, end = config.train.epoch_begin, config.train.epoch_end

    if arg.load_ckpt:
        if use_multi_gpu:
            begin, best = load_ckpt(Arch.module, optimizer, scheduler,
                                    output_dir, logger)
        else:
            begin, best = load_ckpt(Arch, optimizer, scheduler, output_dir,
                                    logger)

    for epoch in range(begin, end):

        lr = scheduler.get_lr()[0]
        logger.info(
            '==>time:({})--training...... current learning rate is {:.7f}'.
            format(datetime.datetime.now(), lr))

        if arg.distributed:
            train_sampler_dist.set_epoch(epoch)
            #valid_sampler_dist.set_epoch(epoch)

        train(
            epoch,
            train_queue,
            arch_queue,
            Arch,
            Search,
            criterion,
            optimizer,
            lr,
            search_strategy,
            output_dir,
            logger,
            config,
            arg,
        )
        scheduler.step()

        if not arg.distributed or (arg.distributed and arg.local_rank == 0):

            eval_results = evaluate(Arch, valid_queue, config, output_dir)

            if use_multi_gpu:
                best = save_model(epoch, best, eval_results, Arch.module,
                                  optimizer, scheduler, output_dir, logger)
            else:
                best = save_model(epoch, best, eval_results, Arch, optimizer,
                                  scheduler, output_dir, logger)
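filter_arch_parameters above is what keeps the weight optimizer away from the architecture parameters when the search strategy is not 'sync'. Its definition is not shown; a plausible sketch under the assumption that architecture parameters are identifiable by their registered names (the 'arch' substring test is hypothetical):

def filter_arch_parameters(model):
    # Yield only weight parameters, skipping architecture parameters by name
    for name, param in model.named_parameters():
        if 'arch' not in name:
            yield param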
Example #14
 def on_batch_k_examples(self, batch, logs):
     save_model(model=self.model,
                optimizer=self.optimizer,
                filename=os.path.join(self.save_path, f"model_it{self.it}"))
     self.it += 1
Example #15
    def train(self,
              train,
              dev=None,
              test=None,
              to_predict=None,
              max_iter=5,
              batch_size=128,
              test_batch_size=1000,
              pre_test_batch=25,
              predict_path=None):
        train_x, train_y = train
        self.set_train_data(train)
        train_index = align_batch_size(range(len(train_y)), batch_size)
        train_x_length = np.sum((train_x > 0), axis=1)
        num_batch = len(train_index) / batch_size
        batch_list = range(num_batch)
        log_loss_history, acc_history = list(), list()
        batch_log_loss_history, batch_acc_history = list(), list()
        logger.info("start training")
        batch_count = 0
        best_dev_acc = 0
        for i in xrange(max_iter):
            iter_loss_list = list()
            iter_acc_list = list()
            batch_list = np.random.permutation(batch_list)
            for j in batch_list:
                set_dropout_on(True)
                batch_count += 1
                indexs = train_index[j * batch_size:(j + 1) * batch_size]
                max_len = np.max(train_x_length[indexs])
                self.train_batch(indexs, max_len)
                if batch_count % pre_test_batch == 0:
                    set_dropout_on(False)
                    batch_log_loss, batch_acc = [batch_count], [batch_count]
                    if dev is not None:
                        dev_x, dev_y = dev
                        dev_acc, dev_log_loss = self.predict_data_log_loss_acc(
                            dev_x, dev_y, test_batch_size)
                        batch_log_loss.append(dev_log_loss)
                        batch_acc.append(dev_acc)
                        if dev_acc > best_dev_acc:
                            best_dev_acc = dev_acc
                            save_model("model/%s.best.model" % predict_path,
                                       self)
                        logger.info("batch %d,   dev log loss %s, acc %s" %
                                    (batch_count, dev_log_loss, dev_acc))
                    if test is not None:
                        test_x, test_y = test
                        test_acc, test_log_loss = self.predict_data_log_loss_acc(
                            test_x, test_y, test_batch_size)
                        batch_log_loss.append(test_log_loss)
                        batch_acc.append(test_acc)
                        logger.info("batch %d,  test log loss %s, acc %s" %
                                    (batch_count, test_log_loss, test_acc))
                    batch_log_loss_history.append(batch_log_loss)
                    batch_acc_history.append(batch_acc)
            set_dropout_on(False)
            train_acc, train_log_loss = self.predict_data_log_loss_acc(
                train_x, train_y, test_batch_size)
            iter_loss_list.append(train_log_loss)
            iter_acc_list.append(train_acc)
            iter_l2_loss, iter_l2_norm = self.get_l2_loss()
            logger.info("epoch %d, param l2 losss %s, l2 norm %s" %
                        (i, iter_l2_loss, iter_l2_norm))
            logger.info("epoch %d, train log loss %s, acc %s" %
                        (i, train_log_loss, train_acc))
            if dev is not None:
                dev_x, dev_y = dev
                dev_acc, dev_log_loss = self.predict_data_log_loss_acc(
                    dev_x, dev_y, test_batch_size)
                logger.info("epoch %d,   dev log loss %s, acc %s" %
                            (i, dev_log_loss, dev_acc))
                if dev_acc > best_dev_acc:
                    best_dev_acc = dev_acc
                    save_model("model/%s.best.model" % predict_path, self)
                iter_loss_list.append(dev_log_loss)
                iter_acc_list.append(dev_acc)
            if test is not None:
                test_x, test_y = test
                test_acc, test_log_loss = self.predict_data_log_loss_acc(
                    test_x, test_y, test_batch_size)
                logger.info("epoch %d,  test log loss %s, acc %s" %
                            (i, test_log_loss, test_acc))
                iter_loss_list.append(test_log_loss)
                iter_acc_list.append(test_acc)
            log_loss_history.append(iter_loss_list)
            acc_history.append(iter_acc_list)

        # Log Best Epoch
        log_loss_history = np.array(log_loss_history)
        acc_history = np.array(acc_history)

        # Log Best Batch
        batch_log_loss_history = np.array(batch_log_loss_history)
        batch_acc_history = np.array(batch_acc_history)
        self.log_to_file("Epoch", log_loss_history, acc_history)
        self.log_to_file("Batch", batch_log_loss_history, batch_acc_history)
        save_model("model/%s.final.model" % predict_path, self)
Example #16
train.display_message(
    '+++++++++++++Calculating batch stats for normalizing+++++++++++++')
imagenet_stats = {'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225]}
train_ds, valid_ds = train.get_datasets(path_dogs,
                                        human_train,
                                        human_valid,
                                        stats=imagenet_stats,
                                        size=args.img_size)
bs = args.batch_size
dls = train.get_dls(train_ds, valid_ds, bs=bs)
train.display_message(
    '+++++++++++++Getting Model ready for training+++++++++++++')
model = models.ModelTransfer()
device = train.get_device()
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
criterion = loss_func.CustomLoss(train_ds.dog_human_labeller)
recorder = metrics.Recorder()
n_epochs = args.n_epochs
modelutils.freeze(model.model)
train.run(n_epochs,
          model,
          optimizer,
          criterion,
          dls,
          device,
          recorder,
          max_lr=args.max_lr,
          env='shell')
utils.save_model(model, f'model_transfer_{n_epochs}_{bs}_{args.lr}',
                 train_ds.breed_labeller, train_ds.dog_human_labeller,
                 imagenet_stats)
Example #17
 def save_weights_fnc(epoch, logs):
     if epoch % save_freq == 0:
         logger.info("Saving model from epoch " + str(epoch))
         save_model(model, optimizer, os.path.join(save_path, "model_last_epoch"))
Example #18
def training_loop(model, loss_function, metrics, optimizer,
                  config, save_path, datasets, steps_per_epoch, seed,
                  custom_callbacks=[], checkpoint_monitor="val_categorical_accuracy:0",
                  use_tb=False, reload=False, evaluation_freq=1,
                  n_epochs=100, save_freq=1, save_history_every_k_examples=1,
                  load_weights_from="", load_weights_and_optimizer_from="",
                  weight_decay=0, load_classifier=True):
    if load_weights_and_optimizer_from != "":
        assert load_weights_from == ""
        assert load_weights_and_optimizer_from.endswith("h5")
        logger.info(f"load_weights_and_optimizer_from={load_weights_and_optimizer_from}")
        _, optimizer = restore_model_and_optimizer(model, optimizer, load_weights_and_optimizer_from)
        logger.info("Loaded optimizer")
        model.load_weights(load_weights_and_optimizer_from, by_name=True)
        model.optimizer = optimizer
    elif load_weights_from != "":
        assert load_weights_and_optimizer_from == ""
        model.load_weights(load_weights_from)

    if reload:
        assert load_classifier is True
        logger.warning("Only custom_callbacks can be reloaded for now")
        previous_model = model
        model, optimizer, H, epoch_start = _reload(model, optimizer, save_path, custom_callbacks)
        del previous_model
        logger.warning("Changed model reference internally in the training loop!")
    else:
        if hasattr(model, "compile"):
            model.compile(optimizer=optimizer,
                          loss=loss_function,
                          metrics=[m for m in metrics if not isinstance(m, str)])  # FIXME

        save_model(model, optimizer, os.path.join(save_path, "init_weights"))

        history_csv_path = os.path.join(save_path, "history.csv")
        history_pkl_path = os.path.join(save_path, "history.pkl")
        logger.info("Removing {} and {}".format(history_pkl_path, history_csv_path))
        for path in (history_pkl_path, history_csv_path):
            if os.path.exists(path):
                os.remove(path)  # portable replacement for os.system("rm ...")
        H, epoch_start = {}, 0


    callbacks = list(custom_callbacks)
    callbacks += _construct_default_callbacks(model, optimizer, H, save_path, checkpoint_monitor,
                                              save_freq, custom_callbacks, use_tb,
                                              save_history_every_k_examples)

    # Configure callbacks
    for clbk in callbacks:
        clbk.set_save_path(save_path)
        clbk.set_optimizer(optimizer)
        clbk.set_model(model)
        clbk.set_seed(seed)
        clbk.set_datasets(datasets)
        clbk.set_config(config)
        clbk.set_callbacks(callbacks)

    _training_loop(model, datasets, optimizer, loss_function, epoch_start, n_epochs, callbacks,
                   steps_per_epoch, train_on_batch=_train_on_batch_optimized,
                   evaluate_model=evaluate,
                   metrics=metrics, evaluation_freq=evaluation_freq,
                   weight_decay=weight_decay)

    if save_freq != -1:
        save_model(model, optimizer, os.path.join(save_path, "model_last_epoch"))
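training_loop configures every callback through a fixed set of setter hooks (set_save_path, set_optimizer, set_model, ...). A minimal sketch of a custom callback compatible with that protocol, assuming the loop later invokes an on_epoch_end-style hook and that save_model has the (model, optimizer, path) signature used above (the hook name and epoch argument are assumptions):

import os

class SaveEveryKEpochs:
    def __init__(self, k=5):
        self.k = k

    # Setters invoked by training_loop's configuration pass
    def set_save_path(self, save_path): self.save_path = save_path
    def set_optimizer(self, optimizer): self.optimizer = optimizer
    def set_model(self, model): self.model = model
    def set_seed(self, seed): self.seed = seed
    def set_datasets(self, datasets): self.datasets = datasets
    def set_config(self, config): self.config = config
    def set_callbacks(self, callbacks): self.callbacks = callbacks

    def on_epoch_end(self, epoch, logs=None):
        if epoch % self.k == 0:
            save_model(self.model, self.optimizer,
                       os.path.join(self.save_path, "model_epoch%d" % epoch))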
Example #19
    def train(self, data, pickle_path=None):
        """
        Train this featurizer on the training set using the following procedure
        1. Compute the keypoint descriptors for each image. keypoint descriptors
           are M-length vectors for each detected keypoint in an image. There
           can be an arbitrary number of keypoints per image
        2. Perform k-means clustering on the set of all descriptors gathered
           from all images. The descriptors will be divided into K groups (K is
           specified in the constructor). The distribution of an image's
           keypoints among these K groups (which was computed using each
           keypoint's descriptor) determines the feature vector of that image.
        3. For each image, go through the keypoints of that image and
           increment the element of a zero-initialized feature vector which
           corresponds to that keypoint's cluster label. In essence, each
           keypoint of an image votes on which of the K groups it thinks the
           image belongs to. These votes become the feature vector of that
           image.
        Input:
            data: (N,H,W[,C]) matrix of N training images
            pickle_path: location of pickle file to save/load model
        Output:
            features: (N, K) matrix of K-length feature vectors of N images
        """
        if pickle_path is not None:
            self.__kmeans = utils.load_model(pickle_path)
            if self.__kmeans is None:
                self.__logger.warning(
                    "No pickle file found at {}".format(pickle_path))
            else:
                self.__logger.info(
                    "Computing {} descriptors using saved model".format(
                        self.__name.upper()))
                return self.test(data)

        # Use multiple processes to calculate descriptors.
        # Leave one core free to avoid crashes when all threads are busy.
        # Pool can block indefinitely on small inputs, so fall back to a
        # single process when there is too little data to justify pooling.
        n_images = data.shape[0]
        features = np.zeros((n_images, self.__vocab_size))
        nprocs = (os.cpu_count() - 1) if os.cpu_count() > 1 else 1
        nprocs = nprocs if n_images > (nprocs * 20) else 1
        self.__logger.info(
            "Computing {} descriptors using {} processes".format(
                self.__name.upper(), nprocs))
        if nprocs > 1:
            subsets = np.array_split(data, nprocs, axis=0)
            pool = Pool(processes=nprocs)
            pool_args = zip(subsets, itertools.repeat(self.__name))
            batch_descriptors = pool.map(_compute_features, pool_args)
            img_descriptors = np.concatenate([d for d in batch_descriptors])
        else:
            img_descriptors = _compute_features((data, self.__name))

        # Reshape data so that we can give k-means a list of descriptors
        # while maintaining a descriptor->image mapping which we will need
        # to generate features later
        img_ids, descriptors = [], []
        for i, img_desc in enumerate(img_descriptors):
            img_ids.extend([i] * len(img_desc))
            descriptors.extend([desc for desc in img_desc])

        # k-means to determine a feature vector for each image
        # Use MiniBatchKmeans for speed
        self.__kmeans = MiniBatchKMeans(
            batch_size=10,  # smaller batch for less memory use
            n_clusters=self.__vocab_size,
            init_size=(3 * self.__vocab_size)).fit(descriptors)

        # We can assume that the image IDs calculated before correspond
        # to the correct k-means label because MiniBatchKMeans preserves order
        # Otherwise we would have to use predict(X) on each image (slow)
        for img_id, cluster_id in zip(img_ids, self.__kmeans.labels_):
            features[img_id, cluster_id] += 1

        if pickle_path is not None:
            utils.save_model(self.__kmeans, pickle_path)

        return features
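The voting loop above turns the keypoint-to-image mapping and the k-means labels into a K-bin histogram per image. A tiny self-contained illustration of that step with made-up data:

import numpy as np

K = 4
img_ids = [0, 0, 0, 1, 1]        # which image each keypoint came from
cluster_ids = [2, 2, 0, 1, 3]    # k-means cluster label per keypoint
features = np.zeros((2, K))
for img_id, cluster_id in zip(img_ids, cluster_ids):
    features[img_id, cluster_id] += 1
# features[0] -> [1., 0., 2., 0.]; features[1] -> [0., 1., 0., 1.]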
Example #20
def train(args, model, optimizer, scheduler, tokenizer, ner_index, *,
          train_loader, valid_df, valid_loader, epoch_length,
          n_epochs=None):
    n_epochs = n_epochs or args.n_epochs

    run_root = Path('../experiments/' + args.run_root)
    model_path = run_root / ('tagger_model-%d.pt' % args.fold)
    best_model_path = run_root / ('best-model-%d.pt' % args.fold)
    if best_model_path.exists():
        state, best_valid_score = load_model(model, best_model_path)
        start_epoch = state['epoch']
        best_epoch = start_epoch
    else:
        best_valid_score = 0
        start_epoch = 0
        best_epoch = 0
    step = 0
    criterion = CrossEntropyLoss().cuda()
    report_each = 10000
    log = run_root.joinpath('train-%d.log' %
                            args.fold).open('at', encoding='utf8')

    for epoch in range(start_epoch, start_epoch + n_epochs):
        model.train()

        tq = tqdm.tqdm(total=epoch_length)
        losses = []

        mean_loss = 0
        device = torch.device("cuda", 0)
        for i, (ori_sen, token, token_type, start, end, insert_pos, start_ner, end_ner) in enumerate(train_loader):
            input_mask = (token > 0).to(device)
            token, input_mask, token_type, start, end, insert_pos, start_ner, end_ner = \
                token.to(device), input_mask.to(device), token_type.to(device), start.to(
                    device), end.to(device), insert_pos.to(device), start_ner.to(device), end_ner.to(device)
            outputs = model(input_ids=token, attention_mask=input_mask, token_type_ids=token_type,
                            start=start, end=end, insert_pos=insert_pos, start_ner=start_ner,
                            end_ner=end_ner)

            loss = outputs[0]
            if (i + 1) % args.step == 0:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            else:
                with amp.scale_loss(loss, optimizer, delay_unscale=True) as scaled_loss:
                    scaled_loss.backward()

            tq.update(args.batch_size)
            losses.append(loss.item() * args.step)
            mean_loss = np.mean(losses[-report_each:])
            tq.set_postfix(loss=f'{mean_loss:.5f}')
            lr = get_learning_rate(optimizer)
            tq.set_description(f'Epoch {epoch}, lr {lr:.6f}')
            if i and i % report_each == 0:
                write_event(log, step, loss=mean_loss)
            # break
        write_event(log, step, epoch=epoch, loss=mean_loss)
        tq.close()

        valid_metrics = validate(model, valid_loader, valid_df, args, tokenizer, ner_index)
        # write_event(log, step, **valid_metrics)
        current_score = valid_metrics['rouge-1']['f']
        if current_score > best_valid_score:
            print('saving new best model')
            save_model(model, epoch, step, mean_loss, model_path)
            best_valid_score = current_score
    return True
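The inner loop above accumulates gradients for args.step batches (apex's delay_unscale=True) before each optimizer step. A rough sketch of the same accumulation pattern with the newer torch.cuda.amp API, offered only as an analogue (loss_fn and all names are placeholders):

import torch

scaler = torch.cuda.amp.GradScaler()

def accumulated_step(model, batch, loss_fn, optimizer, scheduler, i, accum_steps):
    # Forward under autocast; divide so the accumulated gradient is an average
    with torch.cuda.amp.autocast():
        loss = loss_fn(model, batch) / accum_steps
    scaler.scale(loss).backward()
    # Step only every accum_steps batches, mirroring (i + 1) % args.step above
    if (i + 1) % accum_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()
    return loss.item() * accum_steps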
Example #21
    def train(self, model, tr_loader, va_loader, device, adv_train=False):

        args = self.args
        logger = self.logger

        criterion = nn.CrossEntropyLoss()
        opt = torch.optim.Adam(model.parameters(),
                               args.learning_rate,
                               betas=(0.9, 0.999),
                               eps=1e-08,
                               weight_decay=args.weight_decay)
        #scheduler = torch.optim.lr_scheduler.MultiStepLR(opt,
        #                                                 milestones=[2, 4, 6, 7, 8],
        #                                                 gamma=0.1)
        acc = 0.0
        valid_acc = 0.0
        best_acc = 0
        best_va_acc = 0
        running_loss = 0.0
        tr_loss_list = []
        val_loss_list = []

        correct = 0
        total = 0

        for epoch in range(1, args.max_epoch + 1):
            model.train()
            # reset per-epoch statistics so accuracy and loss reflect this epoch
            correct, total, running_loss = 0, 0, 0.0
            for data, label, paths in tr_loader:
                data, label = data.to(device), label.to(device)

                opt.zero_grad()
                output = model(data)

                loss = criterion(output, label)

                loss.backward()
                opt.step()

                running_loss += loss.item()

                _, pred = torch.max(output.data, dim=1)
                correct += (pred == label).sum().item()
                total += label.size(0)

            std_acc = (correct / total) * 100
            tr_loss = running_loss / total
            tr_loss_list.append(tr_loss)

            if va_loader is not None:
                model.eval()
                t1 = time()
                va_acc, va_loss = self.test(model, va_loader, device, False,
                                            True, criterion)

                va_acc = va_acc * 100.0
                val_loss_list.append(va_loss)

                t2 = time()
                logger.info('\n'+'='*20 +' evaluation at epoch: %d '%(epoch) \
                    +'='*20)
                logger.info(
                    'train acc: %.3f %%, train loss: %.3f, validation acc: %.3f %%, valid loss: %.3f, spent: %.3f'
                    % (std_acc, tr_loss, va_acc, va_loss, t2 - t1))
                logger.info('=' * 28 + ' end of evaluation ' + '=' * 28 + '\n')

            acc = std_acc
            if va_loader is not None:
                valid_acc = va_acc

            if acc >= best_acc and valid_acc >= best_va_acc:
                best_acc = acc
                best_va_acc = valid_acc
                file_name = os.path.join(args.model_folder,
                                         'checkpoint_%d.pth' % epoch)
                save_model(model, file_name)
            #for Pytorch 1.0, opt.step() must be called before scheduler.step()
            #scheduler.step()
        plt.plot(tr_loss_list, c='blue', label='Training Loss')
        plt.plot(val_loss_list, c='green', label='Validation Loss')
        plt.xticks(range(1, args.max_epoch + 1))
        plt.ylim((0, 1))
        plt.legend(loc="upper right")
        plt.savefig(os.path.join(args.model_folder, 'loss_plot.png'))
        plt.close()
        print('Best Train Acc: {:4f}, Best Valid Acc: {:4f}'.format(
            best_acc, best_va_acc))
Example #22
train.display_message('+++++++++++++Creating DataLoaders+++++++++++++')
train_ds, valid_ds = train.get_datasets(path_dogs,
                                        human_train,
                                        human_valid,
                                        stats=batch_stat,
                                        size=args.img_size)
bs = args.batch_size
dls = train.get_dls(train_ds, valid_ds, bs=bs)
train.display_message(
    '+++++++++++++Getting Model ready for training+++++++++++++')
model = models.ModelScratch()
device = train.get_device()
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
criterion = loss_func.CustomLoss(train_ds.dog_human_labeller)
recorder = metrics.Recorder()
n_epochs = args.n_epochs
train.run(n_epochs,
          model,
          optimizer,
          criterion,
          dls,
          device,
          recorder,
          max_lr=args.max_lr,
          env='shell')
utils.save_model(
    model,
    f'model_scratch_{n_epochs}_{recorder.valid_acc_breed[-1].item():.2f}',
    train_ds.breed_labeller, train_ds.dog_human_labeller, batch_stat)
Example #23
 def save_model_param_to_file(self, filename):
     to_save = [param.get_value() for param in self.model_params]
     save_model(filename, model=to_save, compress=True)
Example #24
def train(cfg, model):
    criterion = factory.get_criterion(cfg)
    # optim = torch.optim.Adam(model.parameters(), lr=1e-3)
    optim = factory.get_optimizer(cfg, model.parameters())

    best = {
        'loss': float('inf'),
        'score': 0.0,
        'epoch': -1,
    }
    if "resume_from" in cfg.keys() and cfg["resume_from"]:
        detail = utils.load_model(cfg["resume_from"], model, optim=optim)
        best.update({
            'loss': detail['loss'],
            'score': detail['score'],
            'epoch': detail['epoch'],
        })

        # to set lr manually after resumed
        for param_group in optim.param_groups:
            param_group['lr'] = cfg["optimizer"]["param"]["lr"]
        log(f"initial lr {utils.get_lr(optim)}")

    scheduler, is_reduce_lr = factory.get_scheduler(cfg, optim)
    log(f"is_reduce_lr: {is_reduce_lr}")

    loader_train = factory.get_loader_train(cfg)
    loader_valid = factory.get_loader_valid(cfg)

    log('train data: loaded %d records' % len(loader_train.dataset))
    log('valid data: loaded %d records' % len(loader_valid.dataset))

    log('apex %s' % cfg["apex"])
    if cfg["apex"]:
        amp.initialize(model, optim, opt_level='O1')

    for epoch in range(best['epoch'] + 1, cfg["epoch"]):

        log(f'\n----- epoch {epoch} -----')

        run_nn(cfg,
               'train',
               model,
               loader_train,
               criterion=criterion,
               optim=optim,
               apex=cfg["apex"])

        with torch.no_grad():
            val = run_nn(cfg,
                         'valid',
                         model,
                         loader_valid,
                         criterion=criterion)

        detail = {
            'score': val['score'],
            'loss': val['loss'],
            'epoch': epoch,
        }
        if val['loss'] <= best['loss']:
            best.update(detail)
            utils.save_model(model,
                             optim,
                             detail,
                             cfg["fold"],
                             output_dir,
                             best=True)

        utils.save_model(model, optim, detail, cfg["fold"], output_dir)

        log('[best] ep:%d loss:%.4f score:%.4f' %
            (best['epoch'], best['loss'], best['score']))

        if is_reduce_lr:
            scheduler.step(val['loss'])  # reducelronplateau
        else:
            scheduler.step()
Example #25
    def train(self, model, tr_loader, va_loader=None, adv_train=False):
        args = self.args
        logger = self.logger

        opt = torch.optim.Adam(model.parameters(),
                               args.learning_rate,
                               weight_decay=args.weight_decay)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(opt,
                                                         milestones=[100, 150],
                                                         gamma=0.1)
        _iter = 0

        begin_time = time()

        for epoch in range(1, args.max_epoch + 1):
            scheduler.step()
            for data, label in tr_loader:
                data, label = tensor2cuda(data), tensor2cuda(label)

                if adv_train:
                    # When training, the adversarial example is created from a random
                    # close point to the original data point. If in evaluation mode,
                    # just start from the original data point.
                    adv_data = self.attack.perturb(data, label, 'mean', True)
                    # The original model API apparently took an _eval flag;
                    # here we assume model.train() before the forward pass
                    # is the intended equivalent.
                    model.train()
                    output = model(adv_data)
                else:
                    # output = model(data, _eval=False)
                    model.train()
                    output = model(data)

                loss = F.cross_entropy(output, label)

                opt.zero_grad()
                loss.backward()
                opt.step()

                if _iter % args.n_eval_step == 0:
                    t1 = time()

                    if adv_train:
                        # standard accuracy on the clean batch
                        with torch.no_grad():
                            model.eval()
                            stand_output = model(data)
                        pred = torch.max(stand_output, dim=1)[1]
                        std_acc = evaluate(pred.cpu().numpy(),
                                           label.cpu().numpy()) * 100

                        # robust accuracy on the adversarial batch just trained on
                        pred = torch.max(output, dim=1)[1]
                        adv_acc = evaluate(pred.cpu().numpy(),
                                           label.cpu().numpy()) * 100
                    else:
                        # craft adversarial examples only for this evaluation
                        adv_data = self.attack.perturb(data, label, 'mean',
                                                       False)
                        with torch.no_grad():
                            model.eval()
                            adv_output = model(adv_data)
                        pred = torch.max(adv_output, dim=1)[1]
                        adv_acc = evaluate(pred.cpu().numpy(),
                                           label.cpu().numpy()) * 100

                        pred = torch.max(output, dim=1)[1]
                        std_acc = evaluate(pred.cpu().numpy(),
                                           label.cpu().numpy()) * 100

                    t2 = time()

                    print('eval time: %.3f s' % (t2 - t1))

                    logger.info(
                        'epoch: %d, iter: %d, spent %.2f s, tr_loss: %.3f' %
                        (epoch, _iter, time() - begin_time, loss.item()))

                    logger.info(
                        'standard acc: %.3f %%, robustness acc: %.3f %%' %
                        (std_acc, adv_acc))

                    begin_time = time()

                if _iter % args.n_store_image_step == 0:
                    tv.utils.save_image(
                        torch.cat([data.cpu(), adv_data.cpu()], dim=0),
                        os.path.join(args.log_folder, 'images_%d.jpg' % _iter),
                        nrow=16)

                if _iter % args.n_checkpoint_step == 0:
                    file_name = os.path.join(args.model_folder,
                                             'checkpoint_%d.pth' % _iter)
                    save_model(model, file_name)

                _iter += 1

            # step the epoch-level LR scheduler after the optimizer updates
            # (PyTorch expects optimizer.step() before scheduler.step())
            scheduler.step()

            if va_loader is not None:
                t1 = time()
                va_acc, va_adv_acc = self.test(model, va_loader, True, False)
                va_acc, va_adv_acc = va_acc * 100.0, va_adv_acc * 100.0

                t2 = time()
                logger.info('\n' + '=' * 20 +
                            ' evaluation at epoch: %d iteration: %d ' %
                            (epoch, _iter) + '=' * 20)
                logger.info(
                    'test acc: %.3f %%, test adv acc: %.3f %%, spent: %.3f' %
                    (va_acc, va_adv_acc, t2 - t1))
                logger.info('=' * 28 + ' end of evaluation ' + '=' * 28 + '\n')
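
The `attack` object above is only called, never defined in this excerpt; `perturb(data, label, 'mean', True)` crafts adversarial examples, starting from a random point near the input when the last flag is true. Purely as an illustration of what such an attack typically looks like, here is a minimal L-infinity PGD sketch with a matching call shape (the class name, epsilon, alpha, and step count are assumptions, not the repository's values):

import torch
import torch.nn.functional as F

class LinfPGDAttack:
    """Minimal L-inf PGD sketch; not this example's actual attack class."""

    def __init__(self, model, epsilon=8 / 255, alpha=2 / 255, n_steps=10):
        self.model = model
        self.epsilon = epsilon  # radius of the L-inf ball (assumed values)
        self.alpha = alpha      # per-step size
        self.n_steps = n_steps

    def perturb(self, data, label, reduction='mean', random_start=True):
        x = data.clone().detach()
        if random_start:
            # start from a random point inside the epsilon-ball
            x = x + torch.empty_like(x).uniform_(-self.epsilon, self.epsilon)
        for _ in range(self.n_steps):
            x.requires_grad_(True)
            # reduction must yield a scalar ('mean' or 'sum') for autograd.grad
            loss = F.cross_entropy(self.model(x), label, reduction=reduction)
            grad, = torch.autograd.grad(loss, x)
            with torch.no_grad():
                x = x + self.alpha * grad.sign()  # ascend the loss
                # project back into the epsilon-ball around the clean input
                x = data + (x - data).clamp(-self.epsilon, self.epsilon)
                x = x.clamp(0.0, 1.0)  # assumes inputs normalised to [0, 1]
        return x.detach()

Under this reading, passing `False` for the last argument, as the evaluation path above does, starts the ascent from the clean input instead of a random neighbour.
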
Example #26
0
                vmax=run_params['pipeline']['normalisation'][1])
            # evaluate performance of encoder / generator
            model.forward_and_save_one_image(
                validation[index]['image'].unsqueeze(0),
                validation[index]['label'],
                epoch,
                to_mlflow=log_to_mlflow,
                is_remote=remote_run,
                vmin=run_params['pipeline']['normalisation'][0],
                vmax=run_params['pipeline']['normalisation'][1])

        # Checkpoints
        if ('checkpoint_frequency' in run_params
                and epoch % run_params['checkpoint_frequency'] == 0):
            save_model(model,
                       log_to_mlflow=log_to_mlflow,
                       epoch=epoch,
                       is_remote=remote_run)

    print('========= Training ended =========')

    # Test performance
    test_metric = model.evaluate(test_loader, log_to_mlflow=log_to_mlflow)
    test_metrics.append(test_metric)

    # Saving
    save_model(model, log_to_mlflow=log_to_mlflow, is_remote=remote_run)
    mlflow.end_run()

# Average metrics across the different random seeds
avg_metric = dict(pd.DataFrame(test_metrics).mean())
if log_to_mlflow:
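
The example above is cut off mid-statement at `if log_to_mlflow:`. Judging from the surrounding code, a plausible continuation would push the seed-averaged metrics to MLflow; the sketch below is illustrative only (`mlflow.log_metrics` is a real MLflow API, but the metric names and values here are stand-ins):

import mlflow
import pandas as pd

# stand-in values; in the example these come from model.evaluate(...)
test_metrics = [{'roc_auc': 0.91}, {'roc_auc': 0.93}]
log_to_mlflow = False  # flip on to actually log

avg_metric = dict(pd.DataFrame(test_metrics).mean())
if log_to_mlflow:
    with mlflow.start_run():
        # mlflow.log_metrics takes a dict mapping str -> float
        mlflow.log_metrics({k: float(v) for k, v in avg_metric.items()})
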
Example #27
0
def main():

    arg = args()

    if not os.path.exists(arg.exp_name):
        os.makedirs(arg.exp_name)

    assert arg.exp_name.split(
        '/')[0] == 'o', "experiments must live under the 'o' directory, e.g. --exp_name o/..."
    output_dir = arg.exp_name

    save_scripts_in_exp_dir(output_dir)

    logger = logging_set(output_dir)

    logger.info(
        '\n================ experiment name: [{}] ===================\n'.format(
            arg.exp_name))
    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    np.random.seed(0)
    torch.manual_seed(0)

    with open(arg.cfg, 'r') as f:
        config = edict(yaml.safe_load(f))

    if arg.search:
        assert arg.search in [
            'None', 'sync', 'random', 'second_order_gradient',
            'first_order_gradient'
        ]
        config.train.arch_search_strategy = arg.search

    if arg.batchsize:
        logger.info("update batchsize to {}".format(arg.batchsize))
        config.train.batchsize = arg.batchsize

    config.num_workers = arg.num_workers

    print(
        'GPU memory : \ntotal | used\n',
        os.popen(
            'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader'
        ).read())

    logger.info(
        '------------------------------ configuration ---------------------------'
    )
    logger.info(
        '\n==> {} GPUs available, using devices {}, current device {}\n'.format(
            torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"],
            torch.cuda.current_device()))
    logger.info(pprint.pformat(config))
    logger.info(
        '-------------------------------------------------------------------------'
    )

    criterion = MSELoss()

    Arch = bulid_up_network(config, criterion)

    if config.train.arch_search_strategy == 'random':

        logger.info("==>random seed is {}".format(config.train.random_seed))
        np.random.seed(config.train.random_seed)
        torch.manual_seed(config.train.random_seed)
        Arch.arch_parameters_random_search()

    if arg.param_flop:
        Arch._print_info()

    if len(arg.gpu) > 1:
        use_multi_gpu = True
        Arch = torch.nn.DataParallel(Arch).cuda()
    else:
        use_multi_gpu = False
        Arch = Arch.cuda()

    # Arch.module is needed when the model is wrapped in nn.DataParallel
    Search = Search_Arch(Arch.module if use_multi_gpu else Arch, config)
    search_strategy = config.train.arch_search_strategy
    train_queue, arch_queue, valid_queue = Dataloaders(search_strategy, config,
                                                       arg)
    # Note: if the search strategy is `None` or `sync`, the arch_queue is None!

    logger.info(
        "\nNeural Architecture Search strategy is {}".format(search_strategy))
    assert search_strategy in [
        'first_order_gradient', 'random', 'None', 'second_order_gradient',
        'sync'
    ]

    if search_strategy == 'sync':
        # arch_parameters are registered among the model's parameters,
        # so the weight optimizer updates them as well
        logger.info(
            "sync: the arch_parameters are optimized synchronously by the weight optimizer"
        )
        optimizer = torch.optim.Adam(
            Arch.parameters(),
            lr=config.train.w_lr_cosine_begin,
        )

    else:
        # for None, random, and the gradient-based strategies, the
        # arch_parameters are excluded from the weight optimizer
        # (see the filter_arch_parameters sketch after this example)
        optimizer = torch.optim.Adam(
            filter_arch_parameters(Arch),
            lr=config.train.w_lr_cosine_begin,
        )
    if config.train.scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.train.LR_STEP, config.train.LR_FACTOR)
    elif config.train.scheduler_name == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=config.train.epoch_end,
            eta_min=config.train.w_lr_cosine_end)
    else:
        raise ValueError(
            'unknown scheduler: {}'.format(config.train.scheduler_name))

    best = 0  # best evaluation result so far

    logger.info(
        "\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= training =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+="
    )
    begin, end = config.train.epoch_begin, config.train.epoch_end

    if arg.load_ckpt:
        if use_multi_gpu:
            begin, best = load_ckpt(Arch.module, optimizer, scheduler,
                                    output_dir, logger)
        else:
            begin, best = load_ckpt(Arch, optimizer, scheduler, output_dir,
                                    logger)

    for epoch in range(begin, end):

        lr = scheduler.get_last_lr()[0]
        logger.info(
            '==> time: {} -- training... current learning rate is {:.7f}'.format(
                datetime.datetime.now(), lr))

        train(
            epoch,
            train_queue,
            arch_queue,
            Arch,
            Search,
            criterion,
            optimizer,
            lr,
            search_strategy,
            output_dir,
            logger,
            config,
            arg,
        )
        scheduler.step()

        eval_results = evaluate(Arch, valid_queue, config, output_dir)
        if use_multi_gpu:
            best = save_model(epoch, best, eval_results, Arch.module,
                              optimizer, scheduler, output_dir, logger)
        else:

            best = save_model(epoch, best, eval_results, Arch, optimizer,
                              scheduler, output_dir, logger)

        # visualize heatmaps every 5 epochs
        if arg.visualize and epoch % 5 == 0:
            for i in range(len(valid_queue.dataset)):

                if valid_queue.dataset[i][1] != 185250:  # choose an image_id
                    continue
                print(valid_queue.dataset[i][1])
                sample = valid_queue.dataset[i]

                img = sample[0].unsqueeze(0)
                output = Arch(img)
                print(img.size(), output.size())
                visualize_heatamp(img, output, 'heatmaps', show_img=False)
                break
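
`filter_arch_parameters` is used in this example but not defined in the excerpt. One common way to implement it, assuming the architecture parameters can be recognised by having 'arch' in their registered name (a guess this excerpt does not confirm):

def filter_arch_parameters(model):
    """Yield only the weight parameters, skipping architecture parameters.

    A sketch: it assumes architecture parameters are registered with
    'arch' in their names, which this excerpt does not confirm.
    """
    for name, param in model.named_parameters():
        if 'arch' not in name:
            yield param

With `sync`, `Arch.parameters()` hands everything to Adam, so the architecture weights are updated alongside the network weights; the other strategies exclude them, which is exactly what a filter like this provides.
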