Example #1
def simple_fit(model,
               loss_function,
               dataset,
               optimizer,
               epochs,
               lr=0.01,
               weight_decay=0,
               print_interval=1,
               **opt_kwargs):
    optimizer = get_optimizer(optimizer,
                              model,
                              lr=lr,
                              weight_decay=weight_decay,
                              **opt_kwargs)

    iterations = 1
    model.train()
    for epoch_index in range(1, 1 + epochs):
        for data_index, data in enumerate(dataset):
            optimizer.zero_grad()
            loss, monitors = loss_function(model, data)
            loss.backward()
            optimizer.step()
            if iterations % print_interval == 0:
                logger.info(
                    f'Epoch {epoch_index} Index {data_index} (Iteration {iterations}): loss = {loss.item():.4f}, monitors={monitors}.'
                )
            iterations += 1
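
The get_optimizer helper and logger are not defined in these snippets. A minimal sketch, assuming get_optimizer simply resolves a torch.optim name or class (the real helper in the source codebase may differ), followed by a toy call to simple_fit with a hypothetical MSE loss function:

import logging

import torch
import torch.nn as nn
import torch.optim as optim

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # assumed: `logger` is an ordinary library logger


def get_optimizer(spec, model, lr, weight_decay=0, **kwargs):
    # Hypothetical resolver: accept either an optimizer name ('Adam') or a class.
    if isinstance(spec, str):
        spec = getattr(optim, spec)
    return spec(model.parameters(), lr=lr, weight_decay=weight_decay, **kwargs)


def mse_loss_function(model, data):
    # Toy loss returning (loss, monitors), matching what simple_fit expects.
    x, y = data
    loss = nn.functional.mse_loss(model(x), y)
    return loss, {'mse': loss.item()}


dataset = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(10)]
simple_fit(nn.Linear(4, 1), mse_loss_function, dataset, 'Adam', epochs=2, lr=0.05)
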
Example #2
def main(run_id):
    if args.dump_dir is not None:
        if args.runs > 1:
            args.current_dump_dir = os.path.join(args.dump_dir,
                                                 'run_{}'.format(run_id))
            io.mkdir(args.current_dump_dir)
        else:
            args.current_dump_dir = args.dump_dir
        args.checkpoints_dir = os.path.join(args.current_dump_dir, 'checkpoints')
        io.mkdir(args.checkpoints_dir)
        args.summary_file = os.path.join(args.current_dump_dir, 'summary.json')

    logger.info(format_args(args))

    model = Model()
    optimizer = get_optimizer(args.optimizer, model, args.lr)
    if args.accum_grad > 1:
        optimizer = AccumGrad(optimizer, args.accum_grad)

    trainer = MyTrainer.from_args(model, optimizer, args)

    if args.load_checkpoint is not None:
        trainer.load_checkpoint(args.load_checkpoint)

    if args.test_only:
        trainer.current_epoch = 0
        return None, trainer.test()

    graduated = trainer.train()
    trainer.save_checkpoint('last')
    test_meters = trainer.test() if graduated or args.test_not_graduated else None
    return graduated, test_meters
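
Example 2 is written to be invoked once per run (note the per-run dump directories). A hypothetical driver loop, assuming runs are numbered from 1 and results are simply collected:

results = []
for run_id in range(1, args.runs + 1):
    graduated, test_meters = main(run_id)
    results.append((run_id, graduated, test_meters))
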
Example #3
def main(run_id):
    if args.dump_dir is not None:
        if args.runs > 1:
            args.current_dump_dir = os.path.join(args.dump_dir,
                                                 'run_{}'.format(run_id))
            io.mkdir(args.current_dump_dir)
        else:
            args.current_dump_dir = args.dump_dir

        args.summary_file = os.path.join(args.current_dump_dir, 'summary.json')
        args.checkpoints_dir = os.path.join(args.current_dump_dir,
                                            'checkpoints')
        io.mkdir(args.checkpoints_dir)

    logger.info(format_args(args))

    model = Model()
    if args.use_gpu:
        model.cuda()
    optimizer = get_optimizer(args.optimizer, model, args.lr)
    if args.accum_grad > 1:
        optimizer = AccumGrad(optimizer, args.accum_grad)
    trainer = MyTrainer.from_args(model, optimizer, args)

    if args.load_checkpoint is not None:
        trainer.load_checkpoint(args.load_checkpoint)

    if args.test_only:
        return None, trainer.test()

    final_meters = trainer.train()
    trainer.save_checkpoint('last')

    return trainer.early_stopped, trainer.test()
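
Examples 2 and 3 wrap the base optimizer in AccumGrad when args.accum_grad > 1. The real AccumGrad is not shown on this page; the class below is only a rough sketch of what a gradient-accumulation wrapper of that shape might do, and its interface and behaviour are assumptions:

class AccumGradSketch:
    """Apply the wrapped optimizer only every `accum` calls to step()."""

    def __init__(self, optimizer, accum):
        self._optimizer = optimizer
        self._accum = accum
        self._counter = 0

    def zero_grad(self):
        # Keep accumulated gradients between real steps; clear them only
        # right after the wrapped optimizer has actually stepped.
        if self._counter == 0:
            self._optimizer.zero_grad()

    def step(self):
        self._counter += 1
        if self._counter >= self._accum:
            self._optimizer.step()
            self._counter = 0

    @property
    def param_groups(self):
        return self._optimizer.param_groups

In practice the accumulated gradients are usually also averaged, e.g. by scaling the loss by 1 / accum, which this sketch omits.
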
Example #4
def __init__(self,
             model,
             optimizer,
             lr=0.01,
             weight_decay=0,
             **opt_kwargs):
    optimizer = get_optimizer(optimizer,
                              model,
                              lr=lr,
                              weight_decay=weight_decay,
                              **opt_kwargs)
    self._model = model
    self._optimizer = optimizer
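
Example 4 is a constructor taken from some wrapper class; a short usage sketch, assuming a minimal surrounding class (the class name is hypothetical) and the get_optimizer helper sketched above:

import torch.nn as nn

class SimpleTrainer:
    def __init__(self, model, optimizer, lr=0.01, weight_decay=0, **opt_kwargs):
        # Same body as Example 4: resolve the optimizer spec, then keep references.
        optimizer = get_optimizer(optimizer, model, lr=lr,
                                  weight_decay=weight_decay, **opt_kwargs)
        self._model = model
        self._optimizer = optimizer

trainer = SimpleTrainer(nn.Linear(4, 1), 'SGD', lr=0.1, weight_decay=1e-4)
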
Example #5
def main():
    if args.dump_dir is not None:
        args.current_dump_dir = args.dump_dir

        args.summary_file = os.path.join(args.current_dump_dir, 'summary.json')
        args.checkpoints_dir = os.path.join(args.current_dump_dir,
                                            'checkpoints')
        io.mkdir(args.checkpoints_dir)

    # Record the exact command line used for this run.
    with open(os.path.join(args.current_dump_dir, 'exp.sh'), 'a') as exp_fh:
        print('jac-run {}'.format(' '.join(sys.argv)), file=exp_fh)

    logger.info('jac-run {}'.format(' '.join(sys.argv)))
    logger.info(format_args(args))
    print(args.solution_count)
    model = models.get_model(args)

    if args.use_gpu:
        model.cuda()

    optimizer = get_optimizer(args.optimizer,
                              model,
                              args.lr,
                              weight_decay=args.wt_decay)

    trainer = MyTrainer.from_args(model, optimizer, args)
    trainer.num_iters = 0
    trainer.num_bad_updates = 0
    trainer.test_batch_size = args.test_batch_size
    trainer.mode = 'warmup'
    trainer.checkpoint_mode = "warmup"
    trainer._latent_model = None
    trainer._static_model = None

    skip_warmup = False
    if args.load_checkpoint is not None:
        extra = trainer.load_checkpoint(args.load_checkpoint)
        #skip_warmup = extra is not None and (extra['name'] == 'last_warmup')
        skip_warmup = args.skip_warmup

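    # Plateau-based LR scheduler; patience is measured in test intervals, and the
    # scheduler is re-created below with hot-phase settings after warmup.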
    my_lr_scheduler = scheduler.CustomReduceLROnPlateau(
        trainer._optimizer, {
            'mode': 'min',
            'factor': 0.2,
            'patience': math.ceil(7 / args.test_interval),
            'verbose': True,
            'threshold': 0.0001,
            'threshold_mode': 'rel',
            'cooldown': 0,
            'min_lr': 0.01 * args.lr,
            'eps': 0.0000001
        },
        maxPatienceToStopTraining=math.ceil(20 / args.test_interval))

    trainer.my_lr_scheduler = my_lr_scheduler

    if args.test_only:
        #
        # trainer.load_latent_samples(os.path.join(
        # args.current_dump_dir, "latent_z_samples.pkl"))
        trainer.pred_dump = []
        trainer.reset_test()
        rv = trainer.test()
        #with open(os.path.join(args.current_dump_dir, "pred_dump.pkl"), "wb") as f:
        #    pickle.dump(trainer.pred_dump, f)
        trainer.dump_errors(force=True)
        with open(os.path.join(args.current_dump_dir, 'results.out'),
                  "w") as f:
            print(rv[0].avg['corrected accuracy'], file=f)

        test_at_end(trainer)
        return None, rv

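    # Warmup phase: train the base model for the warmup epochs, unless resuming
    # from a checkpoint with args.skip_warmup set.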
    if not skip_warmup:
        warmup_meters, warmup_test_meters = trainer.train(
            1, args.warmup_epochs)
        trainer.save_checkpoint('last_warmup')
    else:
        logger.info("Skipping warmup")

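    # "Hot" phase: attach a latent model (and, unless args.no_static is set, a
    # static copy of the base model), optionally pretrain phi, then keep training.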
    if args.epochs > 0:
        # define latent model
        # clone the main model
        # set the optimizer
        if skip_warmup:
            trainer._prepare_dataset(args.epoch_size, 'train')
        #
        trainer.checkpoint_mode = "hot"
        trainer.best_accuracy = -1
        args.min_loss = 0

        trainer._latent_model = models.get_latent_model(args, trainer.model)
        trainer._latent_model.train()
        if not args.no_static:
            trainer._static_model = copy.deepcopy(trainer._model)
        trainer._latent_optimizer = get_optimizer(
            args.optimizer,
            trainer._latent_model,
            args.lr_latent,
            weight_decay=args.latent_wt_decay)

        trainer.mode = "hot"

        # switch off training mode only after pretraining phi
        # since pretraining phi requires training statistics
        if not args.no_static:
            trainer._static_model.eval()
            #trainer._static_model.training = True
        #
        # if skip_warmup:
        #    extra = trainer.load_checkpoint(args.load_checkpoint)
        trainer.datasets['train'].reset_sampler(args.hot_data_sampling)
        #trainer.datasets["train"].data_sampling = args.hot_data_sampling

        if not args.no_static:
            trainer._static_model.train()
        if args.pretrain_phi > 0:
            my_lr_scheduler.maxPatienceToStopTraining = 10000
            for x in trainer._optimizer.param_groups:
                x['lr'] = 0.0
            _ = trainer.train(args.warmup_epochs + 1, args.pretrain_phi)

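        # Re-create the main optimizer and LR scheduler with the hot-phase
        # learning rate (args.lr_hot) before the final training stage.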
        trainer.best_accuracy = -1

        trainer._optimizer = get_optimizer(args.optimizer,
                                           trainer.model,
                                           args.lr_hot,
                                           weight_decay=args.wt_decay)

        my_lr_scheduler = scheduler.CustomReduceLROnPlateau(
            trainer._optimizer, {
                'mode': 'min',
                'factor': 0.2,
                'patience': math.ceil(7 / args.test_interval),
                'verbose': True,
                'threshold': 0.01,
                'threshold_mode': 'rel',
                'cooldown': 0,
                'min_lr': 0.01 * args.lr_hot,
                'eps': 0.0000001
            },
            maxPatienceToStopTraining=math.ceil(25 / args.test_interval))
        trainer.my_lr_scheduler = my_lr_scheduler

        final_meters = trainer.train(
            args.warmup_epochs + args.pretrain_phi + 1, args.epochs)
        trainer.save_checkpoint('last')

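    # Reload the best checkpoint, then run the final evaluation and dump results.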
    trainer.load_checkpoint(
        os.path.join(args.checkpoints_dir, 'checkpoint_best.pth'))
    logger.info("Best Dev Accuracy: {}".format(trainer.best_accuracy))

    trainer.reset_test()
    ret = trainer.test()
    trainer.dump_errors(force=True)
    with open(os.path.join(args.current_dump_dir, 'results.out'), "w") as f:
        print(trainer.best_accuracy, ret[0].avg['corrected accuracy'], file=f)

    test_at_end(trainer)
    return ret
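
CustomReduceLROnPlateau is specific to this codebase, but its configuration dict mirrors torch.optim.lr_scheduler.ReduceLROnPlateau, with maxPatienceToStopTraining apparently bounding how long training may continue without improvement. A rough equivalent using the standard PyTorch scheduler (the early-stop bookkeeping of the custom class is not reproduced here):

import math

from torch.optim.lr_scheduler import ReduceLROnPlateau


def make_plateau_scheduler(optimizer, base_lr, test_interval):
    # Mirrors the dict passed to CustomReduceLROnPlateau in Example 5.
    return ReduceLROnPlateau(optimizer,
                             mode='min',
                             factor=0.2,
                             patience=math.ceil(7 / test_interval),
                             threshold=1e-4,
                             threshold_mode='rel',
                             cooldown=0,
                             min_lr=0.01 * base_lr,
                             eps=1e-7)

The scheduler would then be stepped on the monitored validation loss after each test interval, e.g. scheduler.step(val_loss).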