Example #1
def experiment(args):
    track_local_dir = os.path.join(args.logroot, args.experimentname)
    if args.remote:
        track_remote_dir = os.path.join(args.remote, args.projectname,
                                        args.experimentname)
    else:
        track_remote_dir = None
    with track.trial(track_local_dir, track_remote_dir, param_map=vars(args)):
        track.debug("Starting trial")
        do_training(args)
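
For context, `experiment` above expects an argparse namespace exposing `logroot`, `experimentname`, `remote`, and `projectname`. A hypothetical launcher along those lines (the parser setup below is an assumption, not part of the example) might look like this:

import argparse
import os
import track

def parse_args():
    # Hypothetical parser; only the attribute names come from the example above.
    parser = argparse.ArgumentParser()
    parser.add_argument('--logroot', default='./logs')
    parser.add_argument('--experimentname', default='demo')
    parser.add_argument('--projectname', default='demo-project')
    parser.add_argument('--remote', default=None,
                        help='optional remote prefix (e.g. an S3 URL) for backups')
    return parser.parse_args()

if __name__ == '__main__':
    experiment(parse_args())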
Example #2
def _main(_):
    with track.trial(os.getenv('TRACK_DIRECTORY'),
                     param_map=track.absl_flags(),
                     trial_prefix=flags.FLAGS.trial_prefix):
        seed_all(flags.FLAGS.seed)
        track.debug('found gpus {}', gpus())

        dataset_file = os.path.join(
            flags.FLAGS.dataroot, 'wikisql',
            'processed-toy{}.pth'.format(1 if flags.FLAGS.toy else 0))
        track.debug('loading data from {}', dataset_file)
        train, val, _ = torch.load(dataset_file)

        track.debug('building model')
        model = wikisql_specific.WikiSQLSpecificModel(train.fields)
        track.debug('built model:\n{}', model)
        num_parameters = int(
            sum(p.numel() for p in model.parameters() if p.requires_grad))
        track.debug('number of parameters in model {}', num_parameters)

        device = get_device()
        torch.save(model.to(torch.device('cpu')),
                   os.path.join(track.trial_dir(), 'untrained_model.pth'))
        model = model.to(device)
        training_state = _TrainingState()
        if flags.FLAGS.restore_checkpoint:
            _copy_best_checkpoint(flags.FLAGS.restore_checkpoint)
            _load_checkpoint(flags.FLAGS.restore_checkpoint, model,
                             training_state)
        params_to_optimize = [p for p in model.parameters() if p.requires_grad]
        if flags.FLAGS.optimizer == 'sgd':
            # lr required here but will be set in _do_training
            optimizer = optim.SGD(params_to_optimize,
                                  lr=1,
                                  weight_decay=flags.FLAGS.weight_decay)
        elif flags.FLAGS.optimizer == 'momentum':
            # lr required here but will be set in _do_training
            optimizer = optim.SGD(params_to_optimize,
                                  lr=1,
                                  momentum=0.9,
                                  weight_decay=flags.FLAGS.weight_decay)
        elif flags.FLAGS.optimizer == 'adam':
            optimizer = optim.Adam(params_to_optimize,
                                   weight_decay=flags.FLAGS.weight_decay)
        else:
            raise ValueError('unrecognized optimizer {}'.format(
                flags.FLAGS.optimizer))

        num_workers = flags.FLAGS.workers
        track.debug('initializing {} workers', num_workers)
        with closing(SharedGPU(optimizer, model, num_workers)) as shared:
            _do_training(train, val, shared, training_state)
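
Example #2 reads its configuration from absl flags and records them via `track.absl_flags()`. A minimal, hypothetical entry point for it (the flag definitions below are assumptions; only the flag names appear in the example) could be wired up like this:

from absl import app, flags

# Hypothetical flag definitions; only the names are taken from the example above.
flags.DEFINE_string('trial_prefix', '', 'prefix for the track trial directory')
flags.DEFINE_integer('seed', 1, 'random seed')
flags.DEFINE_string('dataroot', './data', 'root directory for datasets')
flags.DEFINE_boolean('toy', False, 'use the toy WikiSQL split')
flags.DEFINE_integer('workers', 1, 'number of SharedGPU workers')
flags.DEFINE_string('optimizer', 'sgd', "one of 'sgd', 'momentum', 'adam'")
flags.DEFINE_float('weight_decay', 0.0, 'weight decay')
flags.DEFINE_string('restore_checkpoint', '', 'checkpoint to restore, if any')

if __name__ == '__main__':
    app.run(_main)  # absl passes argv, hence the unused argument in _main(_)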
Example #3
def _experiment(experiment_fn, args):
    """
    Launches the track experiment (+/- S3 backup) by calling
    `experiment_fn(args)` where args contains the parsed arguments.
    """
    track_local_dir = os.path.join(args.logroot, args.experimentname)
    if args.s3:
        track_remote_dir = os.path.join(args.s3, args.projectname,
                                        args.experimentname)
    else:
        track_remote_dir = None
    with track.trial(track_local_dir, track_remote_dir, param_map=vars(args)):
        track.debug("Starting experiment!")
        experiment_fn(args)
Example #4
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
    return cur_loss


# Loop over epochs.
lr = args.lr
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
param_map = {'batch_size': args.batch_size}

with track.trial(args.logroot, None, param_map=param_map):
    try:
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            train_loss = train()
            val_loss = evaluate(val_data)
            print('-' * 89)
            track.debug(
                '| end of epoch {:3d} | time: {:5.2f}s | train loss {:5.2f} | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch,
                                           (time.time() - epoch_start_time),
                                           train_loss, val_loss,
                                           math.exp(val_loss)))
            print('-' * 89)
            track.metric(iteration=epoch,
                         train_loss=train_loss,
                         val_loss=val_loss)
Example #5
def main():
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    if args.sqrt_lr:
        lr = args.lr * math.sqrt(args.batch_size / 32.)
    else:
        lr = args.lr

    optimizer = torch.optim.SGD(model.parameters(),
                                lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=min(
                                                   args.batch_size,
                                                   args.max_samples),
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.max_samples,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    with track.trial(args.logroot,
                     None,
                     param_map={'batch_size': args.batch_size}):
        for epoch in range(args.start_epoch, args.epochs):
            if args.distributed:
                train_sampler.set_epoch(epoch)
            adjust_learning_rate(optimizer, epoch)

            # train for one epoch
            train_loss = train(train_loader, model, criterion, optimizer,
                               epoch)

            # evaluate on validation set
            with torch.no_grad():
                val_loss, prec1 = validate(val_loader, model, criterion)

            track.metric(iteration=epoch,
                         train_loss=train_loss,
                         test_loss=val_loss,
                         prec=prec1)
            # Log model
            model_fname = os.path.join(track.trial_dir(),
                                       "model{}.ckpt".format(epoch))
            torch.save(model, model_fname)

            # Remember the best prec@1 and save a checkpoint if this is the
            # best model seen so far.
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            if is_best:
                best_fname = os.path.join(track.trial_dir(), "best.ckpt")
                with open(best_fname, 'wb') as f:
                    torch.save(model, f)
Example #6
    # Save checkpoint.
    acc = 100.0 * correct / total
    if acc > best_acc:
        print("Saving..")
        state = {"net": net.state_dict(), "acc": acc, "epoch": epoch}
        if not os.path.isdir("checkpoint"):
            os.mkdir("checkpoint")
        ckpt_path = os.path.join(track.trial_dir(), "ckpt.pth")
        torch.save(state, ckpt_path)
        best_acc = acc
    test_loss = test_loss / len(testloader)
    return test_loss, acc, best_acc


with track.trial(args.logroot, None, param_map=vars(args)):
    for epoch in range(start_epoch, start_epoch + 200):
        train_loss, train_acc = train(epoch)
        test_loss, test_acc, best_acc = test(epoch)
        track.metric(
            iteration=epoch,
            train_loss=train_loss,
            train_acc=train_acc,
            test_loss=test_loss,
            test_acc=test_acc,
            best_acc=best_acc,
        )
        track.debug(
            f"epoch {epoch} finished with stats: best_acc = {best_acc} | train_acc = {train_acc} | test_acc = {test_acc} | train_loss = {train_loss} | test_loss = {test_loss}"
        )
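
Taken together, the examples share one pattern: open a trial with `track.trial`, log progress with `track.debug`, report per-epoch numbers with `track.metric`, and write artifacts under `track.trial_dir()`. A stripped-down sketch of that pattern (the `train_one_epoch` and `evaluate` callables are placeholder assumptions) looks like this:

import os
import torch
import track

def run(args, model, train_one_epoch, evaluate):
    # Illustrative driver only; the callables stand in for the training code
    # shown in the examples above.
    with track.trial(args.logroot, None, param_map=vars(args)):
        track.debug('starting trial in {}', track.trial_dir())
        for epoch in range(1, args.epochs + 1):
            train_loss = train_one_epoch(model)
            val_loss = evaluate(model)
            track.metric(iteration=epoch,
                         train_loss=train_loss,
                         val_loss=val_loss)
            torch.save(model, os.path.join(track.trial_dir(),
                                           'model{}.ckpt'.format(epoch)))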