def experiment(args):
    track_local_dir = os.path.join(args.logroot, args.experimentname)
    if args.remote:
        track_remote_dir = os.path.join(args.remote, args.projectname,
                                        args.experimentname)
    else:
        track_remote_dir = None
    with track.trial(track_local_dir, track_remote_dir, param_map=vars(args)):
        track.debug("Starting trial")
        do_training(args)
def _main(_):
    with track.trial(os.getenv('TRACK_DIRECTORY'),
                     param_map=track.absl_flags(),
                     trial_prefix=flags.FLAGS.trial_prefix):
        seed_all(flags.FLAGS.seed)
        track.debug('found gpus {}', gpus())
        dataset_file = os.path.join(
            flags.FLAGS.dataroot, 'wikisql',
            'processed-toy{}.pth'.format(1 if flags.FLAGS.toy else 0))
        track.debug('loading data from {}', dataset_file)
        train, val, _ = torch.load(dataset_file)
        track.debug('building model')
        model = wikisql_specific.WikiSQLSpecificModel(train.fields)
        track.debug('built model:\n{}', model)
        num_parameters = int(
            sum(p.numel() for p in model.parameters() if p.requires_grad))
        track.debug('number of parameters in model {}', num_parameters)
        device = get_device()
        torch.save(model.to(torch.device('cpu')),
                   os.path.join(track.trial_dir(), 'untrained_model.pth'))
        model = model.to(device)
        training_state = _TrainingState()
        if flags.FLAGS.restore_checkpoint:
            _copy_best_checkpoint(flags.FLAGS.restore_checkpoint)
            _load_checkpoint(flags.FLAGS.restore_checkpoint, model,
                             training_state)
        params_to_optimize = [p for p in model.parameters() if p.requires_grad]
        if flags.FLAGS.optimizer == 'sgd':
            # lr required here but will be set in _do_training
            optimizer = optim.SGD(params_to_optimize, lr=1,
                                  weight_decay=flags.FLAGS.weight_decay)
        elif flags.FLAGS.optimizer == 'momentum':
            # lr required here but will be set in _do_training
            optimizer = optim.SGD(params_to_optimize, lr=1, momentum=0.9,
                                  weight_decay=flags.FLAGS.weight_decay)
        elif flags.FLAGS.optimizer == 'adam':
            optimizer = optim.Adam(params_to_optimize,
                                   weight_decay=flags.FLAGS.weight_decay)
        else:
            raise ValueError('unrecognized optimizer {}'.format(
                flags.FLAGS.optimizer))
        num_workers = flags.FLAGS.workers
        track.debug('initializing {} workers', num_workers)
        with closing(SharedGPU(optimizer, model, num_workers)) as shared:
            _do_training(train, val, shared, training_state)
def _experiment(experiment_fn, args):
    """
    Launches the track experiment (+/- S3 backup) by calling
    `experiment_fn(args)`, where `args` contains the parsed arguments.
    """
    track_local_dir = os.path.join(args.logroot, args.experimentname)
    if args.s3:
        track_remote_dir = os.path.join(args.s3, args.projectname,
                                        args.experimentname)
    else:
        track_remote_dir = None
    with track.trial(track_local_dir, track_remote_dir, param_map=vars(args)):
        track.debug("Starting experiment!")
        experiment_fn(args)
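# A minimal, hypothetical driver for the `_experiment` wrapper above, assuming
# argparse-style configuration. The flag names mirror the attributes the
# wrapper reads (logroot, experimentname, projectname, s3); the real parser
# lives elsewhere in the examples and may differ, and `my_training_fn` stands
# in for the actual experiment body.
import argparse


def _build_parser():
    parser = argparse.ArgumentParser(description='launch a tracked experiment')
    parser.add_argument('--logroot', default='./logs',
                        help='local directory where track stores trial data')
    parser.add_argument('--experimentname', default='debug',
                        help='subdirectory name for this experiment')
    parser.add_argument('--projectname', default='myproject',
                        help='project prefix used in the remote backup path')
    parser.add_argument('--s3', default=None,
                        help='optional s3://bucket prefix for remote backup')
    return parser


if __name__ == '__main__':
    _experiment(my_training_fn, _build_parser().parse_args())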
                len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss,
                math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
    return cur_loss


# Loop over epochs.
lr = args.lr
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
param_map = {'batch_size': args.batch_size}
with track.trial(args.logroot, None, param_map=param_map):
    try:
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            train_loss = train()
            val_loss = evaluate(val_data)
            print('-' * 89)
            track.debug(
                '| end of epoch {:3d} | time: {:5.2f}s | train loss {:5.2f} | '
                'valid loss {:5.2f} | valid ppl {:8.2f}'.format(
                    epoch, (time.time() - epoch_start_time), train_loss,
                    val_loss, math.exp(val_loss)))
            print('-' * 89)
            track.metric(iteration=epoch, train_loss=train_loss,
                         val_loss=val_loss)
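    # NOTE (assumption): this excerpt cuts off before the handler that the
    # "Ctrl + C" comment above refers to. One minimal way to close the try
    # block, matching that comment, is to catch KeyboardInterrupt and simply
    # stop the epoch loop:
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')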
def main():
    global args, best_prec1
    args = parser.parse_args()
    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    if args.sqrt_lr:
        lr = args.lr * math.sqrt(args.batch_size / 32.)
    else:
        lr = args.lr

    optimizer = torch.optim.SGD(model.parameters(), lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=min(args.batch_size, args.max_samples),
        shuffle=(train_sampler is None),
        num_workers=args.workers,
        pin_memory=True,
        sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.max_samples,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    with track.trial(args.logroot, None,
                     param_map={'batch_size': args.batch_size}):
        for epoch in range(args.start_epoch, args.epochs):
            if args.distributed:
                train_sampler.set_epoch(epoch)
            adjust_learning_rate(optimizer, epoch)

            # train for one epoch
            train_loss = train(train_loader, model, criterion, optimizer,
                               epoch)

            # evaluate on validation set
            with torch.no_grad():
                val_loss, prec1 = validate(val_loader, model, criterion)
            track.metric(iteration=epoch, train_loss=train_loss,
                         test_loss=val_loss, prec=prec1)

            # Log model
            model_fname = os.path.join(track.trial_dir(),
                                       "model{}.ckpt".format(epoch))
            torch.save(model, model_fname)

            # remember best prec@1 and save the model if the validation
            # accuracy is the best we've seen so far
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            if is_best:
                best_fname = os.path.join(track.trial_dir(), "best.ckpt")
                with open(best_fname, 'wb') as f:
                    torch.save(model, f)
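# The epoch loop above calls `adjust_learning_rate(optimizer, epoch)`, whose
# body is not shown in this excerpt. A plausible sketch, assuming the classic
# ImageNet recipe of decaying a base learning rate by 10x every 30 epochs; the
# example's actual helper (and how it interacts with the sqrt-scaled lr above)
# may differ.
def adjust_learning_rate(optimizer, epoch, base_lr=0.1, decay_every=30):
    """Set the lr to base_lr decayed by 10 every `decay_every` epochs."""
    lr = base_lr * (0.1 ** (epoch // decay_every))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr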
    # Save checkpoint.
    acc = 100.0 * correct / total
    if acc > best_acc:
        print("Saving..")
        state = {"net": net.state_dict(), "acc": acc, "epoch": epoch}
        if not os.path.isdir("checkpoint"):
            os.mkdir("checkpoint")
        ckpt_path = os.path.join(track.trial_dir(), "ckpt.pth")
        torch.save(state, ckpt_path)
        best_acc = acc
    test_loss = test_loss / len(testloader)
    return test_loss, acc, best_acc


with track.trial(args.logroot, None, param_map=vars(args)):
    for epoch in range(start_epoch, start_epoch + 200):
        train_loss, train_acc = train(epoch)
        test_loss, test_acc, best_acc = test(epoch)
        track.metric(
            iteration=epoch,
            train_loss=train_loss,
            train_acc=train_acc,
            test_loss=test_loss,
            test_acc=test_acc,
            best_acc=best_acc,
        )
        track.debug(
            f"epoch {epoch} finished with stats: best_acc = {best_acc} | "
            f"train_acc = {train_acc} | test_acc = {test_acc} | "
            f"train_loss = {train_loss} | test_loss = {test_loss}"
        )
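# If training is interrupted, the checkpoint written in the test loop above can
# be used to resume. A minimal sketch, assuming `net` has already been built
# with the same architecture and that the hypothetical `ckpt_path` points at
# the saved "ckpt.pth" (e.g. copied out of a previous trial directory); the
# keys match the `state` dict saved above.
def resume_from_checkpoint(net, ckpt_path):
    state = torch.load(ckpt_path)
    net.load_state_dict(state["net"])
    best_acc = state["acc"]
    start_epoch = state["epoch"] + 1  # continue from the next epoch
    return best_acc, start_epoch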