def main(args):
    """Train a 3D reconstruction decoder on a particle stack.

    Sets up logging, seeds, device, data, model, and optimizer from the parsed
    ``args`` namespace, optionally resumes from a checkpoint, runs the training
    loop (with optional pose SGD, CTF correction, AMP, and multi-GPU), and
    writes per-epoch and final checkpoints into ``args.outdir``.
    """
    t1 = dt.now()
    if args.outdir is not None and not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    LOG = f'{args.outdir}/run.log'

    def flog(msg):  # HACK: switch to logging module
        return utils.flog(msg, LOG)

    # 'latest' asks get_latest to resolve the most recent checkpoint in outdir
    if args.load == 'latest':
        args = get_latest(args, flog)
    flog(' '.join(sys.argv))
    flog(args)

    # set the random seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    ## set the device
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    flog('Use cuda {}'.format(use_cuda))
    if use_cuda:
        torch.set_default_tensor_type(torch.cuda.FloatTensor)
    else:
        flog('WARNING: No GPUs detected')

    # load the particles (optionally restricted to a pickled index subset)
    if args.ind is not None:
        flog('Filtering image dataset with {}'.format(args.ind))
        # use a context manager so the pickle file handle is closed promptly
        with open(args.ind, 'rb') as f:
            ind = pickle.load(f)
    else:
        ind = None
    if args.lazy:
        data = dataset.LazyMRCData(args.particles, norm=args.norm,
                                   invert_data=args.invert_data, ind=ind,
                                   window=args.window, datadir=args.datadir,
                                   relion31=args.relion31)
    else:
        data = dataset.MRCData(args.particles, norm=args.norm,
                               invert_data=args.invert_data, ind=ind,
                               window=args.window, datadir=args.datadir,
                               relion31=args.relion31)
    D = data.D      # box size (pixels + 1, per lattice convention below)
    Nimg = data.N   # number of particle images

    # instantiate model
    if args.pe_type != 'none':
        # positional encodings assume the default lattice extent
        assert args.l_extent == 0.5
    lattice = Lattice(D, extent=args.l_extent)
    activation = {"relu": nn.ReLU, "leaky_relu": nn.LeakyReLU}[args.activation]
    model = models.get_decoder(3, D, args.layers, args.dim, args.domain,
                               args.pe_type, enc_dim=args.pe_dim,
                               activation=activation)
    flog(model)
    flog('{} parameters in model'.format(
        sum(p.numel() for p in model.parameters() if p.requires_grad)))

    # optimizer
    optim = torch.optim.Adam(model.parameters(), lr=args.lr,
                             weight_decay=args.wd)

    # load weights (resume training from a saved checkpoint)
    if args.load:
        flog('Loading model weights from {}'.format(args.load))
        checkpoint = torch.load(args.load)
        model.load_state_dict(checkpoint['model_state_dict'])
        optim.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        # resuming past the requested epoch count would run zero epochs
        assert start_epoch < args.num_epochs
    else:
        start_epoch = 0

    # load poses
    if args.do_pose_sgd:
        assert args.domain == 'hartley', \
            "Need to use --domain hartley if doing pose SGD"
        posetracker = PoseTracker.load(args.poses, Nimg, D, args.emb_type, ind)
        # SparseAdam: pose embeddings produce sparse gradients
        pose_optimizer = torch.optim.SparseAdam(posetracker.parameters(),
                                                lr=args.pose_lr)
    else:
        posetracker = PoseTracker.load(args.poses, Nimg, D, None, ind)

    # load CTF
    if args.ctf is not None:
        flog('Loading ctf params from {}'.format(args.ctf))
        ctf_params = ctf.load_ctf_for_training(D - 1, args.ctf)
        if args.ind is not None:
            ctf_params = ctf_params[ind]  # keep CTF rows aligned with the subset
        ctf_params = torch.tensor(ctf_params)
    else:
        ctf_params = None
    # presumably column 0 holds the pixel size in Angstroms — TODO confirm
    Apix = ctf_params[0, 0] if ctf_params is not None else 1

    # save configuration
    out_config = f'{args.outdir}/config.pkl'
    save_config(args, data, lattice, model, out_config)

    # Mixed precision training with AMP
    if args.amp:
        # AMP tensor-core kernels want dimensions divisible by 8
        assert args.batch_size % 8 == 0
        assert (D - 1) % 8 == 0
        assert args.dim % 8 == 0
        # Also check zdim, enc_mask dim?
        model, optim = amp.initialize(model, optim, opt_level='O1')

    # parallelize
    if args.multigpu and torch.cuda.device_count() > 1:
        flog(f'Using {torch.cuda.device_count()} GPUs!')
        # scale the batch size so each GPU keeps its per-device batch
        args.batch_size *= torch.cuda.device_count()
        flog(f'Increasing batch size to {args.batch_size}')
        model = nn.DataParallel(model)
    elif args.multigpu:
        flog(
            f'WARNING: --multigpu selected, but {torch.cuda.device_count()} GPUs detected'
        )

    # train
    data_generator = DataLoader(data, batch_size=args.batch_size, shuffle=True)
    for epoch in range(start_epoch, args.num_epochs):
        t2 = dt.now()
        loss_accum = 0
        batch_it = 0
        # renamed from `ind` to avoid shadowing the dataset-filter indices above
        for batch, batch_ind in data_generator:
            batch_it += len(batch_ind)
            y = batch.to(device)
            batch_ind = batch_ind.to(device)
            if args.do_pose_sgd:
                pose_optimizer.zero_grad()
            r, t = posetracker.get_pose(batch_ind)
            c = ctf_params[batch_ind] if ctf_params is not None else None
            # reuse `y` instead of calling batch.to(device) a second time
            loss_item = train(model, lattice, optim, y, r, t, c,
                              use_amp=args.amp)
            if args.do_pose_sgd and epoch >= args.pretrain:
                pose_optimizer.step()
            loss_accum += loss_item * len(batch_ind)
            if batch_it % args.log_interval == 0:
                flog('# [Train Epoch: {}/{}] [{}/{} images] loss={:.6f}'.format(
                    epoch + 1, args.num_epochs, batch_it, Nimg, loss_item))
        flog('# =====> Epoch: {} Average loss = {:.6}; Finished in {}'.format(
            epoch + 1, loss_accum / Nimg, dt.now() - t2))
        if args.checkpoint and epoch % args.checkpoint == 0:
            out_mrc = '{}/reconstruct.{}.mrc'.format(args.outdir, epoch)
            out_weights = '{}/weights.{}.pkl'.format(args.outdir, epoch)
            save_checkpoint(model, lattice, optim, epoch, data.norm, Apix,
                            out_mrc, out_weights)
            if args.do_pose_sgd and epoch >= args.pretrain:
                out_pose = '{}/pose.{}.pkl'.format(args.outdir, epoch)
                posetracker.save(out_pose)

    ## save model weights and evaluate the model on 3D lattice
    out_mrc = '{}/reconstruct.mrc'.format(args.outdir)
    out_weights = '{}/weights.pkl'.format(args.outdir)
    save_checkpoint(model, lattice, optim, epoch, data.norm, Apix, out_mrc,
                    out_weights)
    if args.do_pose_sgd and epoch >= args.pretrain:
        out_pose = '{}/pose.pkl'.format(args.outdir)
        posetracker.save(out_pose)
    td = dt.now() - t1
    # fixed typo: "Finsihed" -> "Finished"
    flog('Finished in {} ({} per epoch)'.format(
        td, td / (args.num_epochs - start_epoch)))
def main(args):
    """Train a 3D reconstruction decoder on a particle stack.

    Simpler training entry point: seeds RNGs, selects the device, loads the
    particle data, builds the decoder/optimizer (optionally resuming from a
    checkpoint), runs the training loop with optional pose SGD and CTF
    correction, and writes checkpoints into ``args.outdir``.
    """
    log(args)
    t1 = dt.now()
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # set the random seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    ## set the device
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    log('Use cuda {}'.format(use_cuda))
    if use_cuda:
        torch.set_default_tensor_type(torch.cuda.FloatTensor)

    # load the particles (optionally restricted to a pickled index subset)
    if args.ind is not None:
        log('Filtering image dataset with {}'.format(args.ind))
        # use a context manager so the pickle file handle is closed promptly
        with open(args.ind, 'rb') as f:
            ind = pickle.load(f)
    else:
        ind = None
    if args.lazy:
        data = dataset.LazyMRCData(args.particles, norm=args.norm,
                                   invert_data=args.invert_data, ind=ind,
                                   window=args.window, datadir=args.datadir)
    else:
        data = dataset.MRCData(args.particles, norm=args.norm,
                               invert_data=args.invert_data, ind=ind,
                               window=args.window, datadir=args.datadir)
    D = data.D      # box size
    Nimg = data.N   # number of particle images

    # instantiate model
    if args.pe_type != 'none':
        # positional encodings assume the default lattice extent
        assert args.l_extent == 0.5
    lattice = Lattice(D, extent=args.l_extent)
    model = models.get_decoder(3, D, args.layers, args.dim, args.domain,
                               args.pe_type, nn.ReLU)
    log(model)
    log('{} parameters in model'.format(
        sum(p.numel() for p in model.parameters() if p.requires_grad)))

    # optimizer
    optim = torch.optim.Adam(model.parameters(), lr=args.lr,
                             weight_decay=args.wd)

    # load weights (resume training from a saved checkpoint)
    if args.load:
        log('Loading model weights from {}'.format(args.load))
        checkpoint = torch.load(args.load)
        model.load_state_dict(checkpoint['model_state_dict'])
        optim.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        # resuming past the requested epoch count would run zero epochs
        assert start_epoch < args.num_epochs
    else:
        start_epoch = 0

    # load poses
    if args.do_pose_sgd:
        posetracker = PoseTracker.load(args.poses, Nimg, D, args.emb_type, ind)
        # SparseAdam: pose embeddings produce sparse gradients
        pose_optimizer = torch.optim.SparseAdam(posetracker.parameters(),
                                                lr=args.pose_lr)
    else:
        posetracker = PoseTracker.load(args.poses, Nimg, D, None, ind)

    # load CTF
    if args.ctf is not None:
        log('Loading ctf params from {}'.format(args.ctf))
        ctf_params = ctf.load_ctf_for_training(D - 1, args.ctf)
        if args.ind is not None:
            ctf_params = ctf_params[ind]  # keep CTF rows aligned with the subset
        ctf_params = torch.tensor(ctf_params)
    else:
        ctf_params = None
    # presumably column 0 holds the pixel size in Angstroms — TODO confirm
    Apix = ctf_params[0, 0] if ctf_params is not None else 1

    # train
    data_generator = DataLoader(data, batch_size=args.batch_size, shuffle=True)
    for epoch in range(start_epoch, args.num_epochs):
        t2 = dt.now()
        loss_accum = 0
        batch_it = 0
        # renamed from `ind` to avoid shadowing the dataset-filter indices above
        for batch, batch_ind in data_generator:
            batch_it += len(batch_ind)
            y = batch.to(device)
            batch_ind = batch_ind.to(device)
            if args.do_pose_sgd:
                pose_optimizer.zero_grad()
            r, t = posetracker.get_pose(batch_ind)
            c = ctf_params[batch_ind] if ctf_params is not None else None
            # reuse `y` instead of calling batch.to(device) a second time
            loss_item = train(model, lattice, optim, y, r, t, c)
            if args.do_pose_sgd and epoch >= args.pretrain:
                pose_optimizer.step()
            loss_accum += loss_item * len(batch_ind)
            if batch_it % args.log_interval == 0:
                log('# [Train Epoch: {}/{}] [{}/{} images] loss={:.6f}'.format(
                    epoch + 1, args.num_epochs, batch_it, Nimg, loss_item))
        log('# =====> Epoch: {} Average loss = {:.6}; Finished in {}'.format(
            epoch + 1, loss_accum / Nimg, dt.now() - t2))
        if args.checkpoint and epoch % args.checkpoint == 0:
            out_mrc = '{}/reconstruct.{}.mrc'.format(args.outdir, epoch)
            out_weights = '{}/weights.{}.pkl'.format(args.outdir, epoch)
            save_checkpoint(model, lattice, optim, epoch, data.norm, Apix,
                            out_mrc, out_weights)
            if args.do_pose_sgd and epoch >= args.pretrain:
                out_pose = '{}/pose.{}.pkl'.format(args.outdir, epoch)
                posetracker.save(out_pose)

    ## save model weights and evaluate the model on 3D lattice
    out_mrc = '{}/reconstruct.mrc'.format(args.outdir)
    out_weights = '{}/weights.pkl'.format(args.outdir)
    save_checkpoint(model, lattice, optim, epoch, data.norm, Apix, out_mrc,
                    out_weights)
    if args.do_pose_sgd and epoch >= args.pretrain:
        out_pose = '{}/pose.pkl'.format(args.outdir)
        posetracker.save(out_pose)
    td = dt.now() - t1
    # fixed typo: "Finsihed" -> "Finished"
    log('Finished in {} ({} per epoch)'.format(
        td, td / (args.num_epochs - start_epoch)))