class CometMLMonitor(MonitorBase):
    """
    Send data to https://www.comet.ml.

    Note:
        1. comet_ml requires you to `import comet_ml` before importing tensorflow or tensorpack.
        2. The "automatic output logging" feature of comet_ml will make the training progress bar appear to freeze.
           Therefore the feature is disabled by default.
    """
    def __init__(self, experiment=None, api_key=None, tags=None, **kwargs):
        """
        Args:
            experiment (comet_ml.Experiment): if provided, invalidate all other arguments
            api_key (str): your comet.ml API key
            tags (list[str]): experiment tags
            kwargs: other arguments passed to :class:`comet_ml.Experiment`.
        """
        if experiment is not None:
            # Caller supplied a ready-made Experiment; the other arguments
            # must not be set at the same time.
            self._exp = experiment
            assert api_key is None and tags is None and len(kwargs) == 0
        else:
            # Imported lazily so that merely importing this module does not
            # pull in comet_ml (see the import-order note in the class doc).
            from comet_ml import Experiment
            kwargs.setdefault(
                'log_code', True
            )  # though it's not functioning, git patch logging requires it
            # None disables comet_ml's stdout/stderr capture, which otherwise
            # freezes the training progress bar (see class docstring).
            kwargs.setdefault('auto_output_logging', None)
            self._exp = Experiment(api_key=api_key, **kwargs)
            if tags is not None:
                self._exp.add_tags(tags)

        # Replace comet's source-code capture with a placeholder message.
        self._exp.set_code(
            "Code logging is impossible because there are too many files ...")
        self._exp.log_dependency('tensorpack', __git_version__)

    @property
    def experiment(self):
        """
        The :class:`comet_ml.Experiment` instance.
        """
        return self._exp

    def _before_train(self):
        # Upload the (default) TF graph once, before training starts.
        self._exp.set_model_graph(tf.get_default_graph())

    @HIDE_DOC
    def process_scalar(self, name, val):
        # Forward every scalar summary to comet, keyed by the trainer's
        # global step.
        self._exp.log_metric(name, val, step=self.global_step)

    def _after_train(self):
        # Mark the comet experiment as finished.
        self._exp.end()

    def _after_epoch(self):
        self._exp.log_epoch_end(self.epoch_num)
def main():
    """Train PPO on CartPole-v1, logging score/epsilon to comet.ml.

    Relies on module-level names: gym, PPO, Log, Experiment, args
    (epsilon, T_horizon), torch, Categorical, random, save_model.
    """
    env = gym.make('CartPole-v1')
    model = PPO()
    score = 0.0
    print_interval = 20
    # Log file named after this script (strips the ".py" suffix).
    log = Log(__file__[:-3])
    # NOTE(review): hard-coded API key checked into source — should come
    # from an environment variable or config.
    experiment = Experiment(api_key="F8yfdGljIExZoi73No4gb1gF5",
                            project_name="reinforcement-learning",
                            workspace="zombasy")
    experiment.set_model_graph(model)

    for n_epi in range(2000):
        s = env.reset()
        done = False
        # Linearly decayed exploration rate, floored at 0.01.
        epsilon = max(0.01, args.epsilon - 0.01 * (n_epi / 200))

        while not done:
            # Collect up to T_horizon transitions, then run a PPO update.
            for t in range(args.T_horizon):
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()
                # Epsilon-greedy override of the sampled action
                # (assumes a 2-action space — TODO confirm).
                coin = random.random()
                if coin < epsilon:
                    a = random.randint(0, 1)

                s_prime, r, done, info = env.step(a)
                # Reward is scaled by 1/100 before storage.
                model.put_data(
                    (s, a, r / 100.0, s_prime, prob[a].item(), done))
                s = s_prime
                score += r
                if done:
                    break

            model.train_net()

        if n_epi % print_interval == 0 and n_epi != 0:
            log.info("episode :{}, avg score : {:.1f}".format(
                n_epi, score / print_interval))
            experiment.log_metric('score', score / print_interval)
            experiment.log_metric('epsilon', epsilon)
            score = 0.0

        # Checkpoint every 500 episodes.
        if n_epi % 500 == 0 and n_epi != 0:
            save_model(model, 'ppo', n_epi, experiment)

    env.close()
def train(hyper_params):
    """Train the MNIST graph and log parameters/metrics to Comet.ml.

    Args:
        hyper_params (dict): must contain "steps" and "batch_size"; the full
            dict is also forwarded to ``build_model_graph``.

    Side effects: creates a comet_ml Experiment, logs hyper-parameters, the
    dataset hash, per-step loss and periodic accuracy, and prints progress.
    """
    mnist = get_data()

    # Get graph definition, tensors and ops
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(
        hyper_params)

    # log parameters to Comet.ml
    import os

    # Setting the API key (saved as environment variable)
    exp = Experiment(
        api_key="<HIDDEN>",
        # or
        # api_key=os.environ.get("COMET_API_KEY"),
        project_name="prototype",
        workspace="jaimemarijke")
    exp.log_parameters(hyper_params)
    exp.log_dataset_hash(mnist)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        exp.set_model_graph(sess.graph)

        for i in range(hyper_params["steps"]):
            batch = mnist.train.next_batch(hyper_params["batch_size"])
            exp.set_step(i)
            # Compute train accuracy every 10 steps
            if i % 10 == 0:
                train_accuracy = accuracy.eval(feed_dict={
                    x: batch[0],
                    y_: batch[1]
                })
                print('step %d, training accuracy %g' % (i, train_accuracy))
                exp.log_metric("acc", train_accuracy)

            # Update weights (back propagation).
            # BUG FIX: `train_step` is a tf.Operation and Operation.run()
            # returns None, so the old code logged loss=None on every step.
            # Fetch the cross-entropy tensor in the same run() call instead.
            _, loss = sess.run([train_step, cross_entropy],
                               feed_dict={x: batch[0], y_: batch[1]})
            exp.log_metric("loss", loss)

        ### Finished Training ###

        # Compute test accuracy on the held-out split.
        acc = accuracy.eval(feed_dict={
            x: mnist.test.images,
            y_: mnist.test.labels
        })
        print('test accuracy %g' % acc)
def train(hyper_params):
    """Train the MNIST graph, logging to Comet.ml train/test contexts.

    Builds the graph via ``build_model_graph``, records hyper-parameters and
    the dataset hash, alternates weight updates with accuracy logging every
    10 steps, and finishes with a test-set evaluation.
    """
    mnist_data = get_data()

    # Graph pieces: optimizer op, loss tensor, accuracy tensor, placeholders.
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(
        hyper_params)

    experiment = Experiment(project_name="tf")
    experiment.log_parameters(hyper_params)
    experiment.log_dataset_hash(mnist_data)

    with tf.Session() as session:
        with experiment.train():
            session.run(tf.global_variables_initializer())
            experiment.set_model_graph(session.graph)

            for step_idx in range(hyper_params["steps"]):
                minibatch = mnist_data.train.next_batch(
                    hyper_params["batch_size"])
                experiment.set_step(step_idx)
                feed = {x: minibatch[0], y_: minibatch[1]}

                # Every 10th step also report training accuracy.
                if step_idx % 10 == 0:
                    batch_accuracy = accuracy.eval(feed_dict=feed)
                    print('step %d, training accuracy %g' %
                          (step_idx, batch_accuracy))
                    experiment.log_metric("accuracy", batch_accuracy,
                                          step=step_idx)

                # One optimization step; fetch the loss in the same run call.
                _, loss_value = session.run([train_step, cross_entropy],
                                            feed_dict=feed)
                experiment.log_metric("loss", loss_value, step=step_idx)

        ### Finished Training ###

        with experiment.test():
            # Final evaluation on the held-out test split.
            test_accuracy = accuracy.eval(feed_dict={
                x: mnist_data.test.images,
                y_: mnist_data.test.labels
            })
            experiment.log_metric("accuracy", test_accuracy)
            print('test accuracy %g' % test_accuracy)
class CometMlAdapter(BaseAdapter):
    """Adapter that forwards logging calls to a comet_ml Experiment."""

    def __init__(self, api_key, project_name, experiment_name):
        """Create the underlying Experiment and give it a display name."""
        exp = Experiment(api_key=api_key, project_name=project_name)
        exp.set_name(experiment_name)
        self.experiment = exp

    def log_parameters(self, hyper_params):
        """Forward a dict of hyper-parameters to the experiment."""
        self.experiment.log_parameters(hyper_params)

    def set_model_graph(self, graph):
        """Forward the model-graph description to the experiment."""
        self.experiment.set_model_graph(graph)

    def log_metric(self, name, metric, step):
        """Forward a single scalar metric recorded at `step`."""
        self.experiment.log_metric(name, metric, step=step)

    def register(self, name):
        """No-op for this backend."""
        pass
def train(hyper_params):
    """Train the MNIST graph and log metrics to Comet.ml.

    Args:
        hyper_params (dict): must contain "steps" and "batch_size"; the full
            dict is also forwarded to ``build_model_graph``.

    Side effects: creates a comet_ml Experiment, logs hyper-parameters, the
    dataset hash, per-step loss and periodic accuracy, and prints progress.
    """
    mnist = get_data()

    # Get graph definition, tensors and ops
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(
        hyper_params)

    #log parameters to Comet.ml
    exp = Experiment(api_key="YOUR-API-KEY",
                     project_name='tensorflow examples')
    # NOTE(review): log_multiple_params is the old comet_ml SDK name; newer
    # SDKs call this log_parameters — confirm the pinned comet_ml version.
    exp.log_multiple_params(hyper_params)
    exp.log_dataset_hash(mnist)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        exp.set_model_graph(sess.graph)

        for i in range(hyper_params["steps"]):
            batch = mnist.train.next_batch(hyper_params["batch_size"])
            exp.set_step(i)
            # Compute train accuracy every 10 steps
            if i % 10 == 0:
                train_accuracy = accuracy.eval(feed_dict={
                    x: batch[0],
                    y_: batch[1]
                })
                print('step %d, training accuracy %g' % (i, train_accuracy))
                exp.log_metric("acc", train_accuracy)

            # Update weights (back propagation).
            # BUG FIX: `train_step` is a tf.Operation and Operation.run()
            # returns None, so the old code logged loss=None on every step.
            # Fetch the cross-entropy tensor in the same run() call instead.
            _, loss = sess.run([train_step, cross_entropy],
                               feed_dict={x: batch[0], y_: batch[1]})
            exp.log_metric("loss", loss)

        ### Finished Training ###

        # Compute test accuracy on the held-out split.
        acc = accuracy.eval(feed_dict={
            x: mnist.test.images,
            y_: mnist.test.labels
        })
        print('test accuracy %g' % acc)
self.hidden_size)) c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # Forward propagate RNN out, _ = self.lstm(x, (h0, c0)) # Decode hidden state of last time step out = self.fc(out[:, -1, :]) return out rnn = RNN(hyper_params['input_size'], hyper_params['hidden_size'], hyper_params['num_layers'], hyper_params['num_classes']) experiment.set_model_graph(str(rnn)) # Loss and Optimizer criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(rnn.parameters(), lr=hyper_params['learning_rate']) def train_index_to_example(index): tmp, _ = train_dataset[index] img = tmp.numpy()[0] data = experiment.log_image(img, name="train_%d.png" % index) return {"sample": str(index), "assetId": data["imageId"]} def test_index_to_example(index):
def train(rank, defparams, hyper):
    """Distributed WGAN-GP training loop for calorimeter-shower generation.

    Trains a generator (aG) against a critic (aD) with gradient penalty,
    optionally co-training an energy regressor (aE) and a post-processor
    network (aP). Metrics and example figures are logged to comet.ml and
    checkpoints are written by rank 0.

    Args:
        rank: process rank; NOTE it is immediately overwritten from
            SLURM_PROCID below, so the passed-in value is unused.
        defparams (dict): run configuration (paths, seeds, flags, ...).
        hyper (dict): hyper-parameters (learning rates, batch size, ...).
    """
    # Defensive copies of both configuration dicts.
    params = {}
    for param in defparams.keys():
        params[param] = defparams[param]

    hyperp = {}
    for hp in hyper.keys():
        hyperp[hp] = hyper[hp]

    # NOTE(review): hard-coded API key checked into source.
    experiment = Experiment(api_key="keGmeIz4GfKlQZlOP6cit4QOi",
                            project_name="hadron-shower",
                            workspace="engineren")
    experiment.add_tag(params['exp'])
    experiment.log_parameters(hyperp)

    device = torch.device("cuda")
    torch.manual_seed(params["seed"])

    # Topology comes from the SLURM environment; the `rank` argument is
    # superseded here.
    world_size = int(os.environ["SLURM_NNODES"])
    rank = int(os.environ["SLURM_PROCID"])

    dist.init_process_group(backend='nccl',
                            world_size=world_size,
                            rank=rank,
                            init_method=params["DDP_init_file"])

    # Networks: critic, generator, energy regressor, post-processor.
    aD = DCGAN_D(hyperp["ndf"]).to(device)
    aG = DCGAN_G(hyperp["ngf"], hyperp["z"]).to(device)
    aE = energyRegressor().to(device)
    aP = PostProcess_Size1Conv_EcondV2(48, 13, 3, 128, bias=True,
                                       out_funct='none').to(device)

    optimizer_g = torch.optim.Adam(aG.parameters(), lr=hyperp["L_gen"],
                                   betas=(0.5, 0.9))
    optimizer_d = torch.optim.Adam(aD.parameters(), lr=hyperp["L_crit"],
                                   betas=(0.5, 0.9))
    optimizer_e = torch.optim.SGD(aE.parameters(), lr=hyperp["L_calib"])
    optimizer_p = torch.optim.Adam(aP.parameters(), lr=hyperp["L_post"],
                                   betas=(0.5, 0.9))

    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    # Initialize Amp (mixed precision); only G and D are wrapped.
    models, optimizers = amp.initialize([aG, aD],
                                        [optimizer_g, optimizer_d],
                                        opt_level="O1",
                                        num_losses=2)

    #aD = nn.DataParallel(aD)
    #aG = nn.DataParallel(aG)
    #aE = nn.DataParallel(aE)

    aG, aD = models
    optimizer_g, optimizer_d = optimizers

    aG = nn.parallel.DistributedDataParallel(aG, device_ids=[0])
    aD = nn.parallel.DistributedDataParallel(aD, device_ids=[0])
    aE = nn.parallel.DistributedDataParallel(aE, device_ids=[0])
    aP = nn.parallel.DistributedDataParallel(aP, device_ids=[0])

    experiment.set_model_graph(str(aG), overwrite=False)
    experiment.set_model_graph(str(aD), overwrite=False)

    # Optionally restore the post-processor weights.
    if params["restore_pp"]:
        aP.load_state_dict(
            torch.load(params["restore_path_PP"] + params["post_saved"],
                       map_location=torch.device(device)))

    # Restore G/D + optimizers from a checkpoint, or start fresh.
    if params["restore"]:
        checkpoint = torch.load(params["restore_path"])
        aG.load_state_dict(checkpoint['Generator'])
        aD.load_state_dict(checkpoint['Critic'])
        optimizer_g.load_state_dict(checkpoint['G_optimizer'])
        optimizer_d.load_state_dict(checkpoint['D_optimizer'])
        itr = checkpoint['iteration']
    else:
        aG.apply(weights_init)
        aD.apply(weights_init)
        itr = 0

    # Energy regressor: fresh init (c0) or load pre-trained weights (c1).
    if params["c0"]:
        aE.apply(weights_init)
    elif params["c1"]:
        aE.load_state_dict(
            torch.load(params["calib_saved"],
                       map_location=torch.device(device)))

    one = torch.tensor(1.0).to(device)
    mone = (one * -1).to(device)

    print('loading data...')
    paths_list = [
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part1.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part2.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part3.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part4.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part5.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part6.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part7.hdf5'
    ]
    train_data = PionsDataset(paths_list, core=True)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_data, num_replicas=world_size, rank=rank)

    dataloader = DataLoader(train_data,
                            batch_size=hyperp["batch_size"],
                            num_workers=0,
                            shuffle=False,
                            drop_last=True,
                            pin_memory=True,
                            sampler=train_sampler)
    print('done')

    #scheduler_g = optim.lr_scheduler.StepLR(optimizer_g, step_size=1, gamma=params["gamma_g"])
    #scheduler_d = optim.lr_scheduler.StepLR(optimizer_d, step_size=1, gamma=params["gamma_crit"])
    #scheduler_e = optim.lr_scheduler.StepLR(optimizer_e, step_size=1, gamma=params["gamma_calib"])
    #writer = SummaryWriter()

    e_criterion = nn.L1Loss()  # for energy regressor training

    dataiter = iter(dataloader)

    BATCH_SIZE = hyperp["batch_size"]
    LATENT = hyperp["z"]
    EXP = params["exp"]
    KAPPA = hyperp["kappa"]
    LAMBD = hyperp["lambda"]
    ## Post-Processing
    LDP = hyperp["LDP"]
    wMMD = hyperp["wMMD"]
    wMSE = hyperp["wMSE"]

    ## IO paths
    OUTP = params['output_path']

    for iteration in range(50000):
        # Shift the reported iteration by the restored counter; the loop
        # itself still runs 50000 times.
        iteration += itr + 1

        #---------------------TRAIN D------------------------
        for p in aD.parameters():  # reset requires_grad
            p.requires_grad_(True)  # they are set to False below in training G

        for e in aE.parameters():  # reset requires_grad (constrainer)
            e.requires_grad_(True)  # they are set to False below in training G

        for i in range(hyperp["ncrit"]):
            aD.zero_grad()
            aE.zero_grad()

            noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))
            noise = torch.from_numpy(noise).float()
            noise = noise.view(
                -1, LATENT, 1, 1,
                1)  #[BS, nz] --> [Bs,nz,1,1,1] Needed for Generator
            noise = noise.to(device)

            # BUG FIX: `dataiter.next()` is the Python-2 spelling and raises
            # AttributeError on Python 3 — use the next() builtin, as the
            # preceding line already does.
            batch = next(dataiter, None)
            if batch is None:
                dataiter = iter(dataloader)
                batch = next(dataiter)

            real_label = batch['energy']  ## energy label
            real_label = real_label.to(device)

            with torch.no_grad():
                noisev = noise  # totally freeze G, training D
                fake_data = aG(noisev, real_label).detach()

            real_data = batch['shower']  # 48x48x48 calo image
            real_data = real_data.to(device)
            real_data.requires_grad_(True)

            #### supervised-training for energy regressor!
            if params["train_calib"]:
                output = aE(real_data.float())
                e_loss = e_criterion(output, real_label.view(BATCH_SIZE, 1))
                e_loss.backward()
                optimizer_e.step()
            ######

            # train with real data
            disc_real = aD(real_data.float(), real_label.float())

            # train with fake data
            fake_data = fake_data.unsqueeze(
                1)  ## transform to [BS, 1, 48, 48, 48]
            disc_fake = aD(fake_data, real_label.float())

            # train with interpolated data
            gradient_penalty = calc_gradient_penalty(aD,
                                                     real_data.float(),
                                                     fake_data,
                                                     real_label,
                                                     BATCH_SIZE,
                                                     device,
                                                     DIM=13)

            ## wasserstein-1 distace
            w_dist = torch.mean(disc_fake) - torch.mean(disc_real)
            # final disc cost
            disc_cost = torch.mean(disc_fake) - torch.mean(
                disc_real) + LAMBD * gradient_penalty

            with amp.scale_loss(disc_cost, optimizer_d) as scaled_loss:
                scaled_loss.backward()
            optimizer_d.step()

            #--------------Log to COMET ML ----------
            # Only log on the last critic sub-step of this iteration.
            if i == hyperp["ncrit"] - 1:
                experiment.log_metric("L_crit", disc_cost, step=iteration)
                experiment.log_metric("gradient_pen",
                                      gradient_penalty,
                                      step=iteration)
                experiment.log_metric("Wasserstein Dist",
                                      w_dist,
                                      step=iteration)
                if params["train_calib"]:
                    experiment.log_metric("L_const", e_loss, step=iteration)

        #---------------------TRAIN G------------------------
        for p in aD.parameters():
            p.requires_grad_(False)  # freeze D

        for c in aE.parameters():
            c.requires_grad_(False)  # freeze C

        gen_cost = None
        for i in range(hyperp["ngen"]):
            aG.zero_grad()

            noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))
            noise = torch.from_numpy(noise).float()
            noise = noise.view(
                -1, LATENT, 1, 1,
                1)  #[BS, nz] --> [Bs,nz,1,1,1] Needed for Generator
            noise = noise.to(device)

            batch = next(dataiter, None)
            if batch is None:
                dataiter = iter(dataloader)
                # BUG FIX: next() builtin instead of Py2-only .next().
                batch = next(dataiter)

            real_label = batch['energy']  ## energy label
            real_label = real_label.to(device)

            noise.requires_grad_(True)

            real_data = batch['shower']  # 48x48x48 calo image
            real_data = real_data.to(device)

            fake_data = aG(noise, real_label.float())
            fake_data = fake_data.unsqueeze(
                1)  ## transform to [BS, 1, 48, 48, 48]

            ## calculate loss function
            gen_cost = aD(fake_data.float(), real_label.float())

            ## label conditioning
            #output_g = aE(fake_data)
            #output_r = aE(real_data.float())
            output_g = 0.0  #for now
            output_r = 0.0  #for now

            aux_fake = (output_g - real_label)**2
            aux_real = (output_r - real_label)**2

            aux_errG = torch.abs(aux_fake - aux_real)

            ## Total loss function for generator
            g_cost = -torch.mean(gen_cost) + KAPPA * torch.mean(aux_errG)

            with amp.scale_loss(g_cost, optimizer_g) as scaled_loss_G:
                scaled_loss_G.backward()
            optimizer_g.step()

        #--------------Log to COMET ML ----------
        experiment.log_metric("L_Gen", g_cost, step=iteration)

        ## plot example image
        if iteration % 100 == 0.0 or iteration == 1:
            image = fake_data.view(-1, 48, 13, 13).cpu().detach().numpy()
            cmap = mpl.cm.viridis
            cmap.set_bad('white', 1.)

            # x-y projection (sum over layers).
            figExIm = plt.figure(figsize=(6, 6))
            axExIm1 = figExIm.add_subplot(1, 1, 1)
            image1 = np.sum(image[0], axis=0)
            masked_array1 = np.ma.array(image1, mask=(image1 == 0.0))
            im1 = axExIm1.imshow(masked_array1,
                                 filternorm=False,
                                 interpolation='none',
                                 cmap=cmap,
                                 vmin=0.01,
                                 vmax=100,
                                 norm=mpl.colors.LogNorm(),
                                 origin='lower')
            figExIm.patch.set_facecolor('white')
            axExIm1.set_xlabel('y [cells]', family='serif')
            axExIm1.set_ylabel('x [cells]', family='serif')
            figExIm.colorbar(im1)
            experiment.log_figure(figure=plt, figure_name="x-y")

            # y-z projection.
            figExIm = plt.figure(figsize=(6, 6))
            axExIm2 = figExIm.add_subplot(1, 1, 1)
            image2 = np.sum(image[0], axis=1)
            masked_array2 = np.ma.array(image2, mask=(image2 == 0.0))
            im2 = axExIm2.imshow(masked_array2,
                                 filternorm=False,
                                 interpolation='none',
                                 cmap=cmap,
                                 vmin=0.01,
                                 vmax=100,
                                 norm=mpl.colors.LogNorm(),
                                 origin='lower')
            figExIm.patch.set_facecolor('white')
            axExIm2.set_xlabel('y [cells]', family='serif')
            axExIm2.set_ylabel('z [layers]', family='serif')
            figExIm.colorbar(im2)
            experiment.log_figure(figure=plt, figure_name="y-z")

            # x-z projection.
            figExIm = plt.figure(figsize=(6, 6))
            axExIm3 = figExIm.add_subplot(1, 1, 1)
            image3 = np.sum(image[0], axis=2)
            masked_array3 = np.ma.array(image3, mask=(image3 == 0.0))
            im3 = axExIm3.imshow(masked_array3,
                                 filternorm=False,
                                 interpolation='none',
                                 cmap=cmap,
                                 vmin=0.01,
                                 vmax=100,
                                 norm=mpl.colors.LogNorm(),
                                 origin='lower')
            figExIm.patch.set_facecolor('white')
            axExIm3.set_xlabel('x [cells]', family='serif')
            axExIm3.set_ylabel('z [layers]', family='serif')
            figExIm.colorbar(im3)
            #experiment.log_metric("L_aux", aux_errG, step=iteration)
            experiment.log_figure(figure=plt, figure_name="x-z")

            ## E-sum monitoring
            figEsum = plt.figure(figsize=(6, 6 * 0.77 / 0.67))
            axEsum = figEsum.add_subplot(1, 1, 1)
            etot_real = getTotE(real_data.cpu().detach().numpy(),
                                xbins=13,
                                ybins=13)
            etot_fake = getTotE(image, xbins=13, ybins=13)

            axEsumReal = axEsum.hist(etot_real,
                                     bins=25,
                                     range=[0, 1500],
                                     weights=np.ones_like(etot_real) /
                                     (float(len(etot_real))),
                                     label="orig",
                                     color='blue',
                                     histtype='stepfilled')
            axEsumFake = axEsum.hist(etot_fake,
                                     bins=25,
                                     range=[0, 1500],
                                     weights=np.ones_like(etot_fake) /
                                     (float(len(etot_fake))),
                                     label="generated",
                                     color='red',
                                     histtype='stepfilled')
            axEsum.text(0.25,
                        0.81,
                        "WGAN",
                        horizontalalignment='left',
                        verticalalignment='top',
                        transform=axEsum.transAxes,
                        color='red')
            axEsum.text(0.25,
                        0.87,
                        'GEANT 4',
                        horizontalalignment='left',
                        verticalalignment='top',
                        transform=axEsum.transAxes,
                        color='blue')
            experiment.log_figure(figure=plt, figure_name="E-sum")

        #end = timer()
        #print(f'---train G elapsed time: {end - start}')

        if params["train_postP"]:
            #---------------------TRAIN P------------------------
            for p in aD.parameters():
                p.requires_grad_(False)  # freeze D

            for c in aG.parameters():
                c.requires_grad_(False)  # freeze G

            lossP = None
            for i in range(1):
                noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))
                noise = torch.from_numpy(noise).float()
                noise = noise.view(
                    -1, LATENT, 1, 1,
                    1)  #[BS, nz] --> [Bs,nz,1,1,1] Needed for Generator
                noise = noise.to(device)

                batch = next(dataiter, None)
                if batch is None:
                    dataiter = iter(dataloader)
                    # BUG FIX: next() builtin instead of Py2-only .next().
                    batch = next(dataiter)

                real_label = batch['energy']  ## energy label
                real_label = real_label.to(device)

                noise.requires_grad_(True)

                real_data = batch['shower']  # calo image
                real_data = real_data.to(device)

                ## forward pass to generator
                fake_data = aG(noise, real_label.float())
                fake_data = fake_data.unsqueeze(
                    1)  ## transform to [BS, 1, layer, size, size]

                ### first LossD_P
                fake_dataP = aP(fake_data.float(), real_label.float())
                lossD_P = aD(fake_dataP.float(), real_label.float())
                lossD_P = lossD_P.mean()

                ## lossFixP
                real_sorted = real_data.view(BATCH_SIZE, -1)
                fake_sorted = fake_dataP.view(BATCH_SIZE, -1)

                real_sorted, _ = torch.sort(real_sorted,
                                            dim=1,
                                            descending=True)  #.view(900,1)
                fake_sorted, _ = torch.sort(fake_sorted,
                                            dim=1,
                                            descending=True)  #.view(900,1)

                lossFixPp1 = mmd_hit_sortKernel(real_sorted.float(),
                                                fake_sorted,
                                                kernel_size=100,
                                                stride=50,
                                                cutoff=2000,
                                                alpha=200)

                lossFixPp2 = F.mse_loss(fake_dataP.view(BATCH_SIZE, -1),
                                        fake_data.detach().view(
                                            BATCH_SIZE, -1),
                                        reduction='mean')

                lossFixP = wMMD * lossFixPp1 + wMSE * lossFixPp2

                lossP = LDP * lossD_P - lossFixP

                lossP.backward(mone)
                optimizer_p.step()

        # Periodic console report + checkpointing (rank 0 only writes).
        if iteration % 100 == 0 or iteration == 1:
            print('iteration: {}, critic loss: {}'.format(
                iteration, disc_cost.cpu().data.numpy()))
            if rank == 0:
                torch.save(
                    {
                        'Generator': aG.state_dict(),
                        'Critic': aD.state_dict(),
                        'G_optimizer': optimizer_g.state_dict(),
                        'D_optimizer': optimizer_d.state_dict(),
                        'iteration': iteration
                    },
                    OUTP + '{0}/wgan_itrs_{1}.pth'.format(EXP, iteration))
                if params["train_calib"]:
                    torch.save(
                        aE.state_dict(),
                        OUTP + '/{0}/netE_itrs_{1}.pth'.format(EXP, iteration))
                if params["train_postP"]:
                    torch.save(
                        aP.state_dict(),
                        OUTP + '{0}/netP_itrs_{1}.pth'.format(EXP, iteration))
class CometMLMonitor(MonitorBase):
    """
    Send scalar data and the graph to https://www.comet.ml.

    Note:
        1. comet_ml requires you to `import comet_ml` before importing tensorflow or tensorpack.
        2. The "automatic output logging" feature of comet_ml will make the training progress bar appear to freeze.
           Therefore the feature is disabled by default.
    """
    def __init__(self, experiment=None, tags=None, **kwargs):
        """
        Args:
            experiment (comet_ml.Experiment): if provided, invalidate all other arguments
            tags (list[str]): experiment tags
            kwargs: arguments used to initialize :class:`comet_ml.Experiment`,
                such as project name, API key, etc.
                Refer to its documentation for details.
        """
        if experiment is not None:
            # Caller supplied a ready-made Experiment; no other arguments
            # may be combined with it.
            self._exp = experiment
            assert tags is None and len(kwargs) == 0
        else:
            # Lazy import so merely importing this module does not pull in
            # comet_ml (see the import-order note in the class docstring).
            from comet_ml import Experiment
            kwargs.setdefault(
                'log_code', True
            )  # though it's not functioning, git patch logging requires it
            # None disables comet's stdout capture, which otherwise freezes
            # the progress bar (see class docstring).
            kwargs.setdefault('auto_output_logging', None)
            self._exp = Experiment(**kwargs)
            if tags is not None:
                self._exp.add_tags(tags)

        self._exp.set_code("Code logging is impossible ...")
        self._exp.log_dependency('tensorpack', __git_version__)

    @property
    def experiment(self):
        """
        The :class:`comet_ml.Experiment` instance.
        """
        return self._exp

    def _before_train(self):
        # Upload the (default) TF graph once before training starts.
        self._exp.set_model_graph(tf.get_default_graph())

    @HIDE_DOC
    def process_scalar(self, name, val):
        self._exp.log_metric(name, val, step=self.global_step)

    @HIDE_DOC
    def process_image(self, name, val):
        self._exp.set_step(self.global_step)
        for idx, v in enumerate(val):
            # Suffix the image name with its index only when the summary
            # carries more than one image.
            log_name = "{}_step{}{}".format(
                name, self.global_step, "_" + str(idx) if len(val) > 1 else "")
            self._exp.log_image(v,
                                image_format="jpeg",
                                name=log_name,
                                image_minmax=(0, 255))

    def _after_train(self):
        # Mark the comet experiment as finished.
        self._exp.end()

    def _after_epoch(self):
        self._exp.log_epoch_end(self.epoch_num)
import model

# Training schedule state; steps_per_epoch / total_images_looked_at are
# presumably advanced later in the script — confirm downstream usage.
epochs = 100000
steps_per_epoch = 0
total_images_looked_at = 0
d_steps = 1
g_steps = 1

# Build the graph once; unpack train and stabilize phase ops for D and G.
graph, iterator, d_train_optimizer_ops, d_stabilize_optimizer_ops, g_train_optimizer_ops, g_stabilize_optimizer_ops, samples_for_all_resolutions, sizes = model.get_graph()

experiment = Experiment(api_key='<API_KEY>',
                        project_name='art_pgan',
                        workspace='schmidtdominik',
                        log_code=False)
experiment.log_parameters({
    'G_learning_rate': model.g_learning_rate,
    'D_learning_rate': model.d_learning_rate,
    'D_steps': d_steps,
    'G_steps': g_steps,
    'batch_size': model.batch_size
})
experiment.set_model_graph(graph)

# FIX: the original chained five open(...).read() calls without ever closing
# the file handles; read each source file inside a context manager instead.
# The concatenated string sent to set_code is byte-identical to before.
_code_parts = []
for _src in ('train.py', 'image_pipeline.py', 'model.py',
             'discriminator.py', 'generator.py'):
    with open(_src, 'r') as _fh:
        _code_parts.append('\n# [code]: {}\n'.format(_src) + _fh.read())
experiment.set_code(''.join(_code_parts))

# Create output directories, tolerating pre-existing ones.
try:
    os.mkdir('./checkpoints/')
except FileExistsError:
    pass
try:
    os.mkdir('./progress_images/')
except FileExistsError:
    pass

# Progressive-growing schedule state.
current_resolution = sizes[0]
current_mode = 'train'
last_schedule_update = 0
last_schedule_update_time = time.time()
schedule_finalized = False
model = model_class(**model_kwargs) if args.load: s1 = torch.load(args.load, map_location=torch.device('cpu')) s2 = {k.replace("module.", ""): v for k, v in s1.items()} model.load_state_dict(s2) if multi_gpu: model = torch_geometric.nn.DataParallel(model) model.to(device) model_fname = get_model_fname(args.dataset, model, args.n_train, args.lr, args.target) # need your api key in a .comet.config file: see https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables experiment = Experiment(project_name="particleflow", disabled=args.disable_comet) experiment.set_model_graph(repr(model)) experiment.log_parameters(dict(model_kwargs, **{'model': args.model, 'lr':args.lr, 'model_fname': model_fname, 'l1': args.l1, 'l2':args.l2, 'n_train':args.n_train, 'target':args.target, 'optimizer': args.optimizer})) outpath = osp.join(args.outpath, model_fname) if osp.isdir(outpath): if args.overwrite: print("model output {} already exists, deleting it".format(outpath)) import shutil shutil.rmtree(outpath) else: print("model output {} already exists, please delete it".format(outpath)) sys.exit(0) try: os.makedirs(outpath) except Exception as e:
def main() -> int:
    """Entry point: parse --mode, load config.yml, then train/test/demo a DQN.

    Returns:
        int: process exit code (0 on success).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode',
        help='Select mode',
        choices=['train', 'test', 'demo'],
        default='train',
    )
    args = parser.parse_args()

    # FIX: close config.yml deterministically (was an unclosed open()).
    with open("config.yml") as config_file:
        config = yaml.safe_load(config_file)

    # FIX: both branches constructed an identical DQN; build it once and
    # load weights only when LOAD_MODEL names a checkpoint.
    model = DQN(
        in_channels=config['IN_CHANNELS'],
        out_dim=config['OUT_DIM'],
    )
    if config['LOAD_MODEL']:
        model.load_model(config['LOAD_MODEL'])

    if args.mode == 'test':
        test(
            device=config['DEVICE'],
            n_games=config['TEST_GAMES'],
            model=model,
            frame_skipping=config['FRAME_SKIPPING'],
        )
    elif args.mode == 'demo':
        demo(
            device=config['DEVICE'],
            model=model,
            frame_skipping=config['FRAME_SKIPPING'],
        )
    else:
        memory = ReplayMemory(capacity=config['N'])
        optimizer_name = config['OPTIMIZER']
        if optimizer_name == 'adam':
            optimizer = torch.optim.Adam(lr=config['LEARNING_RATE'],
                                         betas=(0.9, 0.999),
                                         eps=1e-8,
                                         amsgrad=False,
                                         params=model.model.parameters())
        elif optimizer_name == 'sgd':
            optimizer = torch.optim.SGD(lr=config['LEARNING_RATE'],
                                        momentum=0.9,
                                        params=model.model.parameters())
        else:
            raise ValueError(f'Unknown optimizer name: {optimizer_name}')

        experiment = Experiment(
            api_key=os.environ['COMET_ML_API_KEY'],
            project_name=config['COMET_ML_PROJECT_NAME'],
            workspace=config['COMET_ML_WORKSPACE'],
        )
        experiment.set_name(config['COMET_ML_NAME'])
        experiment.add_tag(config['COMET_ML_TAG'])
        experiment.log_parameters({
            'n_games': config['M'],
            'minibatch_size': config['MINIBATCH_SIZE'],
            'eps': config['EPS'],
            'eps_n_frames': config['EPS_N_FRAMES'],
            'gamma': config['GAMMA'],
            'frame_skipping': config['FRAME_SKIPPING'],
            'save_model_every': config['SAVE_MODEL_EVERY']
        })
        experiment.set_model_graph(str(model.model))

        train(
            device=config['DEVICE'],
            n_games=config['M'],
            memory=memory,
            optimizer=optimizer,
            model=model,
            experiment=experiment,
            minibatch_size=config['MINIBATCH_SIZE'],
            eps=config['EPS'],
            eps_n_frames=config['EPS_N_FRAMES'],
            gamma=config['GAMMA'],
            frame_skipping=config['FRAME_SKIPPING'],
            update_model_target_every=config['UPDATE_MODEL_TARGET_EVERY'],
            save_model_every=config['SAVE_MODEL_EVERY'],
            save_model_as=config['SAVE_MODEL_AS'],
            save_average_metrics_every=config['SAVE_AVERAGE_METRICS_EVERY'],
        )

    # FIX: the signature promises int but the function returned None;
    # backward-compatible since sys.exit(None) and sys.exit(0) both exit 0.
    return 0
class ModelTrainer:
    """Supervised-classification trainer with comet.ml logging.

    Wraps a torch model plus a dict-style dataloader ({'train','val','test'}),
    drives epochs, validation, checkpointing and metric/figure logging.
    """

    def __init__(self, model, dataloader, args):
        """
        Args:
            model: torch module to train (must expose .name for log messages).
            dataloader: dict of loaders keyed 'train'/'val'/'test', or None.
            args: parsed CLI namespace (optimizer/scheduler/comet settings).
        """
        self.model = model
        self.args = args
        self.data = dataloader
        self.metric = args.metric  # key into val() results used for "best"

        if (dataloader is not None):
            # Log frequency in batches: args.frq_log logs per epoch.
            self.frq_log = len(dataloader['train']) // args.frq_log

        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        model.to(self.device)

        if args.optimizer == 'sgd':
            self.optimizer = optim.SGD(model.parameters(),
                                       lr=args.lr,
                                       momentum=args.momentum,
                                       weight_decay=args.weight_decay)
        elif args.optimizer == 'adam':
            self.optimizer = optim.Adam(model.parameters(),
                                        lr=args.lr,
                                        betas=(args.beta1, 0.999),
                                        weight_decay=args.weight_decay)
        else:
            raise Exception('--optimizer should be one of {sgd, adam}')

        if args.scheduler == 'set':
            # Exponentially increasing LR schedule — used with the LR-range
            # plot emitted in train()'s finally block.
            self.scheduler = optim.lr_scheduler.LambdaLR(
                self.optimizer,
                lambda epoch: 10**(epoch / args.scheduler_factor))
        elif args.scheduler == 'auto':
            self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer,
                mode='min',
                factor=args.scheduler_factor,
                patience=5,
                verbose=True,
                threshold=0.0001,
                threshold_mode='rel',
                cooldown=0,
                min_lr=0,
                eps=1e-08)

        self.experiment = Experiment(api_key=args.comet_key,
                                     project_name=args.comet_project,
                                     workspace=args.comet_workspace,
                                     auto_weight_logging=True,
                                     auto_metric_logging=False,
                                     auto_param_logging=False)
        self.experiment.set_name(args.name)
        self.experiment.log_parameters(vars(args))
        self.experiment.set_model_graph(str(self.model))

    def train_one_epoch(self, epoch):
        """Run one training epoch; returns {'loss': ..., 'acc': ...}."""
        self.model.train()
        train_loader = self.data['train']
        train_loss = 0
        correct = 0

        # Global comet step at which this epoch's batches start.
        comet_offset = epoch * len(train_loader)

        for batch_idx, (data, target) in tqdm(enumerate(train_loader),
                                              leave=True,
                                              total=len(train_loader)):
            data, target = data.to(self.device), target.to(self.device)

            self.optimizer.zero_grad()
            output = self.model(data)
            # Sum reduction: accumulated then divided by dataset size below.
            loss = F.cross_entropy(output, target, reduction='sum')
            loss.backward()
            self.optimizer.step()

            pred = output.argmax(dim=1, keepdim=True)
            acc = pred.eq(target.view_as(pred)).sum().item()

            train_loss += loss.item()
            correct += acc

            # Per-batch averages for logging.
            loss = loss.item() / len(data)
            acc = 100. * acc / len(data)

            comet_step = comet_offset + batch_idx
            self.experiment.log_metric('batch_loss', loss, comet_step, epoch)
            self.experiment.log_metric('batch_acc', acc, comet_step, epoch)

            if (batch_idx + 1) % self.frq_log == 0:
                self.experiment.log_metric('log_loss', loss, comet_step,
                                           epoch)
                self.experiment.log_metric('log_acc', acc, comet_step, epoch)
                print('Epoch: {} [{}/{}]\tLoss: {:.6f}\tAcc: {:.2f}%'.format(
                    epoch + 1, (batch_idx + 1) * len(data),
                    len(train_loader.dataset), loss, acc))

        # Epoch-level averages over the whole training set.
        train_loss /= len(train_loader.dataset)
        acc = 100. * correct / len(train_loader.dataset)

        comet_step = comet_offset + len(train_loader) - 1
        self.experiment.log_metric('loss', train_loss, comet_step, epoch)
        self.experiment.log_metric('acc', acc, comet_step, epoch)

        print(
            'Epoch: {} [Done]\tLoss: {:.4f}\tAccuracy: {}/{} ({:.2f}%)'.format(
                epoch + 1, train_loss, correct, len(train_loader.dataset),
                acc))

        return {'loss': train_loss, 'acc': acc}

    def train(self):
        """Full training loop: epochs + validation, best-metric checkpointing.

        The finally block always uploads the weights folder and, for the
        'set' scheduler, an LR-vs-loss figure.
        """
        self.log_cmd()
        best = -1
        history = {'lr': [], 'train_loss': []}

        try:
            print(">> Training %s" % self.model.name)
            for epoch in range(self.args.nepoch):
                with self.experiment.train():
                    train_res = self.train_one_epoch(epoch)

                with self.experiment.validate():
                    print("\nvalidation...")
                    comet_offset = (epoch + 1) * len(self.data['train']) - 1
                    res = self.val(self.data['val'], comet_offset, epoch)

                # Save a checkpoint whenever the monitored metric improves.
                if res[self.metric] > best:
                    best = res[self.metric]
                    self.save_weights(epoch)

                if self.args.scheduler == 'set':
                    lr = self.optimizer.param_groups[0]['lr']
                    history['lr'].append(lr)
                    history['train_loss'].append(train_res['loss'])
                    self.scheduler.step(epoch + 1)
                    lr = self.optimizer.param_groups[0]['lr']
                    print('learning rate changed to: %.10f' % lr)
                elif self.args.scheduler == 'auto':
                    self.scheduler.step(train_res['loss'])
        finally:
            print(">> Training model %s. [Stopped]" % self.model.name)
            self.experiment.log_asset_folder(os.path.join(
                self.args.outf, self.args.name, 'weights'),
                                             step=None,
                                             log_file_name=False,
                                             recursive=False)
            if self.args.scheduler == 'set':
                # LR range test plot: loss as a function of (log) LR.
                plt.semilogx(history['lr'], history['train_loss'])
                plt.grid(True)
                self.experiment.log_figure(figure=plt)
                plt.show()

    def val(self, val_loader, comet_offset=-1, epoch=-1):
        """Evaluate on `val_loader`; logs metrics + confusion matrix.

        Returns:
            dict: {'loss': average loss, 'acc': accuracy percentage}.
        """
        self.model.eval()
        test_loss = 0
        correct = 0

        labels = list(range(self.args.nclass))
        cm = np.zeros((len(labels), len(labels)))

        with torch.no_grad():
            for data, target in tqdm(val_loader,
                                     leave=True,
                                     total=len(val_loader)):
                data, target = data.to(self.device), target.to(self.device)
                output = self.model(data)
                test_loss += F.cross_entropy(output, target,
                                             reduction='sum').item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

                # Accumulate the confusion matrix batch by batch.
                pred = pred.view_as(target).data.cpu().numpy()
                target = target.data.cpu().numpy()
                cm += confusion_matrix(target, pred, labels=labels)

        test_loss /= len(val_loader.dataset)
        accuracy = 100. * correct / len(val_loader.dataset)

        print('Evaluation: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.
              format(test_loss, correct, len(val_loader.dataset), accuracy))

        res = {'loss': test_loss, 'acc': accuracy}
        self.experiment.log_metrics(res, step=comet_offset, epoch=epoch)

        self.experiment.log_confusion_matrix(
            matrix=cm,
            labels=[ClassDict.getName(x) for x in labels],
            title='confusion matrix after epoch %03d' % epoch,
            file_name="confusion_matrix_%03d.json" % epoch)
        return res

    def test(self):
        """Load the best weights and evaluate on the test split."""
        self.load_weights()
        with self.experiment.test():
            print('\ntesting....')
            # NOTE(review): `res` is unused; val() already logs the metrics.
            res = self.val(self.data['test'])

    def log_cmd(self):
        """Reconstruct the launching CLI command and log it to comet."""
        d = vars(self.args)
        cmd = '!python main.py \\\n'
        # NOTE(review): literal collapsed in transit — original indent
        # string may have been multiple spaces; confirm against upstream.
        tab = ' '
        for k, v in d.items():
            # Skip unset/empty/false flags entirely.
            if v is None or v == '' or (isinstance(v, bool) and v is False):
                continue
            if isinstance(v, bool):
                arg = '--{} \\\n'.format(k)
            else:
                arg = '--{} {} \\\n'.format(k, v)
            cmd = cmd + tab + arg
        # print(cmd);
        self.experiment.log_text(cmd)

    def save_weights(self, epoch: int):
        """Write {'epoch', 'state_dict'} to <outf>/<name>/weights/model.pth."""
        weight_dir = os.path.join(self.args.outf, self.args.name, 'weights')
        if not os.path.exists(weight_dir):
            os.makedirs(weight_dir)
        torch.save({
            'epoch': epoch,
            'state_dict': self.model.state_dict()
        }, os.path.join(weight_dir, 'model.pth'))

    def load_weights(self):
        """Load model weights from --weights_path or the default location."""
        path_g = self.args.weights_path
        if path_g is None:
            weight_dir = os.path.join(self.args.outf, self.args.name,
                                      'weights')
            path_g = os.path.join(weight_dir, 'model.pth')

        print('>> Loading weights...')
        weights_g = torch.load(path_g, map_location=self.device)['state_dict']
        self.model.load_state_dict(weights_g)
        print(' Done.')

    def predict(self, x):
        """Run inference on one raw sample (numpy array).

        Scales 16-bit integer input to [-1, 1], applies self.transform
        (assumed set elsewhere — TODO confirm), and returns softmax
        probabilities as a numpy array.
        """
        x = x / 2**15
        self.model.eval()
        with torch.no_grad():
            x = torch.from_numpy(x).float()
            x = self.transform(x)
            x = x.unsqueeze(0)
            x = self.model(x)
            x = F.softmax(x, dim=1)
            x = x.numpy()
        return x
# Post-setup reporting steps; relies on args/trainer/model/experiment
# defined earlier in the (unseen) surrounding script.
if args.resume:
    # Resume from a snapshot
    chainer.serializers.load_npz(args.resume, trainer)

# Get confusion matrix picture before training:
log_confusion_matrix(experiment, model, trainer, 0, 0)

# Run the training
trainer.run()

# Report created images to comet.ml:
## If you want to include a graph made by chainer, you can:
#if args.plot and extensions.PlotReport.available():
#    experiment.log_image('result/loss.png')
#    experiment.log_image('result/accuracy.png')

# Report the graph, as dot language:
(graph, ) = pydot.graph_from_dot_file('result/cg.dot')
graph.write_png('result/cg.png')
experiment.log_image('result/cg.png')
with open("result/cg.dot") as fp:
    # NOTE(review): readlines() keeps trailing '\n's, so joining with '\n'
    # double-spaces the uploaded graph text — confirm whether intended.
    desc = fp.readlines()
    experiment.set_model_graph("\n".join(desc))

# Report a URL:
experiment.log_html_url(
    "https://github.com/chainer/chainer/"
    "blob/master/examples/mnist/train_mnist.py",
    label="This MNIST example is based on")
def main(cfg: DictConfig):
    """Train, evaluate and produce submissions for one experiment config.

    Fits a Lightning model, uploads the best checkpoint and metrics to
    comet.ml, then writes test and out-of-fold submission files.
    """
    # Hydra switches the working directory per run; return to the project root.
    os.chdir(hydra.utils.get_original_cwd())
    seed_everything(cfg.train.seed)

    # Network and image preprocessing.
    backbone = ENet(model_name=cfg.train.model_name)
    preprocess = ImageTransform(img_size=cfg.data.img_size)

    # comet.ml experiment: record every config group plus the model topology.
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
                            project_name=cfg.comet_ml.project_name)
    for section in (cfg.exp, cfg.data, cfg.train):
        experiment.log_parameters(dict(section))
    experiment.set_model_graph(str(backbone))

    # Lightning wiring (data_dir / cv presumably come from module scope —
    # they are not defined in this function).
    model = LightningSystem(backbone, cfg, experiment)
    datamodule = DataModule(data_dir, cfg, preprocess, cv)
    ckpt_cb = ModelCheckpoint(filepath='./checkpoint',
                              save_top_k=1,
                              verbose=True,
                              monitor='avg_val_loss',
                              mode='min',
                              prefix=cfg.exp.exp_name + '_')
    trainer = Trainer(logger=False,
                      max_epochs=cfg.train.epoch,
                      checkpoint_callback=ckpt_cb,
                      gpus=1)

    # --- Train ---
    trainer.fit(model, datamodule=datamodule)
    experiment.log_metric('best_auc', model.best_auc)
    best_ckpt = glob.glob(f'./checkpoint/{cfg.exp.exp_name}_*.ckpt')[0]
    experiment.log_asset(file_data=best_ckpt)

    # --- Test (repeated passes; test_num presumably from module scope) ---
    for _ in range(test_num):
        trainer.test(model)
    _ = summarize_submit(glob.glob(f'submission_{cfg.exp.exp_name}*.csv'),
                         experiment,
                         filename=f'sub_{cfg.exp.exp_name}.csv')

    # --- Out-of-fold predictions ---
    oof_loader = DataLoader(datamodule.oof_dataset,
                            batch_size=cfg.train.batch_size,
                            pin_memory=False,
                            shuffle=False,
                            drop_last=False)
    for _ in range(10):
        trainer.test(model, test_dataloaders=oof_loader)
    _ = summarize_submit(glob.glob('submission*.csv'),
                         experiment,
                         filename=f'oof_{cfg.exp.exp_name}.csv')
# Gradient updates updates = minimizer.apply_gradients(clipped_grads_and_vars) def one_hot(v): return np.eye(vocab_size)[v] # begin training sess = tf.Session() init = tf.global_variables_initializer() sess.run(init) #log model graph exp.set_model_graph(sess.graph) # Initial values MAXITERS = 500000 n, p = 0, 0 hprev_val = np.zeros([1, hidden_size]) while (n < MAXITERS): # Initialize if p + seq_length + 1 >= len(data) or n == 0: hprev_val = np.zeros([1, hidden_size]) p = 0 # reset # Prepare inputs input_vals = [char_to_ix[ch] for ch in data[p:p + seq_length]] target_vals = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]
def main(_):
    """Train the RPN3D detector, logging to comet.ml and TF summaries.

    Restores from the latest checkpoint in ``save_model_dir`` when one
    exists, otherwise starts from fresh parameters.  Every 10 epochs the
    validation split is predicted, dumped to disk, and scored with the
    external KITTI evaluation script.
    """
    experiment = Experiment(api_key="xXtJguCo8yFdU7dpjEpo6YbHw",
                            project_name=args.experiment_name)

    hyper_params = {
        "learning_rate": args.lr,
        "num_epochs": args.max_epoch,
        "batch_size": args.single_batch_size,
        "alpha": args.alpha,
        "beta": args.beta,
        "gamma": args.gamma,
        "loss": args.loss
    }
    # NOTE(review): log_multiple_params/log_multiple_metrics are legacy
    # comet_ml SDK calls; newer SDKs name them log_parameters/log_metrics.
    experiment.log_multiple_params(hyper_params)

    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={
                "GPU": cfg.GPU_USE_COUNT,
            },
            allow_soft_placement=True,
            log_device_placement=False,
        )

        with tf.Session(config=config) as sess:
            # sess=tf_debug.LocalCLIDebugWrapperSession(sess,ui_type='readline')
            model = RPN3D(cls=cfg.DETECT_OBJ,
                          single_batch_size=args.single_batch_size,
                          learning_rate=args.lr,
                          max_gradient_norm=5.0,
                          alpha=args.alpha,
                          beta=args.beta,
                          gamma=args.gamma,
                          loss_type=args.loss,
                          avail_gpus=cfg.GPU_AVAILABLE.split(','))

            # param init/restore
            if tf.train.get_checkpoint_state(save_model_dir):
                print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(
                    sess, tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            # train and validate
            is_summary = False
            summary_interval = 5
            summary_val_interval = 10
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            experiment.set_model_graph(sess.graph)

            # Hoisted out of the batch loop: loop-invariant.
            epochs = args.max_epoch

            # training
            with experiment.train():
                for epoch in range(start_epoch, args.max_epoch):
                    counter = 0
                    batch_time = time.time()
                    experiment.log_current_epoch(epoch)
                    for batch in iterate_data(
                            train_dir,
                            shuffle=True,
                            aug=True,
                            is_testset=False,
                            batch_size=args.single_batch_size *
                            cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT):

                        counter += 1
                        global_counter += 1
                        experiment.set_step(global_counter)

                        # Request a TF summary every summary_interval steps.
                        is_summary = counter % summary_interval == 0

                        start_time = time.time()
                        ret = model.train_step(sess,
                                               batch,
                                               train=True,
                                               summary=is_summary)
                        forward_time = time.time() - start_time
                        batch_time = time.time() - batch_time

                        params = {
                            "loss": ret[0],
                            "cls_loss": ret[1],
                            "cls_pos_loss": ret[2],
                            "cls_neg_loss": ret[3]
                        }
                        experiment.log_multiple_metrics(params)

                        print(
                            'train: {} @ epoch:{}/{} loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'
                            .format(counter, epoch, epochs, ret[0], ret[1],
                                    ret[2], ret[3], forward_time, batch_time))

                        if is_summary:
                            print("summary_interval now")
                            summary_writer.add_summary(ret[-1], global_counter)

                        if counter % summary_val_interval == 0:
                            print("summary_val_interval now")
                            batch = sample_test_data(
                                val_dir,
                                args.single_batch_size * cfg.GPU_USE_COUNT,
                                multi_gpu_sum=cfg.GPU_USE_COUNT)

                            ret = model.validate_step(sess, batch,
                                                      summary=True)
                            summary_writer.add_summary(ret[-1], global_counter)

                            try:
                                ret = model.predict_step(sess,
                                                         batch,
                                                         summary=True)
                                summary_writer.add_summary(
                                    ret[-1], global_counter)
                            except Exception:
                                # Was a bare `except:`; narrowed so Ctrl-C and
                                # SystemExit are no longer swallowed here.
                                print("prediction skipped due to error")

                        # Cooperative pause: checkpoint and exit on request.
                        if check_if_should_pause(args.tag):
                            model.saver.save(sess,
                                             os.path.join(
                                                 save_model_dir, 'checkpoint'),
                                             global_step=model.global_step)
                            print('pause and save model @ {} steps:{}'.format(
                                save_model_dir, model.global_step.eval()))
                            sys.exit(0)

                        batch_time = time.time()

                    experiment.log_epoch_end(epoch)
                    sess.run(model.epoch_add_op)
                    model.saver.save(sess,
                                     os.path.join(save_model_dir,
                                                  'checkpoint'),
                                     global_step=model.global_step)

                    # dump test data every 10 epochs
                    if (epoch + 1) % 10 == 0:
                        # create output folder
                        os.makedirs(os.path.join(args.output_path,
                                                 str(epoch)),
                                    exist_ok=True)
                        os.makedirs(os.path.join(args.output_path, str(epoch),
                                                 'data'),
                                    exist_ok=True)
                        if args.vis:
                            os.makedirs(os.path.join(args.output_path,
                                                     str(epoch), 'vis'),
                                        exist_ok=True)

                        for batch in iterate_data(
                                val_dir,
                                shuffle=False,
                                aug=False,
                                is_testset=False,
                                batch_size=args.single_batch_size *
                                cfg.GPU_USE_COUNT,
                                multi_gpu_sum=cfg.GPU_USE_COUNT):

                            if args.vis:
                                tags, results, front_images, bird_views, heatmaps = model.predict_step(
                                    sess, batch, summary=False, vis=True)
                            else:
                                tags, results = model.predict_step(
                                    sess, batch, summary=False, vis=False)

                            # Write each sample's boxes in KITTI label format.
                            for tag, result in zip(tags, results):
                                of_path = os.path.join(args.output_path,
                                                       str(epoch), 'data',
                                                       tag + '.txt')
                                with open(of_path, 'w+') as f:
                                    labels = box3d_to_label(
                                        [result[:, 1:8]], [result[:, 0]],
                                        [result[:, -1]],
                                        coordinate='lidar')[0]
                                    for line in labels:
                                        f.write(line)
                                    print('write out {} objects to {}'.format(
                                        len(labels), tag))

                            # dump visualizations
                            if args.vis:
                                for tag, front_image, bird_view, heatmap in zip(
                                        tags, front_images, bird_views,
                                        heatmaps):
                                    front_img_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_front.jpg')
                                    bird_view_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_bv.jpg')
                                    heatmap_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_heatmap.jpg')
                                    cv2.imwrite(front_img_path, front_image)
                                    cv2.imwrite(bird_view_path, bird_view)
                                    cv2.imwrite(heatmap_path, heatmap)

                        # execute evaluation code
                        cmd_1 = "./kitti_eval/launch_test.sh"
                        cmd_2 = os.path.join(args.output_path, str(epoch))
                        cmd_3 = os.path.join(args.output_path, str(epoch),
                                             'log')
                        os.system(" ".join([cmd_1, cmd_2, cmd_3]))

                print('train done. total epoch:{} iter:{}'.format(
                    epoch, model.global_step.eval()))

            # finally save model
            model.saver.save(sess,
                             os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
def main():
    """Meta-train a MAML-style model on miniImageNet.

    Accumulates gradients over ``opt.apply_every`` sub-batches per outer
    step, periodically evaluates (MAML episode accuracy, logistic-regression
    feature accuracy, plain supervised accuracy) and checkpoints the best
    model.  Metrics go to both comet.ml and TensorBoard.

    Fix over the previous revision: the train-metrics dict logged
    ``ioa=tfoa`` (duplicating the outer accuracy); it now logs ``tioa``.
    """
    opt = parse_option()
    print(pp.pformat(vars(opt)))

    # NOTE(review): computed but unused below — the datasets hard-code
    # partition="train".  Kept to preserve existing behavior.
    train_partition = "trainval" if opt.use_trainval else "train"

    if opt.dataset == "miniImageNet":
        train_trans, test_trans = transforms_options[opt.transform]
        # Decide which of the support/query streams gets augmentation.
        if opt.augment == "none":
            train_train_trans = train_test_trans = test_trans
        elif opt.augment == "all":
            train_train_trans = train_test_trans = train_trans
        elif opt.augment == "spt":
            train_train_trans = train_trans
            train_test_trans = test_trans
        elif opt.augment == "qry":
            train_train_trans = test_trans
            train_test_trans = train_trans
        print("spt trans")
        print(train_train_trans)
        print("qry trans")
        print(train_test_trans)

    # The effective batch is accumulated over apply_every sub-batches.
    sub_batch_size, rmd = divmod(opt.batch_size, opt.apply_every)
    assert rmd == 0
    print("Train sub batch-size:", sub_batch_size)

    meta_train_dataset = MetaImageNet(
        args=opt,
        partition="train",
        train_transform=train_train_trans,
        test_transform=train_test_trans,
        fname="miniImageNet_category_split_train_phase_%s.pickle",
        fix_seed=False,
        n_test_runs=10000000,  # big number to never stop
        new_labels=False,
    )
    meta_trainloader = DataLoader(
        meta_train_dataset,
        batch_size=sub_batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=opt.num_workers,
        pin_memory=True,
    )
    # Second sampler that provides extra query-only episodes.
    meta_train_dataset_qry = MetaImageNet(
        args=opt,
        partition="train",
        train_transform=train_train_trans,
        test_transform=train_test_trans,
        fname="miniImageNet_category_split_train_phase_%s.pickle",
        fix_seed=False,
        n_test_runs=10000000,  # big number to never stop
        new_labels=False,
        n_ways=opt.n_qry_way,
        n_shots=opt.n_qry_shot,
        n_queries=0,
    )
    meta_trainloader_qry = DataLoader(
        meta_train_dataset_qry,
        batch_size=sub_batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=opt.num_workers,
        pin_memory=True,
    )
    meta_val_dataset = MetaImageNet(
        args=opt,
        partition="val",
        train_transform=test_trans,
        test_transform=test_trans,
        fix_seed=False,
        n_test_runs=200,
        n_ways=5,
        n_shots=5,
        n_queries=15,
    )
    meta_valloader = DataLoader(
        meta_val_dataset,
        batch_size=opt.test_batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=opt.num_workers,
        pin_memory=True,
    )
    # Plain (non-episodic) validation loader for supervised accuracy.
    val_loader = DataLoader(
        ImageNet(args=opt, partition="val", transform=test_trans),
        batch_size=opt.sup_val_batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=opt.num_workers,
        pin_memory=True,
    )

    n_cls = len(meta_train_dataset.classes)
    print(n_cls)

    model = create_model(
        opt.model,
        n_cls,
        opt.dataset,
        opt.drop_rate,
        opt.dropblock,
        opt.track_stats,
        opt.initializer,
        opt.weight_norm,
        activation=opt.activation,
        normalization=opt.normalization,
    )
    print(model)

    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        print(torch.cuda.get_device_name())
        device = torch.device("cuda")
        model = model.to(device)
        criterion = criterion.to(device)
        cudnn.benchmark = True
    else:
        device = torch.device("cpu")

    print("Learning rate")
    print(opt.learning_rate)
    print("Inner Learning rate")
    print(opt.inner_lr)
    if opt.learn_lr:
        print("Optimizing learning rate")

    # Inner-loop LR is a Parameter so it can optionally be meta-learned.
    inner_lr = nn.Parameter(torch.tensor(opt.inner_lr),
                            requires_grad=opt.learn_lr)

    optimizer = torch.optim.Adam(
        list(model.parameters()) +
        [inner_lr] if opt.learn_lr else model.parameters(),
        lr=opt.learning_rate,
    )

    # NOTE(review): constructed but not passed to train_step (it receives
    # None); kept to preserve existing behavior.
    inner_opt = torch.optim.SGD(
        model.classifier.parameters(),
        lr=opt.inner_lr,
    )

    logger = SummaryWriter(logdir=opt.tb_folder,
                           flush_secs=10,
                           comment=opt.model_name)
    comet_logger = Experiment(
        api_key=os.environ["COMET_API_KEY"],
        project_name=opt.comet_project_name,
        workspace=opt.comet_workspace,
        disabled=not opt.logcomet,
        auto_metric_logging=False,
    )
    comet_logger.set_name(opt.model_name)
    comet_logger.log_parameters(vars(opt))
    comet_logger.set_model_graph(str(model))

    if opt.cosine:
        eta_min = opt.learning_rate * opt.cosine_factor
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.num_steps, eta_min, -1)

    # routine: supervised pre-training
    data_sampler = iter(meta_trainloader)
    data_sampler_qry = iter(meta_trainloader_qry)

    pbar = tqdm(
        range(1, opt.num_steps + 1),
        miniters=opt.print_freq,
        mininterval=3,
        maxinterval=30,
        ncols=0,
    )
    best_val_acc = 0.0
    for step in pbar:
        if not opt.cosine:
            adjust_learning_rate(step, opt, optimizer)

        # Per-step accumulators (become tensors after the first sub-batch).
        foa = fol = ioa = iil = fil = iia = fia = 0.0
        for j in range(opt.apply_every):
            x_spt, y_spt, x_qry, y_qry = [
                t.to(device) for t in next(data_sampler)
            ]
            x_qry2, y_qry2, _, _ = [
                t.to(device) for t in next(data_sampler_qry)
            ]
            y_spt = y_spt.flatten(1)
            y_qry2 = y_qry2.flatten(1)
            # The query set also contains the support and the extra
            # query-only episodes.
            x_qry = torch.cat((x_spt, x_qry, x_qry2), 1)
            y_qry = torch.cat((y_spt, y_qry, y_qry2), 1)
            if step == 1 and j == 0:
                print(x_spt.size(), y_spt.size(), x_qry.size(), y_qry.size())
            info = train_step(
                model,
                model.classifier,
                None,  # inner_opt,
                inner_lr,
                x_spt,
                y_spt,
                x_qry,
                y_qry,
                reset_head=opt.reset_head,
                num_steps=opt.num_inner_steps,
            )
            # Normalize by the full batch size so gradients accumulated
            # across sub-batches match a single large batch.
            _foa = info["foa"] / opt.batch_size
            _fol = info["fol"] / opt.batch_size
            _ioa = info["ioa"] / opt.batch_size
            _iil = info["iil"] / opt.batch_size
            _fil = info["fil"] / opt.batch_size
            _iia = info["iia"] / opt.batch_size
            _fia = info["fia"] / opt.batch_size
            _fol.backward()
            foa += _foa.detach()
            fol += _fol.detach()
            ioa += _ioa.detach()
            iil += _iil.detach()
            fil += _fil.detach()
            iia += _iia.detach()
            fia += _fia.detach()

        optimizer.step()
        optimizer.zero_grad()
        # Keep the (possibly learned) inner LR strictly positive.
        inner_lr.data.clamp_(min=0.001)

        if opt.cosine:
            scheduler.step()

        if (step == 1) or (step % opt.eval_freq == 0):
            val_info = test_run(
                iter(meta_valloader),
                model,
                model.classifier,
                torch.optim.SGD(model.classifier.parameters(),
                                lr=inner_lr.item()),
                num_inner_steps=opt.num_inner_steps_test,
                device=device,
            )
            val_acc_feat, val_std_feat = meta_test(
                model,
                meta_valloader,
                use_logit=False,
            )
            val_acc = val_info["outer"]["acc"].cpu()
            val_loss = val_info["outer"]["loss"].cpu()
            sup_acc, sup_acc_top5, sup_loss = validate(
                val_loader,
                model,
                criterion,
                print_freq=100000000,
            )
            sup_acc = sup_acc.item()
            sup_acc_top5 = sup_acc_top5.item()
            print(f"\nValidation step {step}")
            print(f"MAML 5-way-5-shot accuracy: {val_acc.item()}")
            print(f"LR 5-way-5-shot accuracy: {val_acc_feat}+-{val_std_feat}")
            print(
                f"Supervised accuracy: Acc@1: {sup_acc} Acc@5: {sup_acc_top5} Loss: {sup_loss}"
            )
            if val_acc_feat > best_val_acc:
                best_val_acc = val_acc_feat
                print(
                    f"New best validation accuracy {best_val_acc.item()} saving checkpoints\n"
                )
                torch.save(
                    {
                        "opt": opt,
                        "model": model.state_dict()
                        if opt.n_gpu <= 1 else model.module.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "step": step,
                        "val_acc": val_acc,
                        "val_loss": val_loss,
                        "val_acc_lr": val_acc_feat,
                        "sup_acc": sup_acc,
                        "sup_acc_top5": sup_acc_top5,
                        "sup_loss": sup_loss,
                    },
                    os.path.join(opt.save_folder,
                                 "{}_best.pth".format(opt.model)),
                )
            comet_logger.log_metrics(
                dict(
                    fol=val_loss,
                    foa=val_acc,
                    acc_lr=val_acc_feat,
                    sup_acc=sup_acc,
                    sup_acc_top5=sup_acc_top5,
                    sup_loss=sup_loss,
                ),
                step=step,
                prefix="val",
            )
            logger.add_scalar("val_acc", val_acc, step)
            logger.add_scalar("val_loss", val_loss, step)
            logger.add_scalar("val_acc_lr", val_acc_feat, step)
            logger.add_scalar("sup_acc", sup_acc, step)
            logger.add_scalar("sup_acc_top5", sup_acc_top5, step)
            logger.add_scalar("sup_loss", sup_loss, step)

        if (step == 1) or (step % opt.eval_freq == 0) or (step %
                                                          opt.print_freq == 0):
            tfol = fol.cpu()
            tfoa = foa.cpu()
            tioa = ioa.cpu()
            tiil = iil.cpu()
            tfil = fil.cpu()
            tiia = iia.cpu()
            tfia = fia.cpu()
            comet_logger.log_metrics(
                dict(
                    fol=tfol,
                    foa=tfoa,
                    # FIX: was ioa=tfoa, which logged the outer accuracy
                    # twice and never reported the inner outer-accuracy.
                    ioa=tioa,
                    iil=tiil,
                    fil=tfil,
                    iia=tiia,
                    fia=tfia,
                ),
                step=step,
                prefix="train",
            )
            logger.add_scalar("train_acc", tfoa.item(), step)
            logger.add_scalar("train_loss", tfol.item(), step)
            logger.add_scalar("train_ioa", tioa, step)
            logger.add_scalar("train_iil", tiil, step)
            logger.add_scalar("train_fil", tfil, step)
            logger.add_scalar("train_iia", tiia, step)
            logger.add_scalar("train_fia", tfia, step)

            pbar.set_postfix(
                fol=f"{tfol.item():.2f}",
                foa=f"{tfoa.item():.2f}",
                ioa=f"{tioa.item():.2f}",
                iia=f"{tiia.item():.2f}",
                fia=f"{tfia.item():.2f}",
                vl=f"{val_loss.item():.2f}",
                va=f"{val_acc.item():.2f}",
                valr=f"{val_acc_feat:.2f}",
                lr=f"{inner_lr.item():.4f}",
                vsa=f"{sup_acc:.2f}",
                refresh=True,
            )

    # save the last model
    state = {
        "opt": opt,
        "model": model.state_dict()
        if opt.n_gpu <= 1 else model.module.state_dict(),
        "optimizer": optimizer.state_dict(),
        "step": step,
    }
    save_file = os.path.join(opt.save_folder, "{}_last.pth".format(opt.model))
    torch.save(state, save_file)
def train(defparams, hyper):
    """WGAN-GP training loop for conditional calorimeter-shower generation.

    Trains a critic (aD) and generator (aG), optionally a supervised energy
    regressor (aE, when params["train_calib"]), logging losses to comet.ml
    and checkpointing every 1000 iterations.

    Args:
        defparams: run configuration dict (paths, flags, experiment tag).
        hyper: hyper-parameter dict (network widths, LRs, loss weights).

    Fixes over the previous revision: the three ``dataiter.next()`` calls
    (removed in Python 3) are now ``next(dataiter)``.
    """
    # Shallow copies so the caller's dicts are never mutated.
    params = dict(defparams)
    hyperp = dict(hyper)

    experiment = Experiment(api_key="keGmeIz4GfKlQZlOP6cit4QOi",
                            project_name="hadron-shower",
                            workspace="engineren")
    experiment.add_tag(params['exp'])
    experiment.log_parameters(hyperp)

    device = torch.device("cuda")
    torch.manual_seed(params["seed"])

    aD = DCGAN_D(hyperp["ndf"]).to(device)
    aG = DCGAN_G(hyperp["ngf"], hyperp["z"]).to(device)
    aE = energyRegressor().to(device)
    #aP = PostProcess_Size1Conv_EcondV2(30, 3, 128, bias=True, out_funct='none').to(device)

    experiment.set_model_graph(str(aG))
    experiment.set_model_graph(str(aD))

    ## no need for post processing now
    #if params["restore_pp"]:
    #    aP.load_state_dict(torch.load(params["restore_path_PP"] + params["post_saved"], map_location=torch.device(device)))

    if params["restore"]:
        aG.load_state_dict(
            torch.load(params["restore_path"] + params["gen_saved"],
                       map_location=torch.device(device)))
        aD.load_state_dict(
            torch.load(params["restore_path"] + params["crit_saved"],
                       map_location=torch.device(device)))
    else:
        aG.apply(weights_init)
        aD.apply(weights_init)

    # Energy regressor: fresh init (c0) or restore from a calibration run (c1).
    if params["c0"]:
        aE.apply(weights_init)
    elif params["c1"]:
        aE.load_state_dict(
            torch.load(params["calib_saved"],
                       map_location=torch.device(device)))

    one = torch.tensor(1.0).to(device)
    mone = (one * -1).to(device)

    print('loading data...')
    paths_list = [
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part1.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part2.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part3.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part4.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part5.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part6.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part7.hdf5'
    ]
    train_data = PionsDataset(paths_list)
    dataloader = DataLoader(train_data,
                            shuffle=True,
                            batch_size=hyperp["batch_size"],
                            num_workers=10)
    print('done')

    optimizer_g = torch.optim.Adam(aG.parameters(),
                                   lr=hyperp["L_gen"],
                                   betas=(0.5, 0.9))
    optimizer_d = torch.optim.Adam(aD.parameters(),
                                   lr=hyperp["L_crit"],
                                   betas=(0.5, 0.9))
    optimizer_e = torch.optim.SGD(aE.parameters(), lr=hyperp["L_calib"])

    e_criterion = nn.L1Loss()  # for energy regressor training

    dataiter = iter(dataloader)

    BATCH_SIZE = hyperp["batch_size"]
    LATENT = hyperp["z"]
    EXP = params["exp"]
    KAPPA = hyperp["kappa"]
    LAMBD = hyperp["lambda"]
    ## Post-Processing
    LDP = hyperp["LDP"]
    wMMD = hyperp["wMMD"]
    wMSE = hyperp["wMSE"]

    ## IO paths
    OUTP = params['output_path']

    for iteration in range(1, 75000):

        #---------------------TRAIN D------------------------
        for p in aD.parameters():  # reset requires_grad
            p.requires_grad_(True)  # they are set to False below in training G
        for e in aE.parameters():  # reset requires_grad (constrainer)
            e.requires_grad_(True)  # they are set to False below in training G

        for i in range(hyperp["ncrit"]):
            aD.zero_grad()
            aE.zero_grad()

            noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))
            noise = torch.from_numpy(noise).float()
            # [BS, nz] --> [BS, nz, 1, 1, 1]: shape needed by the generator.
            noise = noise.view(-1, LATENT, 1, 1, 1)
            noise = noise.to(device)

            batch = next(dataiter, None)
            if batch is None:
                # Loader exhausted: restart it.  FIX: was dataiter.next(),
                # which raises AttributeError on Python 3.
                dataiter = iter(dataloader)
                batch = next(dataiter)

            real_label = batch['energy']  ## energy label
            real_label = real_label.to(device)

            with torch.no_grad():
                noisev = noise  # totally freeze G, training D
                fake_data = aG(noisev, real_label).detach()

            real_data = batch['shower']  # 48x48x48 calo image
            real_data = real_data.to(device)
            real_data.requires_grad_(True)

            #### supervised-training for energy regressor!
            if params["train_calib"]:
                output = aE(real_data.float())
                e_loss = e_criterion(output, real_label.view(BATCH_SIZE, 1))
                e_loss.backward()
                optimizer_e.step()

            # train with real data
            disc_real = aD(real_data.float(), real_label.float())

            # train with fake data
            fake_data = fake_data.unsqueeze(1)  ## add channel dim
            disc_fake = aD(fake_data, real_label.float())

            # train with interpolated data
            gradient_penalty = calc_gradient_penalty(aD,
                                                     real_data,
                                                     fake_data,
                                                     real_label,
                                                     BATCH_SIZE,
                                                     device,
                                                     DIM=48)

            ## wasserstein-1 distance
            w_dist = torch.mean(disc_fake) - torch.mean(disc_real)
            # final disc cost (reuses w_dist instead of recomputing the means)
            disc_cost = w_dist + LAMBD * gradient_penalty

            disc_cost.backward()
            optimizer_d.step()

            #--------------Log to COMET ML ----------
            if i == hyperp["ncrit"] - 1:
                experiment.log_metric("L_crit", disc_cost, step=iteration)
                experiment.log_metric("gradient_pen",
                                      gradient_penalty,
                                      step=iteration)
                experiment.log_metric("Wasserstein Dist",
                                      w_dist,
                                      step=iteration)
                if params["train_calib"]:
                    experiment.log_metric("L_const", e_loss, step=iteration)

        #---------------------TRAIN G------------------------
        for p in aD.parameters():
            p.requires_grad_(False)  # freeze D
        for c in aE.parameters():
            c.requires_grad_(False)  # freeze C

        gen_cost = None
        for i in range(hyperp["ngen"]):
            aG.zero_grad()

            noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))
            noise = torch.from_numpy(noise).float()
            noise = noise.view(-1, LATENT, 1, 1, 1)
            noise = noise.to(device)

            batch = next(dataiter, None)
            if batch is None:
                dataiter = iter(dataloader)
                batch = next(dataiter)  # FIX: was dataiter.next()

            real_label = batch['energy']  ## energy label
            real_label = real_label.to(device)

            noise.requires_grad_(True)

            real_data = batch['shower']  # 48x48x48 calo image
            real_data = real_data.to(device)

            fake_data = aG(noise, real_label.float())
            fake_data = fake_data.unsqueeze(1)  ## add channel dim

            ## calculate loss function
            gen_cost = aD(fake_data.float(), real_label.float())

            ## label conditioning: penalize the generator when the regressed
            ## energy of fakes deviates more than that of reals.
            output_g = aE(fake_data)
            output_r = aE(real_data.float())

            aux_fake = (output_g - real_label)**2
            aux_real = (output_r - real_label)**2
            aux_errG = torch.abs(aux_fake - aux_real)

            ## Total loss function for generator
            g_cost = -torch.mean(gen_cost) + KAPPA * torch.mean(aux_errG)
            g_cost.backward()
            optimizer_g.step()

        #--------------Log to COMET ML ----------
        experiment.log_metric("L_Gen", g_cost, step=iteration)

        if params["train_postP"]:
            #---------------------TRAIN P------------------------
            # NOTE(review): this branch references aP and optimizer_p, whose
            # definitions are commented out above — enabling train_postP
            # as-is raises NameError.  Left untouched pending a decision on
            # the post-processing network.
            for p in aD.parameters():
                p.requires_grad_(False)  # freeze D
            for c in aG.parameters():
                c.requires_grad_(False)  # freeze G

            lossP = None
            for i in range(1):
                noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))
                noise = torch.from_numpy(noise).float()
                noise = noise.view(-1, LATENT, 1, 1, 1)
                noise = noise.to(device)

                batch = next(dataiter, None)
                if batch is None:
                    dataiter = iter(dataloader)
                    batch = next(dataiter)  # FIX: was dataiter.next()

                real_label = batch[1]  ## energy label
                real_label = real_label.unsqueeze(-1)  ## [BS] -> [BS, 1]
                real_label = real_label.to(device)
                # [BS, 1] --> [BS, 1, 1, 1, 1]: shape needed by the generator.
                real_label = real_label.view(-1, 1, 1, 1, 1)

                noise.requires_grad_(True)

                real_data = batch[0]  # 30x30x30 calo layers
                real_data = real_data.unsqueeze(1)
                real_data = real_data.to(device)

                fake_data = aG(noise, real_label.float())
                real_label = real_label.view(BATCH_SIZE, 1)  ## back to [BS, 1]
                fake_data = fake_data.unsqueeze(1)

                ### first LossD_P
                fake_dataP = aP(fake_data.float(), real_label.float())
                lossD_P = aD(fake_dataP.float(), real_label.float())
                lossD_P = lossD_P.mean()

                ## lossFixP: keep post-processed showers close to the raw
                ## generator output, in sorted-hit space (MMD) and MSE.
                real_sorted = real_data.view(BATCH_SIZE, -1)
                fake_sorted = fake_dataP.view(BATCH_SIZE, -1)

                real_sorted, _ = torch.sort(real_sorted,
                                            dim=1,
                                            descending=True)
                fake_sorted, _ = torch.sort(fake_sorted,
                                            dim=1,
                                            descending=True)

                lossFixPp1 = mmd_hit_sortKernel(real_sorted.float(),
                                                fake_sorted,
                                                kernel_size=100,
                                                stride=50,
                                                cutoff=2000,
                                                alpha=200)
                lossFixPp2 = F.mse_loss(fake_dataP.view(BATCH_SIZE, -1),
                                        fake_data.detach().view(
                                            BATCH_SIZE, -1),
                                        reduction='mean')

                lossFixP = wMMD * lossFixPp1 + wMSE * lossFixPp2

                lossP = LDP * lossD_P - lossFixP
                lossP.backward(mone)
                optimizer_p.step()

        # Periodic console report + checkpointing.
        if iteration % 1000 == 999 or iteration == 1:
            print('iteration: {}, critic loss: {}'.format(
                iteration, disc_cost.cpu().data.numpy()))
            torch.save(aG.state_dict(),
                       OUTP + '{0}/netG_itrs_{1}.pth'.format(EXP, iteration))
            torch.save(aD.state_dict(),
                       OUTP + '{0}/netD_itrs_{1}.pth'.format(EXP, iteration))
            if params["train_calib"]:
                torch.save(
                    aE.state_dict(),
                    OUTP + '/{0}/netE_itrs_{1}.pth'.format(EXP, iteration))
            if params["train_postP"]:
                torch.save(
                    aP.state_dict(),
                    OUTP + '{0}/netP_itrs_{1}.pth'.format(EXP, iteration))
# Gradient updates updates = minimizer.apply_gradients(clipped_grads_and_vars) def one_hot(v): return np.eye(vocab_size)[v] # begin training sess = tf.Session() init = tf.global_variables_initializer() sess.run(init) # log model graph experiment.set_model_graph(sess.graph) # Initial values MAXITERS = 500000 n, p = 0, 0 hprev_val = np.zeros([1, hidden_size]) while (n < MAXITERS): # Initialize if p + seq_length + 1 >= len(data) or n == 0: hprev_val = np.zeros([1, hidden_size]) p = 0 # reset # Prepare inputs input_vals = [char_to_ix[ch] for ch in data[p:p + seq_length]] target_vals = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]
class Trainer():
    """Training/evaluation driver for a MAC VQA model on CLEVR or GQA.

    Maintains two copies of the network: `self.model` (trained directly) and
    `self.model_ema` (exponential moving average of the weights, updated after
    every optimizer step). Metrics go to both a TensorBoard `SummaryWriter`
    and a comet.ml `Experiment`.
    """

    def __init__(self, log_dir, cfg):
        """Build datasets/loaders, the model pair, optimizer and loggers.

        Args:
            log_dir: root output directory for models, logs and cfg dump.
            cfg: project config namespace (TRAIN/DATASET/... sections).
        """
        self.path = log_dir
        self.cfg = cfg

        if cfg.TRAIN.FLAG:
            self.model_dir = os.path.join(self.path, 'Model')
            self.log_dir = os.path.join(self.path, 'Log')
            mkdir_p(self.model_dir)
            mkdir_p(self.log_dir)
            self.writer = SummaryWriter(log_dir=self.log_dir)
            self.logfile = os.path.join(self.path, "logfile.log")
            # Tee stdout into the logfile for the rest of the run.
            sys.stdout = Logger(logfile=self.logfile)

        self.data_dir = cfg.DATASET.DATA_DIR
        self.max_epochs = cfg.TRAIN.MAX_EPOCHS
        self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL

        s_gpus = cfg.GPU_ID.split(',')
        self.gpus = [int(ix) for ix in s_gpus]
        self.num_gpus = len(self.gpus)

        self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.lr = cfg.TRAIN.LEARNING_RATE

        # All work happens on the first listed GPU.
        torch.cuda.set_device(self.gpus[0])
        cudnn.benchmark = True

        sample = cfg.SAMPLE
        self.dataset = []
        self.dataloader = []
        self.use_feats = cfg.model.use_feats
        eval_split = cfg.EVAL if cfg.EVAL else 'val'
        train_split = cfg.DATASET.train_split
        if cfg.DATASET.DATASET == 'clevr':
            clevr_collate_fn = collate_fn
            cogent = cfg.DATASET.COGENT
            if cogent:
                print(f'Using CoGenT {cogent.upper()}')
            if cfg.TRAIN.FLAG:
                # Train loader only exists in training mode; eval loader always does.
                self.dataset = ClevrDataset(data_dir=self.data_dir, split=train_split + cogent, sample=sample,
                                            **cfg.DATASET.params)
                self.dataloader = DataLoader(dataset=self.dataset, batch_size=cfg.TRAIN.BATCH_SIZE, shuffle=True,
                                             num_workers=cfg.WORKERS, drop_last=True, collate_fn=clevr_collate_fn)
            self.dataset_val = ClevrDataset(data_dir=self.data_dir, split=eval_split + cogent, sample=sample,
                                            **cfg.DATASET.params)
            self.dataloader_val = DataLoader(dataset=self.dataset_val, batch_size=cfg.TEST_BATCH_SIZE,
                                             drop_last=False, shuffle=False, num_workers=cfg.WORKERS,
                                             collate_fn=clevr_collate_fn)
        elif cfg.DATASET.DATASET == 'gqa':
            # NOTE(review): if use_feats is neither 'spatial' nor 'objects',
            # gqa_collate_fn is never bound and the loaders below raise
            # NameError — confirm the config validates this upstream.
            if self.use_feats == 'spatial':
                gqa_collate_fn = collate_fn_gqa
            elif self.use_feats == 'objects':
                gqa_collate_fn = collate_fn_gqa_objs
            if cfg.TRAIN.FLAG:
                self.dataset = GQADataset(data_dir=self.data_dir, split=train_split, sample=sample,
                                          use_feats=self.use_feats, **cfg.DATASET.params)
                self.dataloader = DataLoader(dataset=self.dataset, batch_size=cfg.TRAIN.BATCH_SIZE, shuffle=True,
                                             num_workers=cfg.WORKERS, drop_last=True, collate_fn=gqa_collate_fn)
            self.dataset_val = GQADataset(data_dir=self.data_dir, split=eval_split, sample=sample,
                                          use_feats=self.use_feats, **cfg.DATASET.params)
            self.dataloader_val = DataLoader(dataset=self.dataset_val, batch_size=cfg.TEST_BATCH_SIZE,
                                             shuffle=False, num_workers=cfg.WORKERS, drop_last=False,
                                             collate_fn=gqa_collate_fn)

        # load model
        self.vocab = load_vocab(cfg)
        self.model, self.model_ema = mac.load_MAC(cfg, self.vocab)
        # alpha=0 copies the live weights into the EMA model verbatim.
        self.weight_moving_average(alpha=0)
        if cfg.TRAIN.RADAM:
            self.optimizer = RAdam(self.model.parameters(), lr=self.lr)
        else:
            self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.start_epoch = 0
        if cfg.resume_model:
            # Resume both the live model (+ optimizer state) and the EMA model.
            location = 'cuda' if cfg.CUDA else 'cpu'
            state = torch.load(cfg.resume_model, map_location=location)
            self.model.load_state_dict(state['model'])
            self.optimizer.load_state_dict(state['optim'])
            self.start_epoch = state['iter'] + 1
            state = torch.load(cfg.resume_model_ema, map_location=location)
            self.model_ema.load_state_dict(state['model'])
        if cfg.start_epoch is not None:
            self.start_epoch = cfg.start_epoch

        # Best-so-far bookkeeping used for checkpoint selection / early stop.
        self.previous_best_acc = 0.0
        self.previous_best_epoch = 0
        self.previous_best_loss = 100
        self.previous_best_loss_epoch = 0

        self.total_epoch_loss = 0
        self.prior_epoch_loss = 10

        self.print_info()
        self.loss_fn = torch.nn.CrossEntropyLoss().cuda()

        # Comet experiment; fully disabled unless cfg.logcomet is truthy.
        self.comet_exp = Experiment(
            project_name=cfg.COMET_PROJECT_NAME,
            api_key=os.getenv('COMET_API_KEY'),
            workspace=os.getenv('COMET_WORKSPACE'),
            disabled=cfg.logcomet is False,
        )
        if cfg.logcomet:
            exp_name = cfg_to_exp_name(cfg)
            print(exp_name)
            self.comet_exp.set_name(exp_name)
            self.comet_exp.log_parameters(flatten_json_iterative_solution(cfg))
            self.comet_exp.log_asset(self.logfile)
            self.comet_exp.log_asset_data(json.dumps(cfg, indent=4), file_name='cfg.json')
            self.comet_exp.set_model_graph(str(self.model))
            if cfg.cfg_file:
                self.comet_exp.log_asset(cfg.cfg_file)

        # Persist the resolved config alongside the run outputs.
        with open(os.path.join(self.path, 'cfg.json'), 'w') as f:
            json.dump(cfg, f, indent=4)

    def print_info(self):
        """Pretty-print the config, dataset sizes and model architecture."""
        print('Using config:')
        pprint.pprint(self.cfg)
        print("\n")
        pprint.pprint("Size of train dataset: {}".format(len(self.dataset)))
        # print("\n")
        pprint.pprint("Size of val dataset: {}".format(len(self.dataset_val)))
        print("\n")
        print("Using MAC-Model:")
        pprint.pprint(self.model)
        print("\n")

    def weight_moving_average(self, alpha=0.999):
        """Update the EMA model in place: ema = alpha*ema + (1-alpha)*live."""
        for param1, param2 in zip(self.model_ema.parameters(), self.model.parameters()):
            param1.data *= alpha
            param1.data += (1.0 - alpha) * param2.data

    def set_mode(self, mode="train"):
        """Put both models in train mode if mode == "train", else eval mode."""
        if mode == "train":
            self.model.train()
            self.model_ema.train()
        else:
            self.model.eval()
            self.model_ema.eval()

    def reduce_lr(self):
        """Halve the LR when the epoch-to-epoch loss improvement has plateaued.

        Three (threshold, floor) regimes: the smaller the loss, the smaller the
        improvement required before halving; each regime also has a minimum LR
        below which no further reduction happens. Resets the epoch-loss
        accumulators afterwards.
        """
        epoch_loss = self.total_epoch_loss  # / float(len(self.dataset) // self.batch_size)
        lossDiff = self.prior_epoch_loss - epoch_loss
        if ((lossDiff < 0.015 and self.prior_epoch_loss < 0.5 and self.lr > 0.00002) or \
            (lossDiff < 0.008 and self.prior_epoch_loss < 0.15 and self.lr > 0.00001) or \
            (lossDiff < 0.003 and self.prior_epoch_loss < 0.10 and self.lr > 0.000005)):
            self.lr *= 0.5
            print("Reduced learning rate to {}".format(self.lr))
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.lr
        self.prior_epoch_loss = epoch_loss
        self.total_epoch_loss = 0

    def save_models(self, iteration):
        """Checkpoint both the live model (with optimizer) and the EMA model."""
        save_model(self.model, self.optimizer, iteration, self.model_dir, model_name="model")
        save_model(self.model_ema, None, iteration, self.model_dir, model_name="model_ema")

    def train_epoch(self, epoch):
        """Run one training epoch; return {'loss', 'accuracy', 'avg_loss', 'avg_accuracy'}."""
        cfg = self.cfg
        total_loss = 0.
        total_correct = 0
        total_samples = 0
        self.labeled_data = iter(self.dataloader)
        self.set_mode("train")

        dataset = tqdm(self.labeled_data, total=len(self.dataloader), ncols=20)

        for data in dataset:
            ######################################################
            # (1) Prepare training data
            ######################################################
            image, question, question_len, answer = data['image'], data[
                'question'], data['question_length'], data['answer']
            answer = answer.long()
            question = Variable(question)
            answer = Variable(answer)
            if cfg.CUDA:
                if self.use_feats == 'spatial':
                    image = image.cuda()
                elif self.use_feats == 'objects':
                    # Object features arrive as a list of per-sample tensors.
                    image = [e.cuda() for e in image]
                question = question.cuda()
                answer = answer.cuda().squeeze()
            else:
                question = question
                image = image
                answer = answer.squeeze()

            ############################
            # (2) Train Model
            ############################
            self.optimizer.zero_grad()

            scores = self.model(image, question, question_len)
            loss = self.loss_fn(scores, answer)
            loss.backward()
            if self.cfg.TRAIN.CLIP_GRADS:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cfg.TRAIN.CLIP)
            self.optimizer.step()
            # Refresh the EMA weights after every optimizer step.
            self.weight_moving_average()

            ############################
            # (3) Log Progress
            ############################
            correct = scores.detach().argmax(1) == answer
            total_correct += correct.sum().cpu().item()
            # Weight the loss by batch size so the average is per-sample.
            total_loss += loss.item() * answer.size(0)
            total_samples += answer.size(0)

            avg_loss = total_loss / total_samples
            train_accuracy = total_correct / total_samples
            # accuracy = correct.sum().cpu().numpy() / answer.shape[0]
            # if avg_loss == 0:
            #     avg_loss = loss.item()
            #     train_accuracy = accuracy
            # else:
            #     avg_loss = 0.99 * avg_loss + 0.01 * loss.item()
            #     train_accuracy = 0.99 * train_accuracy + 0.01 * accuracy
            # self.total_epoch_loss += loss.item() * answer.size(0)

            dataset.set_description(
                'Epoch: {}; Avg Loss: {:.5f}; Avg Train Acc: {:.5f}'.format(
                    epoch + 1, avg_loss, train_accuracy))

        # Epoch-mean loss feeds the reduce_lr() plateau logic.
        self.total_epoch_loss = avg_loss

        # NOTE(review): `dict` shadows the builtin for the rest of this method.
        dict = {
            "loss": avg_loss,
            "accuracy": train_accuracy,
            "avg_loss": avg_loss,  # For commet
            "avg_accuracy": train_accuracy,  # For commet
        }

        return dict

    def train(self):
        """Main loop: train each epoch, validate, log to comet, early-stop, save."""
        cfg = self.cfg
        print("Start Training")
        for epoch in range(self.start_epoch, self.max_epochs):

            with self.comet_exp.train():
                dict = self.train_epoch(epoch)
                self.reduce_lr()
                dict['epoch'] = epoch + 1
                dict['lr'] = self.lr
                self.comet_exp.log_metrics(
                    dict,
                    epoch=epoch + 1,
                )

            with self.comet_exp.validate():
                dict = self.log_results(epoch, dict)
                dict['epoch'] = epoch + 1
                dict['lr'] = self.lr
                self.comet_exp.log_metrics(
                    dict,
                    epoch=epoch + 1,
                )

            # NOTE(review): 'EALRY_STOPPING' is a typo'd config key — it must
            # match the config definition elsewhere, so it is left as-is.
            if cfg.TRAIN.EALRY_STOPPING:
                # Stop when PATIENCE epochs have passed since the best val acc.
                if epoch - cfg.TRAIN.PATIENCE == self.previous_best_epoch:
                    # if epoch - cfg.TRAIN.PATIENCE == self.previous_best_loss_epoch:
                    print('Early stop')
                    break

        self.comet_exp.log_asset(self.logfile)
        self.save_models(self.max_epochs)
        self.writer.close()
        print("Finished Training")
        print(
            f"Highest validation accuracy: {self.previous_best_acc} at epoch {self.previous_best_epoch}"
        )

    def log_results(self, epoch, dict, max_eval_samples=None):
        """Write train/val metrics to TensorBoard, track bests, snapshot; return val metrics."""
        epoch += 1
        self.writer.add_scalar("avg_loss", dict["loss"], epoch)
        self.writer.add_scalar("train_accuracy", dict["accuracy"], epoch)

        metrics = self.calc_accuracy("validation", max_samples=max_eval_samples)
        self.writer.add_scalar("val_accuracy_ema", metrics['acc_ema'], epoch)
        self.writer.add_scalar("val_accuracy", metrics['acc'], epoch)
        self.writer.add_scalar("val_loss_ema", metrics['loss_ema'], epoch)
        self.writer.add_scalar("val_loss", metrics['loss'], epoch)
        print(
            "Epoch: {epoch}\tVal Acc: {acc},\tVal Acc EMA: {acc_ema},\tAvg Loss: {loss},\tAvg Loss EMA: {loss_ema},\tLR: {lr}"
            .format(epoch=epoch, lr=self.lr, **metrics))

        if metrics['acc'] > self.previous_best_acc:
            self.previous_best_acc = metrics['acc']
            self.previous_best_epoch = epoch
        if metrics['loss'] < self.previous_best_loss:
            self.previous_best_loss = metrics['loss']
            self.previous_best_loss_epoch = epoch

        if epoch % self.snapshot_interval == 0:
            self.save_models(epoch)

        return metrics

    def calc_accuracy(self,
                      mode="train",
                      max_samples=None):
        """Evaluate live and EMA models on the chosen loader.

        Returns dict(acc, acc_ema, loss, loss_ema) of per-sample averages.
        NOTE(review): `max_samples` is accepted but never used here.
        """
        # Any mode other than "train" puts the models in eval mode (see set_mode).
        self.set_mode("validation")
        if mode == "train":
            loader = self.dataloader
        # elif (mode == "validation") or (mode == 'test'):
        #     loader = self.dataloader_val
        else:
            loader = self.dataloader_val

        total_correct = 0
        total_correct_ema = 0
        total_samples = 0
        total_loss = 0.
        total_loss_ema = 0.
        pbar = tqdm(loader, total=len(loader), desc=mode.upper(), ncols=20)

        for data in pbar:
            image, question, question_len, answer = data['image'], data[
                'question'], data['question_length'], data['answer']
            answer = answer.long()
            question = Variable(question)
            answer = Variable(answer)
            if self.cfg.CUDA:
                if self.use_feats == 'spatial':
                    image = image.cuda()
                elif self.use_feats == 'objects':
                    image = [e.cuda() for e in image]
                question = question.cuda()
                answer = answer.cuda().squeeze()

            with torch.no_grad():
                scores = self.model(image, question, question_len)
                scores_ema = self.model_ema(image, question, question_len)

                loss = self.loss_fn(scores, answer)
                loss_ema = self.loss_fn(scores_ema, answer)

            correct = scores.detach().argmax(1) == answer
            correct_ema = scores_ema.detach().argmax(1) == answer

            total_correct += correct.sum().cpu().item()
            total_correct_ema += correct_ema.sum().cpu().item()

            total_loss += loss.item() * answer.size(0)
            total_loss_ema += loss_ema.item() * answer.size(0)
            total_samples += answer.size(0)

            avg_acc = total_correct / total_samples
            avg_acc_ema = total_correct_ema / total_samples
            avg_loss = total_loss / total_samples
            avg_loss_ema = total_loss_ema / total_samples

            pbar.set_postfix({
                'Acc': f'{avg_acc:.5f}',
                'Acc Ema': f'{avg_acc_ema:.5f}',
                'Loss': f'{avg_loss:.5f}',
                'Loss Ema': f'{avg_loss_ema:.5f}',
            })

        return dict(acc=avg_acc, acc_ema=avg_acc_ema, loss=avg_loss, loss_ema=avg_loss_ema)
def main(cfg: DictConfig):
    """Entry point: train a Cassava leaf-disease classifier with PyTorch Lightning.

    Args:
        cfg: Hydra config with `data`, `train` and `comet_ml` sections.
    """
    print('Cassava Leaf Disease Classification')
    # Hydra switches the working directory per run; return to the project root
    # so relative data paths resolve.
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)

    # Config -------------------------------------------------------------------
    data_dir = './input'
    seed_everything(cfg.data.seed)

    # Comet_ml: manual logging only (auto param/metric capture disabled).
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
                            project_name=cfg.comet_ml.project_name,
                            auto_param_logging=False,
                            auto_metric_logging=False)

    # Log Parameters (data section first, then train section)
    for section in (cfg.data, cfg.train):
        experiment.log_parameters(dict(section))

    # Data Module ---------------------------------------------------------------
    transform = get_transforms(transform_name=cfg.data.transform, img_size=cfg.data.img_size)
    cv = StratifiedKFold(n_splits=cfg.data.n_splits, shuffle=True, random_state=cfg.data.seed)
    dm = CassavaDataModule(data_dir,
                           cfg,
                           transform,
                           cv,
                           use_merge=True,
                           sample=DEBUG)

    # Model ----------------------------------------------------------------------
    net = Timm_model(cfg.train.model_type, pretrained=True)

    # Log Model Graph
    experiment.set_model_graph(str(net))

    # Loss fn ---------------------------------------------------------------------
    # Class weights = relative frequency of each label in the merged csv.
    df = pd.read_csv('./input/merged.csv')
    class_counts = df['label'].value_counts().sort_index().tolist()
    weight = torch.tensor([cnt / len(df) for cnt in class_counts]).cuda()
    del df

    criterion = get_loss_fn(cfg.train.loss_fn, weight=weight, smoothing=0.05)

    # Optimizer, Scheduler --------------------------------------------------------
    if cfg.train.use_sam:
        # SAM wraps a base optimizer (RAdam) and needs manual optimization below.
        optimizer = SAM(net.parameters(), RAdam, lr=cfg.train.lr,
                        weight_decay=cfg.train.weight_decay)
    else:
        optimizer = RAdam(net.parameters(), lr=cfg.train.lr,
                          weight_decay=cfg.train.weight_decay)

    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg.train.epoch, eta_min=0)

    # Lightning Module -------------------------------------------------------------
    model = CassavaLightningSystem(net,
                                   cfg,
                                   criterion=criterion,
                                   optimizer=optimizer,
                                   scheduler=scheduler,
                                   experiment=experiment)

    # Trainer -------------------------------------------------------------------------
    trainer = Trainer(
        logger=False,
        max_epochs=cfg.train.epoch,
        gpus=-1,
        amp_backend='apex',
        amp_level='O2',
        num_sanity_val_steps=0,  # Skip Sanity Check
        # SAM's two-step update requires manual optimization.
        automatic_optimization=not cfg.train.use_sam,
    )

    # Train
    trainer.fit(model, datamodule=dm)