Example #1
class CometMLMonitor(MonitorBase):
    """
    Send data to https://www.comet.ml.

    Note:
        1. comet_ml requires you to `import comet_ml` before importing tensorflow or tensorpack.
        2. The "automatic output logging" feature of comet_ml will make the training progress bar appear to freeze.
           Therefore the feature is disabled by default.
    """
    def __init__(self, experiment=None, api_key=None, tags=None, **kwargs):
        """
        Args:
            experiment (comet_ml.Experiment): if provided, all other arguments must be left unset
            api_key (str): your comet.ml API key
            tags (list[str]): experiment tags
            kwargs: other arguments passed to :class:`comet_ml.Experiment`.
        """
        if experiment is not None:
            self._exp = experiment
            assert api_key is None and tags is None and len(kwargs) == 0
        else:
            from comet_ml import Experiment
            kwargs.setdefault(
                'log_code', True
            )  # code logging itself does not work here, but git patch logging requires this flag
            kwargs.setdefault('auto_output_logging', None)
            self._exp = Experiment(api_key=api_key, **kwargs)
            if tags is not None:
                self._exp.add_tags(tags)

        self._exp.set_code(
            "Code logging is impossible because there are too many files ...")
        self._exp.log_dependency('tensorpack', __git_version__)

    @property
    def experiment(self):
        """
        The :class:`comet_ml.Experiment` instance.
        """
        return self._exp

    def _before_train(self):
        self._exp.set_model_graph(tf.get_default_graph())

    @HIDE_DOC
    def process_scalar(self, name, val):
        self._exp.log_metric(name, val, step=self.global_step)

    def _after_train(self):
        self._exp.end()

    def _after_epoch(self):
        self._exp.log_epoch_end(self.epoch_num)
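
The docstring note above matters in practice: comet_ml must be imported before TensorFlow or tensorpack. A minimal, hypothetical usage sketch follows; the API key, tags, and project name are placeholders rather than values from the original example, and the TrainConfig wiring is only indicated in a comment.

import comet_ml  # noqa: F401  -- must be imported before tensorflow/tensorpack

monitor = CometMLMonitor(api_key="YOUR-API-KEY",
                         tags=["baseline"],
                         project_name="my-project")  # extra kwargs go to comet_ml.Experiment
# ...then pass monitors=[monitor] (alongside the defaults) to the tensorpack TrainConfig.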
Example #2
def main():
    env = gym.make('CartPole-v1')
    model = PPO()
    score = 0.0
    print_interval = 20

    log = Log(__file__[:-3])

    experiment = Experiment(api_key="F8yfdGljIExZoi73No4gb1gF5",
                            project_name="reinforcement-learning",
                            workspace="zombasy")

    experiment.set_model_graph(model)

    for n_epi in range(2000):
        s = env.reset()
        done = False
        epsilon = max(0.01, args.epsilon - 0.01 * (n_epi / 200))
        while not done:
            for t in range(args.T_horizon):
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()

                coin = random.random()
                if coin < epsilon:
                    a = random.randint(0, 1)

                s_prime, r, done, info = env.step(a)

                model.put_data(
                    (s, a, r / 100.0, s_prime, prob[a].item(), done))
                s = s_prime

                score += r

                if done:
                    break

            model.train_net()

        if n_epi % print_interval == 0 and n_epi != 0:
            log.info("episode: {}, avg score: {:.1f}".format(
                n_epi, score / print_interval))
            experiment.log_metric('score', score / print_interval)
            experiment.log_metric('epsilon', epsilon)
            score = 0.0

        if n_epi % 500 == 0 and n_epi != 0:
            save_model(model, 'ppo', n_epi, experiment)

    env.close()
Example #3
def train(hyper_params):
    mnist = get_data()

    # Get graph definition, tensors and ops
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(
        hyper_params)

    # log parameters to Comet.ml
    import os
    # Setting the API key (saved as environment variable)
    exp = Experiment(
        api_key="<HIDDEN>",
        # or
        # api_key=os.environ.get("COMET_API_KEY"),
        project_name="prototype",
        workspace="jaimemarijke")
    exp.log_parameters(hyper_params)
    exp.log_dataset_hash(mnist)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        exp.set_model_graph(sess.graph)

        for i in range(hyper_params["steps"]):
            batch = mnist.train.next_batch(hyper_params["batch_size"])
            exp.set_step(i)
            # Compute train accuracy every 10 steps
            if i % 10 == 0:
                train_accuracy = accuracy.eval(feed_dict={
                    x: batch[0],
                    y_: batch[1]
                })
                print('step %d, training accuracy %g' % (i, train_accuracy))
                exp.log_metric("acc", train_accuracy)

            # Update weights (back propagation); fetch cross_entropy in the same
            # run so a real loss value is logged (Operation.run() returns None)
            _, loss = sess.run([train_step, cross_entropy],
                               feed_dict={x: batch[0], y_: batch[1]})
            exp.log_metric("loss", loss)

        ### Finished Training ###

        # Compute test accuracy
        acc = accuracy.eval(feed_dict={
            x: mnist.test.images,
            y_: mnist.test.labels
        })

        print('test accuracy %g' % acc)
Example #4
def train(hyper_params):
    mnist = get_data()

    # Get graph definition, tensors and ops
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(
        hyper_params)

    experiment = Experiment(project_name="tf")
    experiment.log_parameters(hyper_params)
    experiment.log_dataset_hash(mnist)

    with tf.Session() as sess:
        with experiment.train():
            sess.run(tf.global_variables_initializer())
            experiment.set_model_graph(sess.graph)

            for i in range(hyper_params["steps"]):
                batch = mnist.train.next_batch(hyper_params["batch_size"])
                experiment.set_step(i)
                # Compute train accuracy every 10 steps
                if i % 10 == 0:
                    train_accuracy = accuracy.eval(feed_dict={
                        x: batch[0],
                        y_: batch[1]
                    })
                    print('step %d, training accuracy %g' %
                          (i, train_accuracy))
                    experiment.log_metric("accuracy", train_accuracy, step=i)

                # Update weights (back propagation)
                _, loss_val = sess.run([train_step, cross_entropy],
                                       feed_dict={
                                           x: batch[0],
                                           y_: batch[1]
                                       })

                experiment.log_metric("loss", loss_val, step=i)

        ### Finished Training ###

        with experiment.test():
            # Compute test accuracy
            acc = accuracy.eval(feed_dict={
                x: mnist.test.images,
                y_: mnist.test.labels
            })
            experiment.log_metric("accuracy", acc)
            print('test accuracy %g' % acc)
Example #5
class CometMlAdapter(BaseAdapter):
    def __init__(self, api_key, project_name, experiment_name):
        self.experiment = Experiment(api_key=api_key,
                                     project_name=project_name)
        self.experiment.set_name(experiment_name)

    def log_parameters(self, hyper_params):
        self.experiment.log_parameters(hyper_params)

    def set_model_graph(self, graph):
        self.experiment.set_model_graph(graph)

    def log_metric(self, name, metric, step):
        self.experiment.log_metric(name, metric, step=step)

    def register(self, name):
        pass
Example #6
def train(hyper_params):
    mnist = get_data()

    # Get graph definition, tensors and ops
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(
        hyper_params)

    #log parameters to Comet.ml
    exp = Experiment(api_key="YOUR-API-KEY",
                     project_name='tensorflow examples')
    exp.log_multiple_params(hyper_params)
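    # Note: log_multiple_params is the older comet_ml spelling; newer releases
    # expose the same behaviour as exp.log_parameters(hyper_params).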
    exp.log_dataset_hash(mnist)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        exp.set_model_graph(sess.graph)

        for i in range(hyper_params["steps"]):
            batch = mnist.train.next_batch(hyper_params["batch_size"])
            exp.set_step(i)
            # Compute train accuracy every 10 steps
            if i % 10 == 0:
                train_accuracy = accuracy.eval(feed_dict={
                    x: batch[0],
                    y_: batch[1]
                })
                print('step %d, training accuracy %g' % (i, train_accuracy))
                exp.log_metric("acc", train_accuracy)

            # Update weights (back propagation); fetch cross_entropy in the same
            # run so a real loss value is logged (Operation.run() returns None)
            _, loss = sess.run([train_step, cross_entropy],
                               feed_dict={x: batch[0], y_: batch[1]})
            exp.log_metric("loss", loss)

        ### Finished Training ###

        # Compute test accuracy
        acc = accuracy.eval(feed_dict={
            x: mnist.test.images,
            y_: mnist.test.labels
        })

        print('test accuracy %g' % acc)
Example #7
        h0 = Variable(torch.zeros(self.num_layers, x.size(0),
                                  self.hidden_size))
        c0 = Variable(torch.zeros(self.num_layers, x.size(0),
                                  self.hidden_size))

        # Forward propagate RNN
        out, _ = self.lstm(x, (h0, c0))

        # Decode hidden state of last time step
        out = self.fc(out[:, -1, :])
        return out


rnn = RNN(hyper_params['input_size'], hyper_params['hidden_size'],
          hyper_params['num_layers'], hyper_params['num_classes'])

experiment.set_model_graph(str(rnn))

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(),
                             lr=hyper_params['learning_rate'])


def train_index_to_example(index):
    tmp, _ = train_dataset[index]
    img = tmp.numpy()[0]
    data = experiment.log_image(img, name="train_%d.png" % index)
    return {"sample": str(index), "assetId": data["imageId"]}


def test_index_to_example(index):
Example #8
def train(rank, defparams, hyper):

    params = {}
    for param in defparams.keys():
        params[param] = defparams[param]

    hyperp = {}
    for hp in hyper.keys():
        hyperp[hp] = hyper[hp]

    experiment = Experiment(api_key="keGmeIz4GfKlQZlOP6cit4QOi",
                            project_name="hadron-shower",
                            workspace="engineren")
    experiment.add_tag(params['exp'])

    experiment.log_parameters(hyperp)

    device = torch.device("cuda")
    torch.manual_seed(params["seed"])

    world_size = int(os.environ["SLURM_NNODES"])
    rank = int(os.environ["SLURM_PROCID"])

    dist.init_process_group(backend='nccl',
                            world_size=world_size,
                            rank=rank,
                            init_method=params["DDP_init_file"])

    aD = DCGAN_D(hyperp["ndf"]).to(device)
    aG = DCGAN_G(hyperp["ngf"], hyperp["z"]).to(device)
    aE = energyRegressor().to(device)
    aP = PostProcess_Size1Conv_EcondV2(48,
                                       13,
                                       3,
                                       128,
                                       bias=True,
                                       out_funct='none').to(device)

    optimizer_g = torch.optim.Adam(aG.parameters(),
                                   lr=hyperp["L_gen"],
                                   betas=(0.5, 0.9))
    optimizer_d = torch.optim.Adam(aD.parameters(),
                                   lr=hyperp["L_crit"],
                                   betas=(0.5, 0.9))
    optimizer_e = torch.optim.SGD(aE.parameters(), lr=hyperp["L_calib"])
    optimizer_p = torch.optim.Adam(aP.parameters(),
                                   lr=hyperp["L_post"],
                                   betas=(0.5, 0.9))

    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    # Initialize Amp
    models, optimizers = amp.initialize([aG, aD], [optimizer_g, optimizer_d],
                                        opt_level="O1",
                                        num_losses=2)

    #aD = nn.DataParallel(aD)
    #aG = nn.DataParallel(aG)
    #aE = nn.DataParallel(aE)

    aG, aD = models
    optimizer_g, optimizer_d = optimizers

    aG = nn.parallel.DistributedDataParallel(aG, device_ids=[0])
    aD = nn.parallel.DistributedDataParallel(aD, device_ids=[0])
    aE = nn.parallel.DistributedDataParallel(aE, device_ids=[0])
    aP = nn.parallel.DistributedDataParallel(aP, device_ids=[0])

    experiment.set_model_graph(str(aG), overwrite=False)
    experiment.set_model_graph(str(aD), overwrite=False)

    if params["restore_pp"]:
        aP.load_state_dict(
            torch.load(params["restore_path_PP"] + params["post_saved"],
                       map_location=torch.device(device)))

    if params["restore"]:
        checkpoint = torch.load(params["restore_path"])
        aG.load_state_dict(checkpoint['Generator'])
        aD.load_state_dict(checkpoint['Critic'])
        optimizer_g.load_state_dict(checkpoint['G_optimizer'])
        optimizer_d.load_state_dict(checkpoint['D_optimizer'])
        itr = checkpoint['iteration']

    else:
        aG.apply(weights_init)
        aD.apply(weights_init)
        itr = 0

    if params["c0"]:
        aE.apply(weights_init)
    elif params["c1"]:
        aE.load_state_dict(
            torch.load(params["calib_saved"],
                       map_location=torch.device(device)))

    one = torch.tensor(1.0).to(device)
    mone = (one * -1).to(device)

    print('loading data...')
    paths_list = [
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part1.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part2.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part3.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part4.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part5.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part6.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part7.hdf5'
    ]

    train_data = PionsDataset(paths_list, core=True)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_data, num_replicas=world_size, rank=rank)

    dataloader = DataLoader(train_data,
                            batch_size=hyperp["batch_size"],
                            num_workers=0,
                            shuffle=False,
                            drop_last=True,
                            pin_memory=True,
                            sampler=train_sampler)

    print('done')

    #scheduler_g = optim.lr_scheduler.StepLR(optimizer_g, step_size=1, gamma=params["gamma_g"])
    #scheduler_d = optim.lr_scheduler.StepLR(optimizer_d, step_size=1, gamma=params["gamma_crit"])
    #scheduler_e = optim.lr_scheduler.StepLR(optimizer_e, step_size=1, gamma=params["gamma_calib"])

    #writer = SummaryWriter()

    e_criterion = nn.L1Loss()  # for energy regressor training

    dataiter = iter(dataloader)

    BATCH_SIZE = hyperp["batch_size"]
    LATENT = hyperp["z"]
    EXP = params["exp"]
    KAPPA = hyperp["kappa"]
    LAMBD = hyperp["lambda"]
    ## Post-Processing
    LDP = hyperp["LDP"]
    wMMD = hyperp["wMMD"]
    wMSE = hyperp["wMSE"]

    ## IO paths
    OUTP = params['output_path']

    for iteration in range(50000):

        iteration += itr + 1
        #---------------------TRAIN D------------------------
        for p in aD.parameters():  # reset requires_grad
            p.requires_grad_(True)  # they are set to False below in training G

        for e in aE.parameters():  # reset requires_grad (constrainer)
            e.requires_grad_(True)  # they are set to False below in training G

        for i in range(hyperp["ncrit"]):

            aD.zero_grad()
            aE.zero_grad()

            noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))
            noise = torch.from_numpy(noise).float()
            noise = noise.view(
                -1, LATENT, 1, 1,
                1)  #[BS, nz]  --> [Bs,nz,1,1,1] Needed for Generator
            noise = noise.to(device)

            batch = next(dataiter, None)

            if batch is None:
                dataiter = iter(dataloader)
                batch = next(dataiter)

            real_label = batch['energy']  ## energy label
            real_label = real_label.to(device)

            with torch.no_grad():
                noisev = noise  # totally freeze G, training D

            fake_data = aG(noisev, real_label).detach()

            real_data = batch['shower']  # 48x48x48 calo image
            real_data = real_data.to(device)
            real_data.requires_grad_(True)

            #### supervised-training for energy regressor!
            if params["train_calib"]:
                output = aE(real_data.float())
                e_loss = e_criterion(output, real_label.view(BATCH_SIZE, 1))
                e_loss.backward()
                optimizer_e.step()

            ######

            # train with real data

            disc_real = aD(real_data.float(), real_label.float())

            # train with fake data
            fake_data = fake_data.unsqueeze(
                1)  ## transform to [BS, 1, 48, 48, 48]
            disc_fake = aD(fake_data, real_label.float())

            # train with interpolated data
            gradient_penalty = calc_gradient_penalty(aD,
                                                     real_data.float(),
                                                     fake_data,
                                                     real_label,
                                                     BATCH_SIZE,
                                                     device,
                                                     DIM=13)

            ## Wasserstein-1 distance
            w_dist = torch.mean(disc_fake) - torch.mean(disc_real)
            # final disc cost
            disc_cost = torch.mean(disc_fake) - torch.mean(
                disc_real) + LAMBD * gradient_penalty

            with amp.scale_loss(disc_cost, optimizer_d) as scaled_loss:
                scaled_loss.backward()

            optimizer_d.step()

            #--------------Log to COMET ML ----------
            if i == hyperp["ncrit"] - 1:
                experiment.log_metric("L_crit", disc_cost, step=iteration)
                experiment.log_metric("gradient_pen",
                                      gradient_penalty,
                                      step=iteration)
                experiment.log_metric("Wasserstein Dist",
                                      w_dist,
                                      step=iteration)
                if params["train_calib"]:
                    experiment.log_metric("L_const", e_loss, step=iteration)

        #---------------------TRAIN G------------------------
        for p in aD.parameters():
            p.requires_grad_(False)  # freeze D

        for c in aE.parameters():
            c.requires_grad_(False)  # freeze C

        gen_cost = None
        for i in range(hyperp["ngen"]):

            aG.zero_grad()

            noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))
            noise = torch.from_numpy(noise).float()
            noise = noise.view(
                -1, LATENT, 1, 1,
                1)  #[BS, nz]  --> [Bs,nz,1,1,1] Needed for Generator
            noise = noise.to(device)

            batch = next(dataiter, None)

            if batch is None:
                dataiter = iter(dataloader)
                batch = next(dataiter)

            real_label = batch['energy']  ## energy label
            real_label = real_label.to(device)

            noise.requires_grad_(True)

            real_data = batch['shower']  # 48x48x48 calo image
            real_data = real_data.to(device)

            fake_data = aG(noise, real_label.float())
            fake_data = fake_data.unsqueeze(
                1)  ## transform to [BS, 1, 48, 48, 48]

            ## calculate loss function
            gen_cost = aD(fake_data.float(), real_label.float())

            ## label conditioning
            #output_g = aE(fake_data)
            #output_r = aE(real_data.float())

            output_g = 0.0  #for now
            output_r = 0.0  #for now

            aux_fake = (output_g - real_label)**2
            aux_real = (output_r - real_label)**2

            aux_errG = torch.abs(aux_fake - aux_real)

            ## Total loss function for generator
            g_cost = -torch.mean(gen_cost) + KAPPA * torch.mean(aux_errG)

            with amp.scale_loss(g_cost, optimizer_g) as scaled_loss_G:
                scaled_loss_G.backward()

            optimizer_g.step()

            #--------------Log to COMET ML ----------
            experiment.log_metric("L_Gen", g_cost, step=iteration)

            ## plot example image
            if iteration % 100 == 0 or iteration == 1:
                image = fake_data.view(-1, 48, 13, 13).cpu().detach().numpy()
                cmap = mpl.cm.viridis
                cmap.set_bad('white', 1.)
                figExIm = plt.figure(figsize=(6, 6))
                axExIm1 = figExIm.add_subplot(1, 1, 1)
                image1 = np.sum(image[0], axis=0)
                masked_array1 = np.ma.array(image1, mask=(image1 == 0.0))
                im1 = axExIm1.imshow(masked_array1,
                                     filternorm=False,
                                     interpolation='none',
                                     cmap=cmap,
                                     vmin=0.01,
                                     vmax=100,
                                     norm=mpl.colors.LogNorm(),
                                     origin='lower')
                figExIm.patch.set_facecolor('white')
                axExIm1.set_xlabel('y [cells]', family='serif')
                axExIm1.set_ylabel('x [cells]', family='serif')
                figExIm.colorbar(im1)

                experiment.log_figure(figure=plt, figure_name="x-y")

                figExIm = plt.figure(figsize=(6, 6))
                axExIm2 = figExIm.add_subplot(1, 1, 1)
                image2 = np.sum(image[0], axis=1)
                masked_array2 = np.ma.array(image2, mask=(image2 == 0.0))
                im2 = axExIm2.imshow(masked_array2,
                                     filternorm=False,
                                     interpolation='none',
                                     cmap=cmap,
                                     vmin=0.01,
                                     vmax=100,
                                     norm=mpl.colors.LogNorm(),
                                     origin='lower')
                figExIm.patch.set_facecolor('white')
                axExIm2.set_xlabel('y [cells]', family='serif')
                axExIm2.set_ylabel('z [layers]', family='serif')
                figExIm.colorbar(im2)

                experiment.log_figure(figure=plt, figure_name="y-z")

                figExIm = plt.figure(figsize=(6, 6))
                axExIm3 = figExIm.add_subplot(1, 1, 1)
                image3 = np.sum(image[0], axis=2)
                masked_array3 = np.ma.array(image3, mask=(image3 == 0.0))
                im3 = axExIm3.imshow(masked_array3,
                                     filternorm=False,
                                     interpolation='none',
                                     cmap=cmap,
                                     vmin=0.01,
                                     vmax=100,
                                     norm=mpl.colors.LogNorm(),
                                     origin='lower')
                figExIm.patch.set_facecolor('white')
                axExIm3.set_xlabel('x [cells]', family='serif')
                axExIm3.set_ylabel('z [layers]', family='serif')
                figExIm.colorbar(im3)
                #experiment.log_metric("L_aux", aux_errG, step=iteration)
                experiment.log_figure(figure=plt, figure_name="x-z")

                ## E-sum monitoring

                figEsum = plt.figure(figsize=(6, 6 * 0.77 / 0.67))
                axEsum = figEsum.add_subplot(1, 1, 1)
                etot_real = getTotE(real_data.cpu().detach().numpy(),
                                    xbins=13,
                                    ybins=13)
                etot_fake = getTotE(image, xbins=13, ybins=13)

                axEsumReal = axEsum.hist(etot_real,
                                         bins=25,
                                         range=[0, 1500],
                                         weights=np.ones_like(etot_real) /
                                         (float(len(etot_real))),
                                         label="orig",
                                         color='blue',
                                         histtype='stepfilled')

                axEsumFake = axEsum.hist(etot_fake,
                                         bins=25,
                                         range=[0, 1500],
                                         weights=np.ones_like(etot_fake) /
                                         (float(len(etot_fake))),
                                         label="generated",
                                         color='red',
                                         histtype='stepfilled')

                axEsum.text(0.25,
                            0.81,
                            "WGAN",
                            horizontalalignment='left',
                            verticalalignment='top',
                            transform=axEsum.transAxes,
                            color='red')
                axEsum.text(0.25,
                            0.87,
                            'GEANT 4',
                            horizontalalignment='left',
                            verticalalignment='top',
                            transform=axEsum.transAxes,
                            color='blue')

                experiment.log_figure(figure=plt, figure_name="E-sum")

        #end = timer()
        #print(f'---train G elapsed time: {end - start}')

        if params["train_postP"]:
            #---------------------TRAIN P------------------------
            for p in aD.parameters():
                p.requires_grad_(False)  # freeze D

            for c in aG.parameters():
                c.requires_grad_(False)  # freeze G

            lossP = None
            for i in range(1):

                noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))
                noise = torch.from_numpy(noise).float()
                noise = noise.view(
                    -1, LATENT, 1, 1,
                    1)  #[BS, nz]  --> [Bs,nz,1,1,1] Needed for Generator
                noise = noise.to(device)

                batch = next(dataiter, None)

                if batch is None:
                    dataiter = iter(dataloader)
                    batch = next(dataiter)

                real_label = batch['energy']  ## energy label
                real_label = real_label.to(device)

                noise.requires_grad_(True)

                real_data = batch['shower']  # calo image
                real_data = real_data.to(device)

                ## forward pass to generator
                fake_data = aG(noise, real_label.float())
                fake_data = fake_data.unsqueeze(
                    1)  ## transform to [BS, 1, layer, size, size]

                ### first LossD_P
                fake_dataP = aP(fake_data.float(), real_label.float())
                lossD_P = aD(fake_dataP.float(), real_label.float())
                lossD_P = lossD_P.mean()

                ## lossFixP

                real_sorted = real_data.view(BATCH_SIZE, -1)
                fake_sorted = fake_dataP.view(BATCH_SIZE, -1)

                real_sorted, _ = torch.sort(real_sorted,
                                            dim=1,
                                            descending=True)  #.view(900,1)
                fake_sorted, _ = torch.sort(fake_sorted,
                                            dim=1,
                                            descending=True)  #.view(900,1)

                lossFixPp1 = mmd_hit_sortKernel(real_sorted.float(),
                                                fake_sorted,
                                                kernel_size=100,
                                                stride=50,
                                                cutoff=2000,
                                                alpha=200)

                lossFixPp2 = F.mse_loss(fake_dataP.view(BATCH_SIZE, -1),
                                        fake_data.detach().view(
                                            BATCH_SIZE, -1),
                                        reduction='mean')

                lossFixP = wMMD * lossFixPp1 + wMSE * lossFixPp2

                lossP = LDP * lossD_P - lossFixP

                lossP.backward(mone)
                optimizer_p.step()

        if iteration % 100 == 0 or iteration == 1:
            print('iteration: {}, critic loss: {}'.format(
                iteration,
                disc_cost.cpu().data.numpy()))
            if rank == 0:
                torch.save(
                    {
                        'Generator': aG.state_dict(),
                        'Critic': aD.state_dict(),
                        'G_optimizer': optimizer_g.state_dict(),
                        'D_optimizer': optimizer_d.state_dict(),
                        'iteration': iteration
                    }, OUTP + '{0}/wgan_itrs_{1}.pth'.format(EXP, iteration))
                if params["train_calib"]:
                    torch.save(
                        aE.state_dict(),
                        OUTP + '/{0}/netE_itrs_{1}.pth'.format(EXP, iteration))
                if params["train_postP"]:
                    torch.save(
                        aP.state_dict(),
                        OUTP + '{0}/netP_itrs_{1}.pth'.format(EXP, iteration))
Example #9
class CometMLMonitor(MonitorBase):
    """
    Send scalar data and the graph to https://www.comet.ml.

    Note:
        1. comet_ml requires you to `import comet_ml` before importing tensorflow or tensorpack.
        2. The "automatic output logging" feature of comet_ml will make the training progress bar appear to freeze.
           Therefore the feature is disabled by default.
    """
    def __init__(self, experiment=None, tags=None, **kwargs):
        """
        Args:
            experiment (comet_ml.Experiment): if provided, all other arguments must be left unset
            tags (list[str]): experiment tags
            kwargs: arguments used to initialize :class:`comet_ml.Experiment`,
                such as project name, API key, etc.
                Refer to its documentation for details.
        """
        if experiment is not None:
            self._exp = experiment
            assert tags is None and len(kwargs) == 0
        else:
            from comet_ml import Experiment
            kwargs.setdefault(
                'log_code', True
            )  # code logging itself does not work here, but git patch logging requires this flag
            kwargs.setdefault('auto_output_logging', None)
            self._exp = Experiment(**kwargs)
            if tags is not None:
                self._exp.add_tags(tags)

        self._exp.set_code("Code logging is impossible ...")
        self._exp.log_dependency('tensorpack', __git_version__)

    @property
    def experiment(self):
        """
        The :class:`comet_ml.Experiment` instance.
        """
        return self._exp

    def _before_train(self):
        self._exp.set_model_graph(tf.get_default_graph())

    @HIDE_DOC
    def process_scalar(self, name, val):
        self._exp.log_metric(name, val, step=self.global_step)

    @HIDE_DOC
    def process_image(self, name, val):
        self._exp.set_step(self.global_step)
        for idx, v in enumerate(val):
            log_name = "{}_step{}{}".format(
                name, self.global_step, "_" + str(idx) if len(val) > 1 else "")

            self._exp.log_image(v,
                                image_format="jpeg",
                                name=log_name,
                                image_minmax=(0, 255))

    def _after_train(self):
        self._exp.end()

    def _after_epoch(self):
        self._exp.log_epoch_end(self.epoch_num)
Example #10
import model


epochs = 100000
steps_per_epoch = 0
total_images_looked_at = 0
d_steps = 1
g_steps = 1

graph, iterator, d_train_optimizer_ops, d_stabilize_optimizer_ops, g_train_optimizer_ops, g_stabilize_optimizer_ops, samples_for_all_resolutions, sizes = model.get_graph()


experiment = Experiment(api_key='<API_KEY>', project_name='art_pgan',
                        workspace='schmidtdominik', log_code=False)
experiment.log_parameters({'G_learning_rate': model.g_learning_rate,
                           'D_learning_rate': model.d_learning_rate,
                           'D_steps': d_steps, 'G_steps': g_steps,
                           'batch_size': model.batch_size})
experiment.set_model_graph(graph)
# log each training script's source to Comet (equivalent to the original concatenation)
code_files = ['train.py', 'image_pipeline.py', 'model.py', 'discriminator.py', 'generator.py']
experiment.set_code(''.join('\n# [code]: {}\n'.format(f) + open(f, 'r').read()
                            for f in code_files))

os.makedirs('./checkpoints/', exist_ok=True)
os.makedirs('./progress_images/', exist_ok=True)

current_resolution = sizes[0]
current_mode = 'train'
last_schedule_update = 0
last_schedule_update_time = time.time()
schedule_finalized = False
Example #11
    model = model_class(**model_kwargs)
    if args.load:
        s1 = torch.load(args.load, map_location=torch.device('cpu'))
        s2 = {k.replace("module.", ""): v for k, v in s1.items()}
        model.load_state_dict(s2)

    if multi_gpu:
        model = torch_geometric.nn.DataParallel(model)

    model.to(device)

    model_fname = get_model_fname(args.dataset, model, args.n_train, args.lr, args.target)

    # need your api key in a .comet.config file: see https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables
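    # For reference, .comet.config is an INI-style file; a minimal one might look
    # like this (placeholder value -- see the Comet docs linked above):
    #   [comet]
    #   api_key = YOUR-API-KEY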
    experiment = Experiment(project_name="particleflow", disabled=args.disable_comet)
    experiment.set_model_graph(repr(model))
    experiment.log_parameters(dict(
        model_kwargs,
        **{'model': args.model, 'lr': args.lr, 'model_fname': model_fname,
           'l1': args.l1, 'l2': args.l2, 'n_train': args.n_train,
           'target': args.target, 'optimizer': args.optimizer}))
    outpath = osp.join(args.outpath, model_fname)
    if osp.isdir(outpath):
        if args.overwrite:
            print("model output {} already exists, deleting it".format(outpath))
            import shutil
            shutil.rmtree(outpath)
        else:
            print("model output {} already exists, please delete it".format(outpath))
            sys.exit(0)
    try:
        os.makedirs(outpath)
    except Exception as e:
Example #12
def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode',
        help='Select mode',
        choices=['train', 'test', 'demo'],
        default='train',
    )
    args = parser.parse_args()

    config = yaml.safe_load(open("config.yml"))

    if config['LOAD_MODEL']:
        model = DQN(
            in_channels=config['IN_CHANNELS'],
            out_dim=config['OUT_DIM'],
        )
        model_name = config['LOAD_MODEL']
        model.load_model(model_name)
    else:
        model = DQN(
            in_channels=config['IN_CHANNELS'],
            out_dim=config['OUT_DIM'],
        )

    if args.mode == 'test':
        test(
            device=config['DEVICE'],
            n_games=config['TEST_GAMES'],
            model=model,
            frame_skipping=config['FRAME_SKIPPING'],
        )
    elif args.mode == 'demo':
        demo(
            device=config['DEVICE'],
            model=model,
            frame_skipping=config['FRAME_SKIPPING'],
        )
    else:
        memory = ReplayMemory(capacity=config['N'])

        optimizer_name = config['OPTIMIZER']
        if optimizer_name == 'adam':
            optimizer = torch.optim.Adam(lr=config['LEARNING_RATE'],
                                         betas=(0.9, 0.999),
                                         eps=1e-8,
                                         amsgrad=False,
                                         params=model.model.parameters())
        elif optimizer_name == 'sgd':
            optimizer = torch.optim.SGD(lr=config['LEARNING_RATE'],
                                        momentum=0.9,
                                        params=model.model.parameters())
        else:
            raise ValueError(f'Unknown optimizer name: {optimizer_name}')

        experiment = Experiment(
            api_key=os.environ['COMET_ML_API_KEY'],
            project_name=config['COMET_ML_PROJECT_NAME'],
            workspace=config['COMET_ML_WORKSPACE'],
        )

        experiment.set_name(config['COMET_ML_NAME'])
        experiment.add_tag(config['COMET_ML_TAG'])
        experiment.log_parameters({
            'n_games': config['M'],
            'minibatch_size': config['MINIBATCH_SIZE'],
            'eps': config['EPS'],
            'eps_n_frames': config['EPS_N_FRAMES'],
            'gamma': config['GAMMA'],
            'frame_skipping': config['FRAME_SKIPPING'],
            'save_model_every': config['SAVE_MODEL_EVERY']
        })
        experiment.set_model_graph(str(model.model))

        train(
            device=config['DEVICE'],
            n_games=config['M'],
            memory=memory,
            optimizer=optimizer,
            model=model,
            experiment=experiment,
            minibatch_size=config['MINIBATCH_SIZE'],
            eps=config['EPS'],
            eps_n_frames=config['EPS_N_FRAMES'],
            gamma=config['GAMMA'],
            frame_skipping=config['FRAME_SKIPPING'],
            update_model_target_every=config['UPDATE_MODEL_TARGET_EVERY'],
            save_model_every=config['SAVE_MODEL_EVERY'],
            save_model_as=config['SAVE_MODEL_AS'],
            save_average_metrics_every=config['SAVE_AVERAGE_METRICS_EVERY'],
        )
Example #13
class ModelTrainer:
    def __init__(self, model, dataloader, args):
        self.model = model
        self.args = args
        self.data = dataloader
        self.metric = args.metric

        if (dataloader is not None):
            self.frq_log = len(dataloader['train']) // args.frq_log

        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        model.to(self.device)

        if args.optimizer == 'sgd':
            self.optimizer = optim.SGD(model.parameters(),
                                       lr=args.lr,
                                       momentum=args.momentum,
                                       weight_decay=args.weight_decay)
        elif args.optimizer == 'adam':
            self.optimizer = optim.Adam(model.parameters(),
                                        lr=args.lr,
                                        betas=(args.beta1, 0.999),
                                        weight_decay=args.weight_decay)
        else:
            raise Exception('--optimizer should be one of {sgd, adam}')

        if args.scheduler == 'set':
            self.scheduler = optim.lr_scheduler.LambdaLR(
                self.optimizer,
                lambda epoch: 10**(epoch / args.scheduler_factor))
        elif args.scheduler == 'auto':
            self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer,
                mode='min',
                factor=args.scheduler_factor,
                patience=5,
                verbose=True,
                threshold=0.0001,
                threshold_mode='rel',
                cooldown=0,
                min_lr=0,
                eps=1e-08)

        self.experiment = Experiment(api_key=args.comet_key,
                                     project_name=args.comet_project,
                                     workspace=args.comet_workspace,
                                     auto_weight_logging=True,
                                     auto_metric_logging=False,
                                     auto_param_logging=False)

        self.experiment.set_name(args.name)
        self.experiment.log_parameters(vars(args))
        self.experiment.set_model_graph(str(self.model))

    def train_one_epoch(self, epoch):

        self.model.train()
        train_loader = self.data['train']
        train_loss = 0
        correct = 0

        comet_offset = epoch * len(train_loader)

        for batch_idx, (data, target) in tqdm(enumerate(train_loader),
                                              leave=True,
                                              total=len(train_loader)):
            data, target = data.to(self.device), target.to(self.device)

            self.optimizer.zero_grad()
            output = self.model(data)
            loss = F.cross_entropy(output, target, reduction='sum')
            loss.backward()
            self.optimizer.step()

            pred = output.argmax(dim=1, keepdim=True)
            acc = pred.eq(target.view_as(pred)).sum().item()
            train_loss += loss.item()
            correct += acc

            loss = loss.item() / len(data)
            acc = 100. * acc / len(data)

            comet_step = comet_offset + batch_idx
            self.experiment.log_metric('batch_loss', loss, comet_step, epoch)
            self.experiment.log_metric('batch_acc', acc, comet_step, epoch)

            if (batch_idx + 1) % self.frq_log == 0:
                self.experiment.log_metric('log_loss', loss, comet_step, epoch)
                self.experiment.log_metric('log_acc', acc, comet_step, epoch)
                print('Epoch: {} [{}/{}]\tLoss: {:.6f}\tAcc: {:.2f}%'.format(
                    epoch + 1, (batch_idx + 1) * len(data),
                    len(train_loader.dataset), loss, acc))

        train_loss /= len(train_loader.dataset)
        acc = 100. * correct / len(train_loader.dataset)

        comet_step = comet_offset + len(train_loader) - 1
        self.experiment.log_metric('loss', train_loss, comet_step, epoch)
        self.experiment.log_metric('acc', acc, comet_step, epoch)

        print(
            'Epoch: {} [Done]\tLoss: {:.4f}\tAccuracy: {}/{} ({:.2f}%)'.format(
                epoch + 1, train_loss, correct, len(train_loader.dataset),
                acc))

        return {'loss': train_loss, 'acc': acc}

    def train(self):

        self.log_cmd()
        best = -1
        history = {'lr': [], 'train_loss': []}

        try:
            print(">> Training %s" % self.model.name)
            for epoch in range(self.args.nepoch):
                with self.experiment.train():
                    train_res = self.train_one_epoch(epoch)

                with self.experiment.validate():
                    print("\nvalidation...")
                    comet_offset = (epoch + 1) * len(self.data['train']) - 1
                    res = self.val(self.data['val'], comet_offset, epoch)

                if res[self.metric] > best:
                    best = res[self.metric]
                    self.save_weights(epoch)

                if self.args.scheduler == 'set':
                    lr = self.optimizer.param_groups[0]['lr']
                    history['lr'].append(lr)
                    history['train_loss'].append(train_res['loss'])

                    self.scheduler.step(epoch + 1)
                    lr = self.optimizer.param_groups[0]['lr']
                    print('learning rate changed to: %.10f' % lr)

                elif self.args.scheduler == 'auto':
                    self.scheduler.step(train_res['loss'])
        finally:
            print(">> Training model %s. [Stopped]" % self.model.name)
            self.experiment.log_asset_folder(os.path.join(
                self.args.outf, self.args.name, 'weights'),
                                             step=None,
                                             log_file_name=False,
                                             recursive=False)
            if self.args.scheduler == 'set':
                plt.semilogx(history['lr'], history['train_loss'])
                plt.grid(True)
                self.experiment.log_figure(figure=plt)
                plt.show()

    def val(self, val_loader, comet_offset=-1, epoch=-1):
        self.model.eval()
        test_loss = 0
        correct = 0

        labels = list(range(self.args.nclass))
        cm = np.zeros((len(labels), len(labels)))

        with torch.no_grad():
            for data, target in tqdm(val_loader,
                                     leave=True,
                                     total=len(val_loader)):
                data, target = data.to(self.device), target.to(self.device)
                output = self.model(data)
                test_loss += F.cross_entropy(output, target,
                                             reduction='sum').item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

                pred = pred.view_as(target).data.cpu().numpy()
                target = target.data.cpu().numpy()
                cm += confusion_matrix(target, pred, labels=labels)

        test_loss /= len(val_loader.dataset)
        accuracy = 100. * correct / len(val_loader.dataset)

        print('Evaluation: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.
              format(test_loss, correct, len(val_loader.dataset), accuracy))

        res = {'loss': test_loss, 'acc': accuracy}

        self.experiment.log_metrics(res, step=comet_offset, epoch=epoch)
        self.experiment.log_confusion_matrix(
            matrix=cm,
            labels=[ClassDict.getName(x) for x in labels],
            title='confusion matrix after epoch %03d' % epoch,
            file_name="confusion_matrix_%03d.json" % epoch)

        return res

    def test(self):
        self.load_weights()
        with self.experiment.test():
            print('\ntesting....')
            res = self.val(self.data['test'])

    def log_cmd(self):
        d = vars(self.args)
        cmd = '!python main.py \\\n'
        tab = '    '

        for k, v in d.items():
            if v is None or v == '' or (isinstance(v, bool) and v is False):
                continue

            if isinstance(v, bool):
                arg = '--{} \\\n'.format(k)
            else:
                arg = '--{} {} \\\n'.format(k, v)

            cmd = cmd + tab + arg

        # print(cmd);
        self.experiment.log_text(cmd)

    def save_weights(self, epoch: int):

        weight_dir = os.path.join(self.args.outf, self.args.name, 'weights')
        if not os.path.exists(weight_dir):
            os.makedirs(weight_dir)

        torch.save({
            'epoch': epoch,
            'state_dict': self.model.state_dict()
        }, os.path.join(weight_dir, 'model.pth'))

    def load_weights(self):

        path_g = self.args.weights_path

        if path_g is None:
            weight_dir = os.path.join(self.args.outf, self.args.name,
                                      'weights')
            path_g = os.path.join(weight_dir, 'model.pth')

        print('>> Loading weights...')
        weights_g = torch.load(path_g, map_location=self.device)['state_dict']
        self.model.load_state_dict(weights_g)
        print('   Done.')

    def predict(self, x):
        x = x / 2**15
        self.model.eval()
        with torch.no_grad():
            x = torch.from_numpy(x).float()
            x = self.transform(x)
            x = x.unsqueeze(0)
            x = self.model(x)
            x = F.softmax(x, dim=1)
            x = x.numpy()
        return x
Example #14
if args.resume:
    # Resume from a snapshot
    chainer.serializers.load_npz(args.resume, trainer)

# Get confusion matrix picture before training:
log_confusion_matrix(experiment, model, trainer, 0, 0)

# Run the training
trainer.run()

# Report created images to comet.ml:
## If you want to include a graph made by chainer, you can:
#if args.plot and extensions.PlotReport.available():
#    experiment.log_image('result/loss.png')
#    experiment.log_image('result/accuracy.png')

# Report the graph, as dot language:
(graph, ) = pydot.graph_from_dot_file('result/cg.dot')
graph.write_png('result/cg.png')
experiment.log_image('result/cg.png')
with open("result/cg.dot") as fp:
    desc = fp.readlines()
    experiment.set_model_graph("\n".join(desc))

# Report a URL:
experiment.log_html_url(
    "https://github.com/chainer/chainer/"
    "blob/master/examples/mnist/train_mnist.py",
    label="This MNIST example is based on")
Example #15
def main(cfg: DictConfig):
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    # Random Seed
    seed_everything(cfg.train.seed)

    # Model  ####################################################################
    net = ENet(model_name=cfg.train.model_name)
    transform = ImageTransform(img_size=cfg.data.img_size)

    # Comet.ml
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
                            project_name=cfg.comet_ml.project_name)
    # Log Parameters
    experiment.log_parameters(dict(cfg.exp))
    experiment.log_parameters(dict(cfg.data))
    experiment.log_parameters(dict(cfg.train))
    # Log Model Graph
    experiment.set_model_graph(str(net))

    # Lightning Module  #########################################################
    model = LightningSystem(net, cfg, experiment)
    datamodule = DataModule(data_dir, cfg, transform, cv)

    checkpoint_callback = ModelCheckpoint(filepath='./checkpoint',
                                          save_top_k=1,
                                          verbose=True,
                                          monitor='avg_val_loss',
                                          mode='min',
                                          prefix=cfg.exp.exp_name + '_')

    trainer = Trainer(logger=False,
                      max_epochs=cfg.train.epoch,
                      checkpoint_callback=checkpoint_callback,
                      gpus=1)

    # Train & Test  ############################################################
    # Train
    trainer.fit(model, datamodule=datamodule)
    experiment.log_metric('best_auc', model.best_auc)
    checkpoint_path = glob.glob(f'./checkpoint/{cfg.exp.exp_name}_*.ckpt')[0]
    experiment.log_asset(file_data=checkpoint_path)

    # Test
    for i in range(test_num):
        trainer.test(model)

    # Submit
    sub_list = glob.glob(f'submission_{cfg.exp.exp_name}*.csv')
    _ = summarize_submit(sub_list,
                         experiment,
                         filename=f'sub_{cfg.exp.exp_name}.csv')

    # oof
    oof_dataset = datamodule.oof_dataset
    oof_dataloader = DataLoader(oof_dataset,
                                batch_size=cfg.train.batch_size,
                                pin_memory=False,
                                shuffle=False,
                                drop_last=False)
    for i in range(10):
        trainer.test(model, test_dataloaders=oof_dataloader)

    # Submit
    sub_list = glob.glob('submission*.csv')
    _ = summarize_submit(sub_list,
                         experiment,
                         filename=f'oof_{cfg.exp.exp_name}.csv')
Example #16
# Gradient updates
updates = minimizer.apply_gradients(clipped_grads_and_vars)


def one_hot(v):
    return np.eye(vocab_size)[v]


# begin training
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

#log model graph
exp.set_model_graph(sess.graph)
# Initial values
MAXITERS = 500000
n, p = 0, 0
hprev_val = np.zeros([1, hidden_size])

while (n < MAXITERS):
    # Initialize
    if p + seq_length + 1 >= len(data) or n == 0:
        hprev_val = np.zeros([1, hidden_size])
        p = 0  # reset

    # Prepare inputs
    input_vals = [char_to_ix[ch] for ch in data[p:p + seq_length]]
    target_vals = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]
Example #17
def main(_):
    experiment = Experiment(api_key="xXtJguCo8yFdU7dpjEpo6YbHw",
                            project_name=args.experiment_name)
    hyper_params = {
        "learning_rate": args.lr,
        "num_epochs": args.max_epoch,
        "batch_size": args.single_batch_size,
        "alpha": args.alpha,
        "beta": args.beta,
        "gamma": args.gamma,
        "loss": args.loss
    }
    experiment.log_multiple_params(hyper_params)

    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={
                "GPU": cfg.GPU_USE_COUNT,
            },
            allow_soft_placement=True,
            log_device_placement=False,
        )
        with tf.Session(config=config) as sess:
            # sess=tf_debug.LocalCLIDebugWrapperSession(sess,ui_type='readline')
            model = RPN3D(cls=cfg.DETECT_OBJ,
                          single_batch_size=args.single_batch_size,
                          learning_rate=args.lr,
                          max_gradient_norm=5.0,
                          alpha=args.alpha,
                          beta=args.beta,
                          gamma=args.gamma,
                          loss_type=args.loss,
                          avail_gpus=cfg.GPU_AVAILABLE.split(','))
            # param init/restore
            if tf.train.get_checkpoint_state(save_model_dir):
                print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess,
                                    tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            # train and validate
            is_summary, is_summary_image, is_validate = False, False, False

            summary_interval = 5
            summary_val_interval = 10
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            experiment.set_model_graph(sess.graph)

            # training
            with experiment.train():
                for epoch in range(start_epoch, args.max_epoch):
                    counter = 0
                    batch_time = time.time()
                    experiment.log_current_epoch(epoch)

                    for batch in iterate_data(
                            train_dir,
                            shuffle=True,
                            aug=True,
                            is_testset=False,
                            batch_size=args.single_batch_size *
                            cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT):

                        counter += 1
                        global_counter += 1
                        experiment.set_step(global_counter)
                        if counter % summary_interval == 0:
                            is_summary = True
                        else:
                            is_summary = False
                        epochs = args.max_epoch
                        start_time = time.time()
                        ret = model.train_step(sess,
                                               batch,
                                               train=True,
                                               summary=is_summary)
                        forward_time = time.time() - start_time
                        batch_time = time.time() - batch_time
                        param = ret
                        params = {
                            "loss": param[0],
                            "cls_loss": param[1],
                            "cls_pos_loss": param[2],
                            "cls_neg_loss": param[3]
                        }
                        experiment.log_multiple_metrics(params)
                        # print(ret)
                        print(
                            'train: {} @ epoch:{}/{} loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'
                            .format(counter, epoch, epochs, ret[0], ret[1],
                                    ret[2], ret[3], forward_time, batch_time))
                        # with open('log/train.txt', 'a') as f:
                        # f.write( 'train: {} @ epoch:{}/{} loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'.format(counter,epoch, epochs, ret[0], ret[1], ret[2], ret[3], forward_time, batch_time))

                        #print(counter, summary_interval, counter % summary_interval)
                        if counter % summary_interval == 0:
                            print("summary_interval now")
                            summary_writer.add_summary(ret[-1], global_counter)

                        #print(counter, summary_val_interval, counter % summary_val_interval)
                        if counter % summary_val_interval == 0:
                            print("summary_val_interval now")
                            batch = sample_test_data(
                                val_dir,
                                args.single_batch_size * cfg.GPU_USE_COUNT,
                                multi_gpu_sum=cfg.GPU_USE_COUNT)

                            ret = model.validate_step(sess,
                                                      batch,
                                                      summary=True)
                            summary_writer.add_summary(ret[-1], global_counter)

                            try:
                                ret = model.predict_step(sess,
                                                         batch,
                                                         summary=True)
                                summary_writer.add_summary(
                                    ret[-1], global_counter)
                            except Exception:
                                print("prediction skipped due to error")

                        if check_if_should_pause(args.tag):
                            model.saver.save(sess,
                                             os.path.join(
                                                 save_model_dir, 'checkpoint'),
                                             global_step=model.global_step)
                            print('pause and save model @ {} steps:{}'.format(
                                save_model_dir, model.global_step.eval()))
                            sys.exit(0)

                        batch_time = time.time()
                    experiment.log_epoch_end(epoch)
                    sess.run(model.epoch_add_op)

                    model.saver.save(sess,
                                     os.path.join(save_model_dir,
                                                  'checkpoint'),
                                     global_step=model.global_step)

                    # dump test data every 10 epochs
                    if (epoch + 1) % 10 == 0:
                        # create output folder
                        os.makedirs(os.path.join(args.output_path, str(epoch)),
                                    exist_ok=True)
                        os.makedirs(os.path.join(args.output_path, str(epoch),
                                                 'data'),
                                    exist_ok=True)
                        if args.vis:
                            os.makedirs(os.path.join(args.output_path,
                                                     str(epoch), 'vis'),
                                        exist_ok=True)

                        for batch in iterate_data(
                                val_dir,
                                shuffle=False,
                                aug=False,
                                is_testset=False,
                                batch_size=args.single_batch_size *
                                cfg.GPU_USE_COUNT,
                                multi_gpu_sum=cfg.GPU_USE_COUNT):

                            if args.vis:
                                tags, results, front_images, bird_views, heatmaps = model.predict_step(
                                    sess, batch, summary=False, vis=True)
                            else:
                                tags, results = model.predict_step(
                                    sess, batch, summary=False, vis=False)

                            for tag, result in zip(tags, results):
                                of_path = os.path.join(args.output_path,
                                                       str(epoch), 'data',
                                                       tag + '.txt')
                                with open(of_path, 'w+') as f:
                                    labels = box3d_to_label(
                                        [result[:, 1:8]], [result[:, 0]],
                                        [result[:, -1]],
                                        coordinate='lidar')[0]
                                    for line in labels:
                                        f.write(line)
                                    print('write out {} objects to {}'.format(
                                        len(labels), tag))
                            # dump visualizations
                            if args.vis:
                                for tag, front_image, bird_view, heatmap in zip(
                                        tags, front_images, bird_views,
                                        heatmaps):
                                    front_img_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_front.jpg')
                                    bird_view_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_bv.jpg')
                                    heatmap_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_heatmap.jpg')
                                    cv2.imwrite(front_img_path, front_image)
                                    cv2.imwrite(bird_view_path, bird_view)
                                    cv2.imwrite(heatmap_path, heatmap)

                        # execute evaluation code
                        cmd_1 = "./kitti_eval/launch_test.sh"
                        cmd_2 = os.path.join(args.output_path, str(epoch))
                        cmd_3 = os.path.join(args.output_path, str(epoch),
                                             'log')
                        os.system(" ".join([cmd_1, cmd_2, cmd_3]))

            print('train done. total epoch:{} iter:{}'.format(
                epoch, model.global_step.eval()))

            # finally save the model
            model.saver.save(sess,
                             os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
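
# A minimal sketch (with hypothetical helpers `build_model`, `next_batch`, and
# `save_dir`) of the restore-or-initialize checkpoint pattern combined with
# per-step Comet logging, as used in the example above.
from comet_ml import Experiment
import tensorflow as tf


def train_loop(build_model, next_batch, save_dir, max_steps=1000):
    experiment = Experiment()  # API key is read from config / COMET_API_KEY
    loss_op, train_op = build_model()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        if tf.train.get_checkpoint_state(save_dir):
            # resume: load the latest checkpoint found in save_dir
            saver.restore(sess, tf.train.latest_checkpoint(save_dir))
        else:
            # fresh run: initialize all variables
            sess.run(tf.global_variables_initializer())
        for step in range(1, max_steps + 1):
            experiment.set_step(step)  # later logs are tagged with this step
            loss, _ = sess.run([loss_op, train_op], feed_dict=next_batch())
            experiment.log_metric("loss", loss)
        saver.save(sess, save_dir + "/checkpoint", global_step=max_steps)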
Beispiel #18
0
def main():
    opt = parse_option()

    print(pp.pformat(vars(opt)))

    train_partition = "trainval" if opt.use_trainval else "train"
    if opt.dataset == "miniImageNet":
        train_trans, test_trans = transforms_options[opt.transform]

        if opt.augment == "none":
            train_train_trans = train_test_trans = test_trans
        elif opt.augment == "all":
            train_train_trans = train_test_trans = train_trans
        elif opt.augment == "spt":
            train_train_trans = train_trans
            train_test_trans = test_trans
        elif opt.augment == "qry":
            train_train_trans = test_trans
            train_test_trans = train_trans

        print("spt trans")
        print(train_train_trans)
        print("qry trans")
        print(train_test_trans)

        sub_batch_size, rmd = divmod(opt.batch_size, opt.apply_every)
        assert rmd == 0
        print("Train sub batch-size:", sub_batch_size)

        meta_train_dataset = MetaImageNet(
            args=opt,
            partition="train",
            train_transform=train_train_trans,
            test_transform=train_test_trans,
            fname="miniImageNet_category_split_train_phase_%s.pickle",
            fix_seed=False,
            n_test_runs=10000000,  # big number to never stop
            new_labels=False,
        )
        meta_trainloader = DataLoader(
            meta_train_dataset,
            batch_size=sub_batch_size,
            shuffle=True,
            drop_last=True,
            num_workers=opt.num_workers,
            pin_memory=True,
        )
        meta_train_dataset_qry = MetaImageNet(
            args=opt,
            partition="train",
            train_transform=train_train_trans,
            test_transform=train_test_trans,
            fname="miniImageNet_category_split_train_phase_%s.pickle",
            fix_seed=False,
            n_test_runs=10000000,  # big number to never stop
            new_labels=False,
            n_ways=opt.n_qry_way,
            n_shots=opt.n_qry_shot,
            n_queries=0,
        )
        meta_trainloader_qry = DataLoader(
            meta_train_dataset_qry,
            batch_size=sub_batch_size,
            shuffle=True,
            drop_last=True,
            num_workers=opt.num_workers,
            pin_memory=True,
        )
        meta_val_dataset = MetaImageNet(
            args=opt,
            partition="val",
            train_transform=test_trans,
            test_transform=test_trans,
            fix_seed=False,
            n_test_runs=200,
            n_ways=5,
            n_shots=5,
            n_queries=15,
        )
        meta_valloader = DataLoader(
            meta_val_dataset,
            batch_size=opt.test_batch_size,
            shuffle=False,
            drop_last=False,
            num_workers=opt.num_workers,
            pin_memory=True,
        )
        val_loader = DataLoader(
            ImageNet(args=opt, partition="val", transform=test_trans),
            batch_size=opt.sup_val_batch_size,
            shuffle=False,
            drop_last=False,
            num_workers=opt.num_workers,
            pin_memory=True,
        )
        # if opt.use_trainval:
        #     n_cls = 80
        # else:
        #     n_cls = 64
        n_cls = len(meta_train_dataset.classes)

    print(n_cls)

    # x_spt, y_spt, x_qry, y_qry = next(iter(meta_trainloader))

    # x_spt2, y_spt2, x_qry2, y_qry2 = next(iter(meta_trainloader_qry))

    # print(x_spt, y_spt, x_qry, y_qry)
    # print(x_spt2, y_spt2, x_qry2, y_qry2)
    # print(x_spt.shape, y_spt.shape, x_qry.shape, y_qry.shape)
    # print(x_spt2.shape, y_spt2.shape, x_qry2.shape, y_qry2.shape)

    model = create_model(
        opt.model,
        n_cls,
        opt.dataset,
        opt.drop_rate,
        opt.dropblock,
        opt.track_stats,
        opt.initializer,
        opt.weight_norm,
        activation=opt.activation,
        normalization=opt.normalization,
    )

    print(model)

    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        print(torch.cuda.get_device_name())
        device = torch.device("cuda")
        # if opt.n_gpu > 1:
        #     model = nn.DataParallel(model)
        model = model.to(device)
        criterion = criterion.to(device)
        cudnn.benchmark = True
    else:
        device = torch.device("cpu")

    print("Learning rate")
    print(opt.learning_rate)
    print("Inner Learning rate")
    print(opt.inner_lr)
    if opt.learn_lr:
        print("Optimizing learning rate")
    inner_lr = nn.Parameter(torch.tensor(opt.inner_lr),
                            requires_grad=opt.learn_lr)
    optimizer = torch.optim.Adam(
        (list(model.parameters()) + [inner_lr]) if opt.learn_lr
        else model.parameters(),
        lr=opt.learning_rate,
    )
    # classifier = model.classifier()
    inner_opt = torch.optim.SGD(
        model.classifier.parameters(),
        lr=opt.inner_lr,
    )
    logger = SummaryWriter(logdir=opt.tb_folder,
                           flush_secs=10,
                           comment=opt.model_name)
    comet_logger = Experiment(
        api_key=os.environ["COMET_API_KEY"],
        project_name=opt.comet_project_name,
        workspace=opt.comet_workspace,
        disabled=not opt.logcomet,
        auto_metric_logging=False,
    )
    comet_logger.set_name(opt.model_name)
    comet_logger.log_parameters(vars(opt))
    comet_logger.set_model_graph(str(model))

    if opt.cosine:
        eta_min = opt.learning_rate * opt.cosine_factor
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.num_steps, eta_min, -1)

    # routine: supervised pre-training
    data_sampler = iter(meta_trainloader)
    data_sampler_qry = iter(meta_trainloader_qry)
    pbar = tqdm(
        range(1, opt.num_steps + 1),
        miniters=opt.print_freq,
        mininterval=3,
        maxinterval=30,
        ncols=0,
    )
    best_val_acc = 0.0
    for step in pbar:

        if not opt.cosine:
            adjust_learning_rate(step, opt, optimizer)
        # print("==> training...")

        time1 = time.time()

        foa = 0.0
        fol = 0.0
        ioa = 0.0
        iil = 0.0
        fil = 0.0
        iia = 0.0
        fia = 0.0
        for j in range(opt.apply_every):

            x_spt, y_spt, x_qry, y_qry = [
                t.to(device) for t in next(data_sampler)
            ]
            x_qry2, y_qry2, _, _ = [
                t.to(device) for t in next(data_sampler_qry)
            ]
            y_spt = y_spt.flatten(1)
            y_qry2 = y_qry2.flatten(1)

            x_qry = torch.cat((x_spt, x_qry, x_qry2), 1)
            y_qry = torch.cat((y_spt, y_qry, y_qry2), 1)

            if step == 1 and j == 0:
                print(x_spt.size(), y_spt.size(), x_qry.size(), y_qry.size())

            info = train_step(
                model,
                model.classifier,
                None,
                # inner_opt,
                inner_lr,
                x_spt,
                y_spt,
                x_qry,
                y_qry,
                reset_head=opt.reset_head,
                num_steps=opt.num_inner_steps,
            )

            _foa = info["foa"] / opt.batch_size
            _fol = info["fol"] / opt.batch_size
            _ioa = info["ioa"] / opt.batch_size
            _iil = info["iil"] / opt.batch_size
            _fil = info["fil"] / opt.batch_size
            _iia = info["iia"] / opt.batch_size
            _fia = info["fia"] / opt.batch_size

            _fol.backward()

            foa += _foa.detach()
            fol += _fol.detach()
            ioa += _ioa.detach()
            iil += _iil.detach()
            fil += _fil.detach()
            iia += _iia.detach()
            fia += _fia.detach()

        optimizer.step()
        optimizer.zero_grad()
        inner_lr.data.clamp_(min=0.001)

        if opt.cosine:
            scheduler.step()
        if (step == 1) or (step % opt.eval_freq == 0):
            val_info = test_run(
                iter(meta_valloader),
                model,
                model.classifier,
                torch.optim.SGD(model.classifier.parameters(),
                                lr=inner_lr.item()),
                num_inner_steps=opt.num_inner_steps_test,
                device=device,
            )
            val_acc_feat, val_std_feat = meta_test(
                model,
                meta_valloader,
                use_logit=False,
            )

            val_acc = val_info["outer"]["acc"].cpu()
            val_loss = val_info["outer"]["loss"].cpu()

            sup_acc, sup_acc_top5, sup_loss = validate(
                val_loader,
                model,
                criterion,
                print_freq=100000000,
            )
            sup_acc = sup_acc.item()
            sup_acc_top5 = sup_acc_top5.item()

            print(f"\nValidation step {step}")
            print(f"MAML 5-way-5-shot accuracy: {val_acc.item()}")
            print(f"LR 5-way-5-shot accuracy: {val_acc_feat}+-{val_std_feat}")
            print(
                f"Supervised accuracy: Acc@1: {sup_acc} Acc@5: {sup_acc_top5} Loss: {sup_loss}"
            )

            if val_acc_feat > best_val_acc:
                best_val_acc = val_acc_feat
                print(
                    f"New best validation accuracy {best_val_acc.item()} saving checkpoints\n"
                )
                # print(val_acc.item())

                torch.save(
                    {
                        "opt": opt,
                        "model": model.state_dict()
                        if opt.n_gpu <= 1 else model.module.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "step": step,
                        "val_acc": val_acc,
                        "val_loss": val_loss,
                        "val_acc_lr": val_acc_feat,
                        "sup_acc": sup_acc,
                        "sup_acc_top5": sup_acc_top5,
                        "sup_loss": sup_loss,
                    },
                    os.path.join(opt.save_folder,
                                 "{}_best.pth".format(opt.model)),
                )

            comet_logger.log_metrics(
                dict(
                    fol=val_loss,
                    foa=val_acc,
                    acc_lr=val_acc_feat,
                    sup_acc=sup_acc,
                    sup_acc_top5=sup_acc_top5,
                    sup_loss=sup_loss,
                ),
                step=step,
                prefix="val",
            )

            logger.add_scalar("val_acc", val_acc, step)
            logger.add_scalar("val_loss", val_loss, step)
            logger.add_scalar("val_acc_lr", val_acc_feat, step)
            logger.add_scalar("sup_acc", sup_acc, step)
            logger.add_scalar("sup_acc_top5", sup_acc_top5, step)
            logger.add_scalar("sup_loss", sup_loss, step)

        if (step == 1) or (step % opt.eval_freq == 0) or (step % opt.print_freq
                                                          == 0):

            tfol = fol.cpu()
            tfoa = foa.cpu()
            tioa = ioa.cpu()
            tiil = iil.cpu()
            tfil = fil.cpu()
            tiia = iia.cpu()
            tfia = fia.cpu()

            comet_logger.log_metrics(
                dict(
                    fol=tfol,
                    foa=tfoa,
                    ioa=tioa,
                    iil=tiil,
                    fil=tfil,
                    iia=tiia,
                    fia=tfia,
                ),
                step=step,
                prefix="train",
            )

            logger.add_scalar("train_acc", tfoa.item(), step)
            logger.add_scalar("train_loss", tfol.item(), step)
            logger.add_scalar("train_ioa", tioa, step)
            logger.add_scalar("train_iil", tiil, step)
            logger.add_scalar("train_fil", tfil, step)
            logger.add_scalar("train_iia", tiia, step)
            logger.add_scalar("train_fia", tfia, step)

            pbar.set_postfix(
                # iol=f"{info['iol'].item():.2f}",
                fol=f"{tfol.item():.2f}",
                # ioa=f"{info['ioa'].item():.2f}",
                foa=f"{tfoa.item():.2f}",
                ioa=f"{tioa.item():.2f}",
                iia=f"{tiia.item():.2f}",
                fia=f"{tfia.item():.2f}",
                vl=f"{val_loss.item():.2f}",
                va=f"{val_acc.item():.2f}",
                valr=f"{val_acc_feat:.2f}",
                lr=f"{inner_lr.item():.4f}",
                vsa=f"{sup_acc:.2f}",
                # iil=f"{info['iil'].item():.2f}",
                # fil=f"{info['fil'].item():.2f}",
                # iia=f"{info['iia'].item():.2f}",
                # fia=f"{info['fia'].item():.2f}",
                # counter=info["counter"],
                refresh=True,
            )

    # save the last model
    state = {
        "opt": opt,
        "model": model.state_dict() if opt.n_gpu <= 1 else model.module.state_dict(),
        "optimizer": optimizer.state_dict(),
        "step": step,
    }
    save_file = os.path.join(opt.save_folder, "{}_last.pth".format(opt.model))
    torch.save(state, save_file)
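
# A minimal, self-contained sketch of the prefixed metric logging used above:
# one Experiment receives both training and validation metrics, keyed by step.
# The project name and the numbers below are placeholders, not results from the
# example.
import os
from comet_ml import Experiment

experiment = Experiment(
    api_key=os.environ.get("COMET_API_KEY"),
    project_name="example-project",   # hypothetical project name
    disabled=True,                    # flip to False to actually send data
)

for step in range(1, 4):
    # metrics logged with prefix="train" are namespaced under that prefix in Comet
    experiment.log_metrics({"loss": 1.0 / step, "acc": 0.5 + 0.1 * step},
                           step=step, prefix="train")
    if step % 2 == 0:
        experiment.log_metrics({"loss": 1.2 / step, "acc": 0.4 + 0.1 * step},
                               step=step, prefix="val")
experiment.end()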
Beispiel #19
0
def train(defparams, hyper):
    
    params = dict(defparams)
    hyperp = dict(hyper)

    experiment = Experiment(api_key="keGmeIz4GfKlQZlOP6cit4QOi",
                        project_name="hadron-shower", workspace="engineren")
    experiment.add_tag(params['exp'])

    experiment.log_parameters(hyperp)


    device = torch.device("cuda")       
    torch.manual_seed(params["seed"])

       
    aD = DCGAN_D(hyperp["ndf"]).to(device)
    aG = DCGAN_G(hyperp["ngf"], hyperp["z"]).to(device)
    aE = energyRegressor().to(device)
    #aP = PostProcess_Size1Conv_EcondV2(30, 3, 128, bias=True, out_funct='none').to(device)

    experiment.set_model_graph(str(aG))
    experiment.set_model_graph(str(aD))

    ## no need for post processing now
    #if params["restore_pp"]:
    #    aP.load_state_dict(torch.load(params["restore_path_PP"] + params["post_saved"], map_location=torch.device(device)))

    if params["restore"]:   
        aG.load_state_dict(torch.load(params["restore_path"] + params["gen_saved"], map_location=torch.device(device)))
        aD.load_state_dict(torch.load(params["restore_path"] + params["crit_saved"], map_location=torch.device(device)))
        
    else:
        aG.apply(weights_init)
        aD.apply(weights_init)

    if params["c0"]: 
        aE.apply(weights_init)
    elif params["c1"] :
        aE.load_state_dict(torch.load(params["calib_saved"], map_location=torch.device(device)))
    
    
    
    one = torch.tensor(1.0).to(device)
    mone = (one * -1).to(device)



    print('loading data...')
    paths_list = [
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part1.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part2.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part3.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part4.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part5.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part6.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part7.hdf5'
    ]


    train_data = PionsDataset(paths_list)
    dataloader = DataLoader(train_data, shuffle=True, batch_size=hyperp["batch_size"], num_workers=10)
    print('done')


    optimizer_g = torch.optim.Adam(aG.parameters(), lr=hyperp["L_gen"], betas=(0.5, 0.9))
    optimizer_d = torch.optim.Adam(aD.parameters(), lr=hyperp["L_crit"], betas=(0.5, 0.9))
    optimizer_e = torch.optim.SGD(aE.parameters(), lr=hyperp["L_calib"])
    #optimizer_p = torch.optim.Adam(aP.parameters(), lr=params["L_post"], betas=(0.5, 0.9))


    #scheduler_g = optim.lr_scheduler.StepLR(optimizer_g, step_size=1, gamma=params["gamma_g"])
    #scheduler_d = optim.lr_scheduler.StepLR(optimizer_d, step_size=1, gamma=params["gamma_crit"])
    #scheduler_e = optim.lr_scheduler.StepLR(optimizer_e, step_size=1, gamma=params["gamma_calib"])
   
    
    #writer = SummaryWriter()

    e_criterion = nn.L1Loss() # for energy regressor training

    dataiter = iter(dataloader)
    
    BATCH_SIZE = hyperp["batch_size"]
    LATENT = hyperp["z"]
    EXP = params["exp"]
    KAPPA = hyperp["kappa"]
    LAMBD = hyperp["lambda"]
    ## Post-Processing 
    LDP = hyperp["LDP"]
    wMMD = hyperp["wMMD"]
    wMSE = hyperp["wMSE"]

    ## IO paths
    OUTP = params['output_path']

    for iteration in range(1, 75000):
        
        #---------------------TRAIN D------------------------
        for p in aD.parameters():  # reset requires_grad
            p.requires_grad_(True)  # they are set to False below in training G
        
        for e in aE.parameters():  # reset requires_grad (constrainer)
            e.requires_grad_(True)  # they are set to False below in training G


        for i in range(hyperp["ncrit"]):
            
            aD.zero_grad()
            aE.zero_grad()
            
            noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))    
            noise = torch.from_numpy(noise).float()
            noise = noise.view(-1, LATENT, 1, 1, 1)    #[BS, nz]  --> [Bs,nz,1,1,1] Needed for Generator
            noise = noise.to(device)
            
            batch = next(dataiter, None)

            if batch is None:
                dataiter = iter(dataloader)
                batch = next(dataiter)

            real_label = batch['energy'] ## energy label
            real_label = real_label.to(device)
           

            with torch.no_grad():
                noisev = noise  # totally freeze G, training D
            
            fake_data = aG(noisev, real_label).detach()
            

            real_data = batch['shower'] # 48x48x48 calo image
            real_data = real_data.to(device)
            real_data.requires_grad_(True)

        
            

            #### supervised-training for energy regressor!
            if params["train_calib"] :
                output = aE(real_data.float())
                e_loss = e_criterion(output, real_label.view(BATCH_SIZE, 1))
                e_loss.backward()
                optimizer_e.step()

            ######

            # train with real data
            
            disc_real = aD(real_data.float(), real_label.float())
        

            # train with fake data
            fake_data = fake_data.unsqueeze(1)  ## add channel dim: [BS, 48, 48, 48] -> [BS, 1, 48, 48, 48]
            disc_fake = aD(fake_data, real_label.float())

            
            # train with interpolated data
            gradient_penalty = calc_gradient_penalty(aD, real_data, fake_data, real_label, BATCH_SIZE, device, DIM=48)
            
            ## Wasserstein-1 distance
            w_dist = torch.mean(disc_fake) - torch.mean(disc_real)
            # final disc cost
            disc_cost = torch.mean(disc_fake) - torch.mean(disc_real) + LAMBD * gradient_penalty
            disc_cost.backward()
            optimizer_d.step()
            
            #--------------Log to COMET ML ----------
            if i == hyperp["ncrit"]-1:
                experiment.log_metric("L_crit", disc_cost, step=iteration)
                experiment.log_metric("gradient_pen", gradient_penalty, step=iteration)
                experiment.log_metric("Wasserstein Dist", w_dist, step=iteration)
                if params["train_calib"]:
                    experiment.log_metric("L_const", e_loss, step=iteration)
        
        #---------------------TRAIN G------------------------
        for p in aD.parameters():
            p.requires_grad_(False)  # freeze D
        
        for c in aE.parameters():
            c.requires_grad_(False)  # freeze C

        gen_cost = None
        for i in range(hyperp["ngen"]):
            
            aG.zero_grad()
            
            
            noise = np.random.uniform(-1,1, (BATCH_SIZE, LATENT))
            noise = torch.from_numpy(noise).float()
            noise = noise.view(-1, LATENT, 1, 1, 1) #[BS, nz]  --> [Bs,nz,1,1,1] Needed for Generator
            noise = noise.to(device)


            batch = next(dataiter, None)

            if batch is None:
                dataiter = iter(dataloader)
                batch = next(dataiter)
            
            real_label = batch['energy'] ## energy label
            real_label = real_label.to(device)
            
            
            noise.requires_grad_(True)

            
            real_data = batch['shower'] # 48x48x48 calo image
            real_data = real_data.to(device)

            fake_data = aG(noise, real_label.float())
            fake_data = fake_data.unsqueeze(1)  ## add channel dim: [BS, 48, 48, 48] -> [BS, 1, 48, 48, 48]
            
            
            ## calculate loss function 
            gen_cost = aD(fake_data.float(), real_label.float())
            
            ## label conditioning
            output_g = aE(fake_data)
            output_r = aE(real_data.float())


            aux_fake = (output_g - real_label)**2
            aux_real = (output_r - real_label)**2
            
            aux_errG = torch.abs(aux_fake - aux_real)
            
            ## Total loss function for generator
            g_cost = -torch.mean(gen_cost) + KAPPA*torch.mean(aux_errG) 
            g_cost.backward()
            optimizer_g.step()

            #--------------Log to COMET ML ----------
            experiment.log_metric("L_Gen", g_cost, step=iteration)
            #experiment.log_metric("L_aux", aux_errG, step=iteration)

        #end = timer()
        #print(f'---train G elapsed time: {end - start}')

        if params["train_postP"]:
            #---------------------TRAIN P------------------------
            for p in aD.parameters():
                p.requires_grad_(False)  # freeze D
            
            for c in aG.parameters():
                c.requires_grad_(False)  # freeze G

            lossP = None
            for i in range(1):
                
                noise = np.random.uniform(-1,1, (BATCH_SIZE, LATENT))
                noise = torch.from_numpy(noise).float()
                noise = noise.view(-1, LATENT, 1, 1, 1) #[BS, nz]  --> [Bs,nz,1,1,1] Needed for Generator
                noise = noise.to(device)


                batch = next(dataiter, None)

                if batch is None:
                    dataiter = iter(dataloader)
                    batch = next(dataiter)

                real_label = batch[1] ## energy label
                real_label = real_label.unsqueeze(-1)  ## transform to [Bs, 1 ]
                real_label = real_label.to(device)
                real_label = real_label.view(-1, 1, 1, 1, 1)  #[BS,1] ---> [BS,1,1,1,1]  Needed for Generator
                noise.requires_grad_(True)

                real_data = batch[0] # 30x30x30 calo layers
                real_data = real_data.unsqueeze(1) #transform to [Bs, 1, 30, 30, 30 ]
                real_data = real_data.to(device)

                fake_data = aG(noise, real_label.float())
                            
                real_label = real_label.view(BATCH_SIZE, 1)   ## transform back : [BS,1,1,1,1]  -- > [BS,1]
                fake_data = fake_data.unsqueeze(1)  ## transform to [BS, 1, 30, 30, 30]
                
                ### first LossD_P
                fake_dataP = aP(fake_data.float(), real_label.float())
                lossD_P = aD(fake_dataP.float(), real_label.float())
                lossD_P = lossD_P.mean()

                ## lossFixP

                real_sorted = real_data.view(BATCH_SIZE, -1)
                fake_sorted = fake_dataP.view(BATCH_SIZE, -1)
                
                real_sorted, _ = torch.sort(real_sorted, dim=1, descending=True) #.view(900,1)
                fake_sorted, _ = torch.sort(fake_sorted, dim=1, descending=True) #.view(900,1)

                lossFixPp1 = mmd_hit_sortKernel(real_sorted.float(), fake_sorted, kernel_size=100, stride=50, cutoff=2000, alpha=200) 
                
                
                lossFixPp2 = F.mse_loss(fake_dataP.view(BATCH_SIZE, -1), 
                                        fake_data.detach().view(BATCH_SIZE, -1), reduction='mean')
                
                lossFixP = wMMD*lossFixPp1 + wMSE*lossFixPp2

                lossP = LDP*lossD_P - lossFixP

                lossP.backward(mone)            
                optimizer_p.step()

        #if params["train_postP"]:
        #    writer.add_scalar('data/lossD_P', lossD_P.mean(), iteration)
        #    writer.add_scalar('data/lossMSE', lossFixPp2.mean(), iteration)
        #    writer.add_scalar('data/lossMMD', lossFixPp1.mean(), iteration)
    

        if iteration % 1000 == 999 or iteration == 1:
            print('iteration: {}, critic loss: {}'.format(iteration, disc_cost.cpu().data.numpy()))
            torch.save(aG.state_dict(), OUTP+'{0}/netG_itrs_{1}.pth'.format(EXP, iteration))
            torch.save(aD.state_dict(), OUTP+'{0}/netD_itrs_{1}.pth'.format(EXP, iteration))
            if params["train_calib"] :
                torch.save(aE.state_dict(), OUTP+'{0}/netE_itrs_{1}.pth'.format(EXP, iteration))
            if params["train_postP"]:
                torch.save(aP.state_dict(), OUTP+'{0}/netP_itrs_{1}.pth'.format(EXP, iteration))
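
# A minimal sketch of the WGAN-GP critic objective used above:
# critic loss = E[D(fake)] - E[D(real)] + LAMBD * gradient penalty.
# `calc_gradient_penalty` from the example is replaced here by the standard
# interpolation penalty on a toy unconditional critic; this is an illustration,
# not the repository's implementation.
import torch


def gradient_penalty(critic, real, fake):
    # interpolate between real and fake samples
    eps = torch.rand(real.size(0), *([1] * (real.dim() - 1)), device=real.device)
    interp = (eps * real + (1 - eps) * fake).requires_grad_(True)
    scores = critic(interp)
    grads = torch.autograd.grad(outputs=scores.sum(), inputs=interp,
                                create_graph=True)[0]
    # penalize deviation of the gradient norm from 1
    return ((grads.view(grads.size(0), -1).norm(2, dim=1) - 1) ** 2).mean()


if __name__ == "__main__":
    critic = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(8, 1))
    real = torch.randn(4, 8)
    fake = torch.randn(4, 8)
    LAMBD = 10.0
    disc_cost = critic(fake).mean() - critic(real).mean() \
        + LAMBD * gradient_penalty(critic, real, fake)
    print(disc_cost.item())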
Beispiel #20
0
# Gradient updates
updates = minimizer.apply_gradients(clipped_grads_and_vars)


def one_hot(v):
    return np.eye(vocab_size)[v]


# begin training
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# log model graph
experiment.set_model_graph(sess.graph)
# Initial values
MAXITERS = 500000
n, p = 0, 0
hprev_val = np.zeros([1, hidden_size])

while (n < MAXITERS):
    # Initialize
    if p + seq_length + 1 >= len(data) or n == 0:
        hprev_val = np.zeros([1, hidden_size])
        p = 0  # reset

    # Prepare inputs
    input_vals = [char_to_ix[ch] for ch in data[p:p + seq_length]]
    target_vals = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]
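
# A self-contained sketch of the char-level input/target preparation shown above:
# one-hot encode characters and slide a window of seq_length over the text, with
# targets shifted by one character. `text` is a dummy string, not the example's data.
import numpy as np

text = "hello comet"
chars = sorted(set(text))
vocab_size = len(chars)
char_to_ix = {ch: i for i, ch in enumerate(chars)}


def one_hot(v):
    # v: list of integer indices -> array of shape [len(v), vocab_size]
    return np.eye(vocab_size)[v]


seq_length = 4
p = 0
input_vals = [char_to_ix[ch] for ch in text[p:p + seq_length]]
target_vals = [char_to_ix[ch] for ch in text[p + 1:p + seq_length + 1]]
print(one_hot(input_vals).shape)   # (4, vocab_size)
print(target_vals)                 # targets are the inputs shifted by one character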
Beispiel #21
0
class Trainer():
    def __init__(self, log_dir, cfg):

        self.path = log_dir
        self.cfg = cfg

        if cfg.TRAIN.FLAG:
            self.model_dir = os.path.join(self.path, 'Model')
            self.log_dir = os.path.join(self.path, 'Log')
            mkdir_p(self.model_dir)
            mkdir_p(self.log_dir)
            self.writer = SummaryWriter(log_dir=self.log_dir)
            self.logfile = os.path.join(self.path, "logfile.log")
            sys.stdout = Logger(logfile=self.logfile)

        self.data_dir = cfg.DATASET.DATA_DIR
        self.max_epochs = cfg.TRAIN.MAX_EPOCHS
        self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL

        s_gpus = cfg.GPU_ID.split(',')
        self.gpus = [int(ix) for ix in s_gpus]
        self.num_gpus = len(self.gpus)

        self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.lr = cfg.TRAIN.LEARNING_RATE

        torch.cuda.set_device(self.gpus[0])
        cudnn.benchmark = True

        sample = cfg.SAMPLE
        self.dataset = []
        self.dataloader = []
        self.use_feats = cfg.model.use_feats
        eval_split = cfg.EVAL if cfg.EVAL else 'val'
        train_split = cfg.DATASET.train_split
        if cfg.DATASET.DATASET == 'clevr':
            clevr_collate_fn = collate_fn
            cogent = cfg.DATASET.COGENT
            if cogent:
                print(f'Using CoGenT {cogent.upper()}')

            if cfg.TRAIN.FLAG:
                self.dataset = ClevrDataset(data_dir=self.data_dir,
                                            split=train_split + cogent,
                                            sample=sample,
                                            **cfg.DATASET.params)
                self.dataloader = DataLoader(dataset=self.dataset,
                                             batch_size=cfg.TRAIN.BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=cfg.WORKERS,
                                             drop_last=True,
                                             collate_fn=clevr_collate_fn)

            self.dataset_val = ClevrDataset(data_dir=self.data_dir,
                                            split=eval_split + cogent,
                                            sample=sample,
                                            **cfg.DATASET.params)
            self.dataloader_val = DataLoader(dataset=self.dataset_val,
                                             batch_size=cfg.TEST_BATCH_SIZE,
                                             drop_last=False,
                                             shuffle=False,
                                             num_workers=cfg.WORKERS,
                                             collate_fn=clevr_collate_fn)

        elif cfg.DATASET.DATASET == 'gqa':
            if self.use_feats == 'spatial':
                gqa_collate_fn = collate_fn_gqa
            elif self.use_feats == 'objects':
                gqa_collate_fn = collate_fn_gqa_objs
            if cfg.TRAIN.FLAG:
                self.dataset = GQADataset(data_dir=self.data_dir,
                                          split=train_split,
                                          sample=sample,
                                          use_feats=self.use_feats,
                                          **cfg.DATASET.params)
                self.dataloader = DataLoader(dataset=self.dataset,
                                             batch_size=cfg.TRAIN.BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=cfg.WORKERS,
                                             drop_last=True,
                                             collate_fn=gqa_collate_fn)

            self.dataset_val = GQADataset(data_dir=self.data_dir,
                                          split=eval_split,
                                          sample=sample,
                                          use_feats=self.use_feats,
                                          **cfg.DATASET.params)
            self.dataloader_val = DataLoader(dataset=self.dataset_val,
                                             batch_size=cfg.TEST_BATCH_SIZE,
                                             shuffle=False,
                                             num_workers=cfg.WORKERS,
                                             drop_last=False,
                                             collate_fn=gqa_collate_fn)

        # load model
        self.vocab = load_vocab(cfg)
        self.model, self.model_ema = mac.load_MAC(cfg, self.vocab)

        self.weight_moving_average(alpha=0)
        if cfg.TRAIN.RADAM:
            self.optimizer = RAdam(self.model.parameters(), lr=self.lr)
        else:
            self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        self.start_epoch = 0
        if cfg.resume_model:
            location = 'cuda' if cfg.CUDA else 'cpu'
            state = torch.load(cfg.resume_model, map_location=location)
            self.model.load_state_dict(state['model'])
            self.optimizer.load_state_dict(state['optim'])
            self.start_epoch = state['iter'] + 1
            state = torch.load(cfg.resume_model_ema, map_location=location)
            self.model_ema.load_state_dict(state['model'])

        if cfg.start_epoch is not None:
            self.start_epoch = cfg.start_epoch

        self.previous_best_acc = 0.0
        self.previous_best_epoch = 0
        self.previous_best_loss = 100
        self.previous_best_loss_epoch = 0

        self.total_epoch_loss = 0
        self.prior_epoch_loss = 10

        self.print_info()
        self.loss_fn = torch.nn.CrossEntropyLoss().cuda()

        self.comet_exp = Experiment(
            project_name=cfg.COMET_PROJECT_NAME,
            api_key=os.getenv('COMET_API_KEY'),
            workspace=os.getenv('COMET_WORKSPACE'),
            disabled=cfg.logcomet is False,
        )
        if cfg.logcomet:
            exp_name = cfg_to_exp_name(cfg)
            print(exp_name)
            self.comet_exp.set_name(exp_name)
            self.comet_exp.log_parameters(flatten_json_iterative_solution(cfg))
            self.comet_exp.log_asset(self.logfile)
            self.comet_exp.log_asset_data(json.dumps(cfg, indent=4),
                                          file_name='cfg.json')
            self.comet_exp.set_model_graph(str(self.model))
            if cfg.cfg_file:
                self.comet_exp.log_asset(cfg.cfg_file)

        with open(os.path.join(self.path, 'cfg.json'), 'w') as f:
            json.dump(cfg, f, indent=4)

    def print_info(self):
        print('Using config:')
        pprint.pprint(self.cfg)
        print("\n")

        pprint.pprint("Size of train dataset: {}".format(len(self.dataset)))
        # print("\n")
        pprint.pprint("Size of val dataset: {}".format(len(self.dataset_val)))
        print("\n")

        print("Using MAC-Model:")
        pprint.pprint(self.model)
        print("\n")

    def weight_moving_average(self, alpha=0.999):
        for param1, param2 in zip(self.model_ema.parameters(),
                                  self.model.parameters()):
            param1.data *= alpha
            param1.data += (1.0 - alpha) * param2.data

    def set_mode(self, mode="train"):
        if mode == "train":
            self.model.train()
            self.model_ema.train()
        else:
            self.model.eval()
            self.model_ema.eval()

    def reduce_lr(self):
        epoch_loss = self.total_epoch_loss  # / float(len(self.dataset) // self.batch_size)
        lossDiff = self.prior_epoch_loss - epoch_loss
        if ((lossDiff < 0.015 and self.prior_epoch_loss < 0.5 and self.lr > 0.00002) or \
            (lossDiff < 0.008 and self.prior_epoch_loss < 0.15 and self.lr > 0.00001) or \
            (lossDiff < 0.003 and self.prior_epoch_loss < 0.10 and self.lr > 0.000005)):
            self.lr *= 0.5
            print("Reduced learning rate to {}".format(self.lr))
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.lr
        self.prior_epoch_loss = epoch_loss
        self.total_epoch_loss = 0

    def save_models(self, iteration):
        save_model(self.model,
                   self.optimizer,
                   iteration,
                   self.model_dir,
                   model_name="model")
        save_model(self.model_ema,
                   None,
                   iteration,
                   self.model_dir,
                   model_name="model_ema")

    def train_epoch(self, epoch):
        cfg = self.cfg
        total_loss = 0.
        total_correct = 0
        total_samples = 0

        self.labeled_data = iter(self.dataloader)
        self.set_mode("train")

        dataset = tqdm(self.labeled_data, total=len(self.dataloader), ncols=20)

        for data in dataset:
            ######################################################
            # (1) Prepare training data
            ######################################################
            image, question, question_len, answer = data['image'], data[
                'question'], data['question_length'], data['answer']
            answer = answer.long()
            question = Variable(question)
            answer = Variable(answer)

            if cfg.CUDA:
                if self.use_feats == 'spatial':
                    image = image.cuda()
                elif self.use_feats == 'objects':
                    image = [e.cuda() for e in image]
                question = question.cuda()
                answer = answer.cuda().squeeze()
            else:
                question = question
                image = image
                answer = answer.squeeze()

            ############################
            # (2) Train Model
            ############################
            self.optimizer.zero_grad()

            scores = self.model(image, question, question_len)
            loss = self.loss_fn(scores, answer)
            loss.backward()

            if self.cfg.TRAIN.CLIP_GRADS:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.cfg.TRAIN.CLIP)

            self.optimizer.step()
            self.weight_moving_average()

            ############################
            # (3) Log Progress
            ############################
            correct = scores.detach().argmax(1) == answer
            total_correct += correct.sum().cpu().item()
            total_loss += loss.item() * answer.size(0)
            total_samples += answer.size(0)

            avg_loss = total_loss / total_samples
            train_accuracy = total_correct / total_samples
            # accuracy = correct.sum().cpu().numpy() / answer.shape[0]

            # if avg_loss == 0:
            #     avg_loss = loss.item()
            #     train_accuracy = accuracy
            # else:
            #     avg_loss = 0.99 * avg_loss + 0.01 * loss.item()
            #     train_accuracy = 0.99 * train_accuracy + 0.01 * accuracy
            # self.total_epoch_loss += loss.item() * answer.size(0)

            dataset.set_description(
                'Epoch: {}; Avg Loss: {:.5f}; Avg Train Acc: {:.5f}'.format(
                    epoch + 1, avg_loss, train_accuracy))

        self.total_epoch_loss = avg_loss

        dict = {
            "loss": avg_loss,
            "accuracy": train_accuracy,
            "avg_loss": avg_loss,  # For commet
            "avg_accuracy": train_accuracy,  # For commet
        }
        return dict

    def train(self):
        cfg = self.cfg
        print("Start Training")
        for epoch in range(self.start_epoch, self.max_epochs):

            with self.comet_exp.train():
                dict = self.train_epoch(epoch)
                self.reduce_lr()
                dict['epoch'] = epoch + 1
                dict['lr'] = self.lr
                self.comet_exp.log_metrics(
                    dict,
                    epoch=epoch + 1,
                )

            with self.comet_exp.validate():
                dict = self.log_results(epoch, dict)
                dict['epoch'] = epoch + 1
                dict['lr'] = self.lr
                self.comet_exp.log_metrics(
                    dict,
                    epoch=epoch + 1,
                )

            if cfg.TRAIN.EALRY_STOPPING:
                if epoch - cfg.TRAIN.PATIENCE == self.previous_best_epoch:
                    # if epoch - cfg.TRAIN.PATIENCE == self.previous_best_loss_epoch:
                    print('Early stop')
                    break

        self.comet_exp.log_asset(self.logfile)
        self.save_models(self.max_epochs)
        self.writer.close()
        print("Finished Training")
        print(
            f"Highest validation accuracy: {self.previous_best_acc} at epoch {self.previous_best_epoch}"
        )

    def log_results(self, epoch, dict, max_eval_samples=None):
        epoch += 1
        self.writer.add_scalar("avg_loss", dict["loss"], epoch)
        self.writer.add_scalar("train_accuracy", dict["accuracy"], epoch)

        metrics = self.calc_accuracy("validation",
                                     max_samples=max_eval_samples)
        self.writer.add_scalar("val_accuracy_ema", metrics['acc_ema'], epoch)
        self.writer.add_scalar("val_accuracy", metrics['acc'], epoch)
        self.writer.add_scalar("val_loss_ema", metrics['loss_ema'], epoch)
        self.writer.add_scalar("val_loss", metrics['loss'], epoch)

        print(
            "Epoch: {epoch}\tVal Acc: {acc},\tVal Acc EMA: {acc_ema},\tAvg Loss: {loss},\tAvg Loss EMA: {loss_ema},\tLR: {lr}"
            .format(epoch=epoch, lr=self.lr, **metrics))

        if metrics['acc'] > self.previous_best_acc:
            self.previous_best_acc = metrics['acc']
            self.previous_best_epoch = epoch
        if metrics['loss'] < self.previous_best_loss:
            self.previous_best_loss = metrics['loss']
            self.previous_best_loss_epoch = epoch

        if epoch % self.snapshot_interval == 0:
            self.save_models(epoch)

        return metrics

    def calc_accuracy(self, mode="train", max_samples=None):
        self.set_mode("validation")

        if mode == "train":
            loader = self.dataloader
        # elif (mode == "validation") or (mode == 'test'):
        #     loader = self.dataloader_val
        else:
            loader = self.dataloader_val

        total_correct = 0
        total_correct_ema = 0
        total_samples = 0
        total_loss = 0.
        total_loss_ema = 0.
        pbar = tqdm(loader, total=len(loader), desc=mode.upper(), ncols=20)
        for data in pbar:

            image, question, question_len, answer = data['image'], data[
                'question'], data['question_length'], data['answer']
            answer = answer.long()
            question = Variable(question)
            answer = Variable(answer)

            if self.cfg.CUDA:
                if self.use_feats == 'spatial':
                    image = image.cuda()
                elif self.use_feats == 'objects':
                    image = [e.cuda() for e in image]
                question = question.cuda()
                answer = answer.cuda().squeeze()

            with torch.no_grad():
                scores = self.model(image, question, question_len)
                scores_ema = self.model_ema(image, question, question_len)

                loss = self.loss_fn(scores, answer)
                loss_ema = self.loss_fn(scores_ema, answer)

            correct = scores.detach().argmax(1) == answer
            correct_ema = scores_ema.detach().argmax(1) == answer

            total_correct += correct.sum().cpu().item()
            total_correct_ema += correct_ema.sum().cpu().item()

            total_loss += loss.item() * answer.size(0)
            total_loss_ema += loss_ema.item() * answer.size(0)

            total_samples += answer.size(0)

            avg_acc = total_correct / total_samples
            avg_acc_ema = total_correct_ema / total_samples
            avg_loss = total_loss / total_samples
            avg_loss_ema = total_loss_ema / total_samples

            pbar.set_postfix({
                'Acc': f'{avg_acc:.5f}',
                'Acc Ema': f'{avg_acc_ema:.5f}',
                'Loss': f'{avg_loss:.5f}',
                'Loss Ema': f'{avg_loss_ema:.5f}',
            })

        return dict(acc=avg_acc,
                    acc_ema=avg_acc_ema,
                    loss=avg_loss,
                    loss_ema=avg_loss_ema)
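
# A minimal sketch of the exponential-moving-average (EMA) weight update performed
# by Trainer.weight_moving_average above: the EMA copy tracks the trained model via
# ema = alpha * ema + (1 - alpha) * live. The model and alpha here are toy values.
import copy
import torch

model = torch.nn.Linear(4, 2)
model_ema = copy.deepcopy(model)


@torch.no_grad()
def update_ema(ema_model, live_model, alpha=0.999):
    for p_ema, p_live in zip(ema_model.parameters(), live_model.parameters()):
        p_ema.mul_(alpha).add_(p_live, alpha=1.0 - alpha)


# after each optimizer step:
update_ema(model_ema, model, alpha=0.999)
# calling update_ema(..., alpha=0) copies the live weights, as done at init above.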
Beispiel #22
0
def main(cfg: DictConfig):
    print('Cassava Leaf Disease Classification')
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    # Config  -------------------------------------------------------------------
    data_dir = './input'
    seed_everything(cfg.data.seed)

    # Comet_ml
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
                            project_name=cfg.comet_ml.project_name,
                            auto_param_logging=False,
                            auto_metric_logging=False)

    # Log Parameters
    experiment.log_parameters(dict(cfg.data))
    experiment.log_parameters(dict(cfg.train))

    # Data Module  ---------------------------------------------------------------
    transform = get_transforms(transform_name=cfg.data.transform,
                               img_size=cfg.data.img_size)
    cv = StratifiedKFold(n_splits=cfg.data.n_splits,
                         shuffle=True,
                         random_state=cfg.data.seed)
    dm = CassavaDataModule(data_dir,
                           cfg,
                           transform,
                           cv,
                           use_merge=True,
                           sample=DEBUG)

    # Model  ----------------------------------------------------------------------
    net = Timm_model(cfg.train.model_type, pretrained=True)

    # Log Model Graph
    experiment.set_model_graph(str(net))

    # Loss fn  ---------------------------------------------------------------------
    df = pd.read_csv('./input/merged.csv')
    weight = df['label'].value_counts().sort_index().tolist()
    weight = [w / len(df) for w in weight]
    weight = torch.tensor(weight).cuda()
    del df

    criterion = get_loss_fn(cfg.train.loss_fn, weight=weight, smoothing=0.05)

    # Optimizer, Scheduler  --------------------------------------------------------
    if cfg.train.use_sam:
        base_optimizer = RAdam
        optimizer = SAM(net.parameters(),
                        base_optimizer,
                        lr=cfg.train.lr,
                        weight_decay=cfg.train.weight_decay)
    else:
        optimizer = RAdam(net.parameters(),
                          lr=cfg.train.lr,
                          weight_decay=cfg.train.weight_decay)

    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=cfg.train.epoch,
                                               eta_min=0)

    # Lightning Module  -------------------------------------------------------------
    model = CassavaLightningSystem(net,
                                   cfg,
                                   criterion=criterion,
                                   optimizer=optimizer,
                                   scheduler=scheduler,
                                   experiment=experiment)

    # Trainer  -------------------------------------------------------------------------
    trainer = Trainer(
        logger=False,
        max_epochs=cfg.train.epoch,
        gpus=-1,
        amp_backend='apex',
        amp_level='O2',
        num_sanity_val_steps=0,  # Skip Sanity Check
        automatic_optimization=False if cfg.train.use_sam else True,
        # resume_from_checkpoint='./checkpoints/epoch=3-step=14047.ckpt'
    )

    # Train
    trainer.fit(model, datamodule=dm)
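
# A minimal sketch of the pattern above, where the Comet Experiment is created up
# front (with automatic metric logging disabled) and handed to the training object
# for manual logging. The trainer class and metric values are illustrative only.
import os
from comet_ml import Experiment


class ManualLoggingTrainer:
    def __init__(self, experiment):
        self.experiment = experiment

    def fit(self, epochs=3):
        for epoch in range(epochs):
            train_loss = 1.0 / (epoch + 1)   # dummy metric
            val_acc = 0.6 + 0.1 * epoch      # dummy metric
            self.experiment.log_metric("train_loss", train_loss, epoch=epoch)
            self.experiment.log_metric("val_acc", val_acc, epoch=epoch)


experiment = Experiment(
    api_key=os.environ.get("COMET_API_KEY"),
    auto_metric_logging=False,
    disabled=True,  # keeps this sketch offline
)
ManualLoggingTrainer(experiment).fit()
experiment.end()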