Example #1
0
            d_loss.backward()
            d_opt.step()

            # 生成器训练
            z = torch.randn(imgData.size(0), DDN_SIZE, 1, 1).to(device)
            fake_img_1 = g_net(z)
            output = d_net(fake_img_1)
            g_loss = loss_fn(output, real_label)

            g_opt.zero_grad()
            g_loss.backward()
            g_opt.step()

            for name, value in d_net.named_parameters():
                # print('name: {0},\t grad: {1}'.format(name, value.grad))
                writer.add_histogram(name, value, epoch)

            if i % 30 == 0:
                real_score = real_score.cpu().data.mean()
                fake_score = fake_score.cpu().data.mean()
                print(
                    "Epoch:[{}/{}],d_loss:{:.3f},"
                    "g_loss:{:.3f},real_score:{:.3f},fake_score:{:.3f}".format(
                        i, epoch, d_loss, g_loss, real_score, fake_score))

                fake_img = fake_img.cpu().data

                save_image(fake_img,
                           "./IMG/{}-fake.png".format(epoch),
                           nrow=10,
                           normalize=True,
Example #2
0
    def deepinversion_improved(self, use_generator      = False, \
                                     discrete_label     = True,  \
                                     noisify_network    = 0.0, \
                                     knowledge_distill  = 0.0, \
                                     mutual_info        = 0.0, \
                                     batchnorm_transfer = 0.0, \
                                     use_discriminator  = 0.0, \
                                     n_iters = 100):
        """Synthesize samples from the pre-trained classifier (DeepInversion-style).

        Optimizes either raw input samples ``x`` or a conditional generator
        ``self.net_gen`` so that synthesized samples (a) are classified as the
        sampled labels and (b) match the classifier's stored BatchNorm1d
        statistics. Each extra regularizer is enabled by a positive weight.

        Args:
            use_generator: optimize the generator instead of raw samples.
            discrete_label: sample binary labels if True, uniform [0,1) otherwise.
            noisify_network: weight of noise injected into the classifier,
                linearly annealed to zero over ``n_iters``; 0 disables.
            knowledge_distill: weight of the teacher/student regularizer
                (student is updated adversarially each iteration); 0 disables.
            mutual_info: weight of the latent-reconstruction + diversity
                regularizer (generator mode only); 0 disables.
            batchnorm_transfer: weight of the generator batch-norm transfer
                loss; 0 disables.
            use_discriminator: weight of the per-BN-layer WGAN-GP feature
                discriminators (generator mode only); 0 disables.
            n_iters: number of optimization iterations.

        Side effects: logs to TensorBoard, updates ``self.imgname`` with the
        active option weights, and saves/shows a progress figure.
        """
        tb = SummaryWriter()
        if use_generator == True:
            z = torch.randn((self.n_samples, self.latent_dim),
                            requires_grad=False,
                            device=self.device,
                            dtype=torch.float)
            if discrete_label == True:
                y_gt = torch.randint(0,
                                     2, (self.n_samples, self.label_dim),
                                     dtype=torch.float,
                                     device=self.device)
            else:
                y_gt = torch.cuda.FloatTensor(self.n_samples,
                                              self.label_dim).uniform_(0, 1)
            x = self.net_gen(z, y_gt)
            if mutual_info > 0.0:
                ''' declare the optimizer for the encoder network '''
                optimizer = torch.optim.Adam(list(self.net_gen.parameters()) +
                                             list(self.net_enc.parameters()),
                                             lr=self.lr)
            else:
                optimizer = torch.optim.Adam(self.net_gen.parameters(),
                                             lr=self.lr)
        else:
            # optimize the raw samples directly; note requires_grad=True here
            x = torch.randn((self.n_samples, 2),
                            requires_grad=True,
                            device=self.device,
                            dtype=torch.float)
            if discrete_label == True:
                y_gt = torch.randint(0,
                                     2, (self.n_samples, self.label_dim),
                                     dtype=torch.float,
                                     device=self.device)
            else:
                y_gt = torch.cuda.FloatTensor(self.n_samples,
                                              self.label_dim).uniform_(0, 1)
            optimizer = torch.optim.Adam([x], lr=self.lr)

        #update name of output
        self.imgname = self.imgname + "_gen%d" % (use_generator)
        ''' declare the optimizer for the student network '''
        optimizer_std = torch.optim.Adam(self.net_std.parameters(),
                                         lr=self.classifier_lr)

        if self.device == 'cuda':
            x_np = x.cpu().detach().clone().numpy()
        else:
            x_np = x.detach().clone().numpy()

        fig, ax = self.setup_plot_progress(x_np)

        total_loss = []

        # set for testing with batchnorm
        self.net.eval()

        ## Create hooks for feature statistics
        loss_bn_feature_layers = []
        if use_generator == True and use_discriminator > 0.0:
            nets_dis = []
            nets_dis_params = []

        for module in self.net.modules():
            if isinstance(module, nn.BatchNorm1d):
                loss_bn_feature_layers.append(bn1dfeathook(module))
                if use_generator == True and use_discriminator > 0.0:
                    # one small discriminator per BN layer, acting on features
                    net_dis = netdis(module.running_mean.shape[0],
                                     self.n_hidden, 1).cuda()
                    net_dis.apply(weights_init)
                    nets_dis.append(net_dis)
                    nets_dis_params += list(net_dis.parameters())

        if use_generator == True and use_discriminator > 0.0:
            self.optimizer_dis = torch.optim.Adam(nets_dis_params,
                                                  lr=self.lr,
                                                  betas=(0.5, 0.9))

        ## Create hooks for feature statistics for generator
        if use_generator == True and batchnorm_transfer > 0.0:
            loss_bn_feature_layers_gen = []
            self.compute_loss_bn_gen(loss_bn_feature_layers_gen)

        for it in range(n_iters):
            self.net.zero_grad()
            self.net_gen.zero_grad()
            self.net_std.zero_grad()
            self.net_enc.zero_grad()
            optimizer.zero_grad()
            optimizer_std.zero_grad()

            if use_generator == True:
                ''' randomly sampling latent and labels '''
                z = torch.randn((self.n_samples, self.latent_dim),
                                requires_grad=False,
                                device=self.device,
                                dtype=torch.float)
                y_gt = torch.randint(0,
                                     2, (self.n_samples, self.label_dim),
                                     dtype=torch.float,
                                     device=self.device)

            if use_generator == True:
                ''' generating samples with generator '''
                x = self.net_gen(z, y_gt)
            '''
            **********************************************************************
            To optimize the generated samples or training the generator
            **********************************************************************
            '''
            if noisify_network > 0.0:
                ''' adding noise into the pre-trained classifier '''
                # noise magnitude is annealed linearly to 0 as it -> n_iters
                weight = noisify_network * (n_iters - it) / n_iters
                self.net, orig_params = add_noise_to_net(self.net,
                                                         weight=weight,
                                                         noise_type='uniform')

            if it == 0:
                self.imgname = self.imgname + "_nosify%0.3f" % (
                    noisify_network)

            y_pd = self.net(x)
            ''' main loss (cross-entropy loss) '''
            loss_main = self.loss_func(y_pd, y_gt)
            ''' l2 regularization '''
            loss_l2 = torch.norm(x.view(-1, self.n_input_dim), dim=1).mean()
            ''' batch-norm regularization '''
            rescale = [1. for _ in range(len(loss_bn_feature_layers))]
            loss_bn = sum([
                mod.r_feature * rescale[idx]
                for (idx, mod) in enumerate(loss_bn_feature_layers)
            ])
            ''' total loss '''
            # the BN term is down-weighted when the feature discriminators
            # already push the feature statistics toward the real ones
            if use_generator == True and use_discriminator > 0.0:
                bn_w = 0.05
            else:
                bn_w = 1.0

            loss = loss_main + 0.005 * loss_l2 + bn_w * loss_bn

            if knowledge_distill > 0.0:
                ''' knowledge distillation (teacher-student) based regularization '''
                y_st = self.net_std(x)
                #loss_kd = 1 - self.loss_func(y_st, y_pd.detach())
                loss_kd = knowledge_distill_loss(y_pd.detach(), y_st)
                loss = loss + knowledge_distill * loss_kd

            if it == 0:
                self.imgname = self.imgname + "_kdistill%0.3f" % (
                    knowledge_distill)

            if mutual_info > 0.0:
                ''' mutual information constraint '''
                ze = self.net_enc(x)
                loss_mi = ((z - ze)**2).mean()

                zdiv = torch.randn((self.n_samples, self.latent_dim),
                                   requires_grad=False,
                                   device=self.device,
                                   dtype=torch.float)
                xdiv = self.net_gen(zdiv, y_gt)
                loss_div = diveristy_loss(z, x, zdiv, xdiv)

                loss = loss + mutual_info * loss_mi + 0.1 * mutual_info * loss_div

            if it == 0:
                self.imgname = self.imgname + "_minfo%0.3f" % (mutual_info)

            if use_generator == True and batchnorm_transfer > 0.0:
                ''' batch-norm transfer loss '''
                rescale_gen = [
                    1. for _ in range(len(loss_bn_feature_layers_gen))
                ]
                loss_bn_gen = sum([
                    mod.r_feature * rescale_gen[idx]
                    for (idx, mod) in enumerate(loss_bn_feature_layers_gen)
                ])
                loss = loss + batchnorm_transfer * loss_bn_gen

            if it == 0:
                self.imgname = self.imgname + "_btransfer%0.3f" % (
                    batchnorm_transfer)

            if use_generator == True and use_discriminator > 0.0:
                # train the generator on features
                loss_g = 0
                for (idx, mod) in enumerate(loss_bn_feature_layers):
                    nets_dis[idx].zero_grad()
                    # frozen the gradient for the discriminator
                    for p in nets_dis[idx].parameters():
                        p.requires_grad = False  # to avoid computation
                    feat_fake = mod.feat_fake.cuda()
                    d_fake = nets_dis[idx](feat_fake)
                    loss_g = loss_g - d_fake.mean()
                loss = loss + use_discriminator * loss_g

            if use_generator == True and it == 0:
                self.imgname = self.imgname + "_discriminator%0.3f" % (
                    use_discriminator)

            loss.backward()
            optimizer.step()

            if it % 100 == 0:
                tb.add_scalar("Total loss: ", loss, it)
                tb.add_scalar("Loss batchnorm", loss_bn, it)
                tb.add_histogram("Input", x, it)
                # tb.add_histogram("Input/gradients", x.grad, it)
                for name, param in self.net_gen.named_parameters():
                    tb.add_histogram(name, param.data, it)
                    tb.add_histogram(name + "/gradients", param.grad, it)

            if noisify_network > 0.0:
                ''' reset the network's parameters '''
                reset_params(self.net, orig_params)

            if knowledge_distill > 0.0:
                '''
               **********************************************************************
               To update the student network
               **********************************************************************
               '''
                if use_generator == True:
                    ''' generating samples with generator '''
                    x = self.net_gen(z, y_gt)

                y_pd = self.net(x)
                y_st = self.net_std(x)
                #loss_kd = self.loss_func(y_st, y_pd.detach())
                # student is trained adversarially: minimize agreement
                loss_kd = 1. - knowledge_distill_loss(y_pd.detach(), y_st)
                loss_kd.backward()
                optimizer_std.step()
            ''' store the main loss to plot on the figure '''
            total_loss.append(loss.item())

            if use_generator == True and use_discriminator > 0.0:
                # train the discriminator on features (5 critic steps, WGAN-GP)
                for _ in range(5):
                    loss_d = 0
                    x = self.net_gen(z, y_gt)
                    self.net(x)
                    for (idx, mod) in enumerate(loss_bn_feature_layers):
                        nets_dis[idx].zero_grad()
                        for p in nets_dis[idx].parameters(
                        ):  # reset requires_grad
                            p.requires_grad = True
                        feat_real = mod.feat_real.cuda()
                        feat_fake = mod.feat_fake.cuda()
                        d_real = nets_dis[idx](feat_real)
                        d_fake = nets_dis[idx](feat_fake)
                        penalty = calc_gradient_penalty(nets_dis[idx],
                                                        feat_real,
                                                        feat_fake,
                                                        LAMBDA=1.0)
                        loss_d = loss_d + use_discriminator * (
                            d_fake.mean() - d_real.mean() + penalty)
                    loss_d.backward()
                    self.optimizer_dis.step()

            if it % 10 == 0:
                print('-- iter %d --' % (it))
                print('target loss: %f' % (loss_main.item()))
                print('l2-norm loss: %f' % (loss_l2.item()))
                print('batchnorm loss: %f' % (loss_bn.item()))
                if knowledge_distill > 0.0:
                    # FIX: was printing loss_bn under the distillation label
                    print('distillation loss: %f' % (loss_kd.item()))
                if mutual_info > 0.0:
                    print('mutual information / diversity losses: %f / %f' %
                          (loss_mi.item(), loss_div.item()))
                if batchnorm_transfer > 0.0:
                    print('batch-norm transfer loss: %f ' %
                          (loss_bn_gen.item()))
                if use_generator == True and use_discriminator > 0.0:
                    print('loss d / loss g: %f / %f' %
                          (loss_d.item(), loss_g.item()))
                print('total loss: %f' % (loss.item()))
                ''' realtime plot '''
                ax[0].plot(total_loss, c='b')
                fig.canvas.draw()

        if self.device == 'cuda':
            x_np = x.cpu().detach().numpy()
        else:
            x_np = x.detach().numpy()
        tb.close()
        ax[1].scatter(x_np[:, 0], x_np[:, 1], c='b', cmap=plt.cm.Accent)
        plt.savefig(self.basedir + "%s.png" % (self.imgname))
        plt.show()
Example #3
0
def main(args):
    """Train (or evaluate) the CoMA mesh autoencoder described by a config file.

    Loads the config, builds mesh up/down-sampling transforms, datasets and
    data loaders, restores a checkpoint when one is configured, then either
    runs a single evaluation pass (``config['eval']``) or trains for
    ``config['epoch']`` epochs with TensorBoard logging and best-model
    checkpointing.

    Args:
        args: parsed CLI namespace; uses ``conf``, ``checkpoint_dir``,
            ``data_dir``, ``split`` and ``split_term``.

    Raises:
        FileNotFoundError: if ``args.conf`` does not exist.
        Exception: if the configured optimizer is neither 'adam' nor 'sgd'.
    """
    # FIX: previously this only printed a warning and fell through to
    # read_config, which would then fail with a confusing error.
    if not os.path.exists(args.conf):
        raise FileNotFoundError('Config not found: ' + args.conf)

    config = read_config(args.conf)

    print('Initializing parameters')
    template_file_path = config['template_fname']
    template_mesh = Mesh(filename=template_file_path)

    # CLI checkpoint dir overrides the config value
    if args.checkpoint_dir:
        checkpoint_dir = args.checkpoint_dir
    else:
        checkpoint_dir = config['checkpoint_dir']
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    visualize = config['visualize']
    output_dir = config['visual_output_dir']
    if visualize is True and not output_dir:
        print(
            'No visual output directory is provided. Checkpoint directory will be used to store the visual results'
        )
        output_dir = checkpoint_dir

    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    eval_flag = config['eval']
    lr = config['learning_rate']
    lr_decay = config['learning_rate_decay']
    weight_decay = config['weight_decay']
    total_epochs = config['epoch']
    workers_thread = config['workers_thread']
    opt = config['optimizer']
    batch_size = config['batch_size']
    val_losses, accs, durations = [], [], []

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print('Generating transforms')
    # M: meshes per resolution, A: adjacency, D: downsample, U: upsample
    M, A, D, U = mesh_operations.generate_transform_matrices(
        template_mesh, config['downsampling_factors'])

    D_t = [scipy_to_torch_sparse(d).to(device) for d in D]
    U_t = [scipy_to_torch_sparse(u).to(device) for u in U]
    A_t = [scipy_to_torch_sparse(a).to(device) for a in A]
    num_nodes = [len(M[i].v) for i in range(len(M))]

    print('Loading Dataset')
    if args.data_dir:
        data_dir = args.data_dir
    else:
        data_dir = config['data_dir']

    normalize_transform = Normalize()
    dataset = ComaDataset(data_dir,
                          dtype='train',
                          split=args.split,
                          split_term=args.split_term,
                          pre_transform=normalize_transform)
    dataset_val = ComaDataset(data_dir,
                              dtype='val',
                              split=args.split,
                              split_term=args.split_term,
                              pre_transform=normalize_transform)
    dataset_test = ComaDataset(data_dir,
                               dtype='test',
                               split=args.split,
                               split_term=args.split_term,
                               pre_transform=normalize_transform)
    train_loader = DataLoader(dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=workers_thread)
    val_loader = DataLoader(dataset_val,
                            batch_size=1,
                            shuffle=True,
                            num_workers=workers_thread)
    test_loader = DataLoader(dataset_test,
                             batch_size=1,
                             shuffle=False,
                             num_workers=workers_thread)

    print('Loading model')
    start_epoch = 1
    coma = Coma(dataset, config, D_t, U_t, A_t, num_nodes)
    if opt == 'adam':
        optimizer = torch.optim.Adam(coma.parameters(),
                                     lr=lr,
                                     weight_decay=weight_decay)
    elif opt == 'sgd':
        optimizer = torch.optim.SGD(coma.parameters(),
                                    lr=lr,
                                    weight_decay=weight_decay,
                                    momentum=0.9)
    else:
        raise Exception('No optimizer provided')

    checkpoint_file = config['checkpoint_file']
    print(checkpoint_file)
    if checkpoint_file:
        checkpoint = torch.load(checkpoint_file)
        start_epoch = checkpoint['epoch_num']
        coma.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # Move restored optimizer state onto the target device
        #To find if this is fixed in pytorch
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)
    coma.to(device)

    if eval_flag:
        # evaluation-only mode: report test loss and exit
        val_loss = evaluate(coma, output_dir, test_loader, dataset_test,
                            template_mesh, device, visualize)
        print('val loss', val_loss)
        return

    best_val_loss = float('inf')
    val_loss_history = []

    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    log_dir = os.path.join('runs/cvae_dx', current_time)
    # NOTE(review): suffix is concatenated directly onto the timestamped
    # directory name (no separator) — presumably intentional run tagging.
    writer = SummaryWriter(log_dir + 'ds2_lr0.04_z2_kld_v5')
    print(coma.z)

    for epoch in range(start_epoch, total_epochs + 1):
        print("Training for epoch ", epoch)
        recon_loss, kld_loss, mu, var, kld_weight = train(
            coma, train_loader, len(dataset), optimizer, device, epoch)
        val_loss = evaluate(coma,
                            output_dir,
                            val_loader,
                            dataset_val,
                            template_mesh,
                            device,
                            visualize=visualize)
        train_loss = recon_loss + kld_loss
        writer.add_scalar('loss/train_loss', recon_loss + kld_loss, epoch)
        writer.add_scalar('train_loss/recon_loss', recon_loss, epoch)
        writer.add_scalar('train_loss/kld_loss', kld_loss, epoch)
        writer.add_scalar('loss/val_loss', val_loss, epoch)
        writer.add_histogram('hist/mean', mu, epoch)
        writer.add_histogram('hist/variance', var, epoch)

        print('epoch ', epoch, ' Recon loss ', recon_loss, ' KLD loss ',
              kld_loss, ' Val loss ', val_loss)
        print('kld weight ', kld_weight)
        # checkpoint whenever validation improves
        if val_loss < best_val_loss:
            save_model(coma, optimizer, epoch, train_loss, val_loss,
                       checkpoint_dir)
            best_val_loss = val_loss

        # periodic checkpoint every 100 epochs and at the end of training
        if epoch == total_epochs or epoch % 100 == 0:
            save_model(coma, optimizer, epoch, train_loss, val_loss,
                       checkpoint_dir)

        val_loss_history.append(val_loss)
        val_losses.append(best_val_loss)

        if opt == 'sgd':
            adjust_learning_rate(optimizer, lr_decay)

    if torch.cuda.is_available():
        torch.cuda.synchronize()

    writer.close()
Example #4
0
    def train(self):
        """Run the full training/validation loop over ``self.num_epoch`` epochs.

        Optionally resumes model and optimizer state from
        ``<log_root>/torch_model``. For each epoch it iterates every phase in
        ``self.data_loaders`` (backprop only in 'train'), logs running and
        epoch losses plus input/output histograms and sample figures to
        TensorBoard, and — when ``self.save`` is set — persists the model
        graph and state dicts after the train phase.

        Returns:
            dict with key ``"val_loss_avg"``: the last computed per-phase
            average epoch loss (used by tune for hyperparameter optimization).
        """
        if self.resume:
            print('Resuming training ...')
            checkpoint = torch.load(os.path.join(self.log_root, 'torch_model'))
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        else:
            print('Starting training ...')

        writer = SummaryWriter(self.log_root)
        self.model = self.model.to(self.device)

        # epoch/iteration counters are stored on the model so they survive
        # checkpointing; continue from where the checkpoint left off
        epoch = int(self.model.epoch) + 1
        it = int(self.model.iteration)
        for epoch in range(epoch, epoch + self.num_epoch):

            epoch_root = 'epoch_{:02d}'.format(epoch)
            if not os.path.exists(os.path.join(self.log_root, epoch_root)):
                os.makedirs(os.path.join(self.log_root, epoch_root))

            for phase in self.data_loaders.keys():
                epoch_loss = 0

                # toggle train/eval mode (affects dropout, batch-norm, ...)
                if phase == 'train':
                    self.model.train(True)
                else:
                    self.model.train(False)

                running_loss = 0.0
                for i, (data, index) in enumerate(self.data_loaders[phase]):
                    it += 1
                    # copy input and targets to the device object
                    inputs = data['input'].to(self.device)
                    targets = data['target'].to(self.device)
                    # zero the parameter gradients
                    self.optimizer.zero_grad()

                    # forward + backward + optimize
                    outputs = self.model(inputs)
                    loss = self.criterion(outputs, targets)

                    if phase == 'train':
                        loss.backward()
                        self.optimizer.step()

                    # print statistics
                    running_loss += loss.item()
                    epoch_loss += loss.item()
                    if (i + 1) % self.log_int == 0:
                        running_loss_avg = running_loss / self.log_int
                        # FIX: report the current loop epoch, not the stale
                        # self.model.epoch (only updated at save time)
                        print('Phase: ' + phase +
                              ', epoch: {}, batch {}: running loss: {:0.3f}'.
                              format(epoch, i + 1, running_loss_avg))
                        writer.add_scalars('running_loss',
                                           {phase: running_loss_avg}, it)
                        running_loss = 0.0

                if phase in ['train', 'val']:
                    epoch_loss_avg = epoch_loss / self.data_lengths[phase]
                    print('Phase: ' + phase +
                          ', epoch: {}: epoch loss: {:0.3f}'.format(
                              epoch, epoch_loss_avg))
                    writer.add_scalars('epoch_loss', {phase: epoch_loss_avg},
                                       epoch)
                    # histograms/figures use the last batch of the phase
                    writer.add_histogram(
                        'input histogram',
                        inputs.cpu().data.numpy()[0, 0].flatten(), epoch)
                    writer.add_histogram(
                        'output histogram',
                        outputs.cpu().data.numpy()[0, 0].flatten(), epoch)
                    figure_inds = list(range(inputs.shape[0]))
                    figure_inds = figure_inds if len(
                        figure_inds) < 4 else list(range(4))
                    fig = Trainer.show_imgs(inputs, outputs, figure_inds)
                    fig.savefig(
                        os.path.join(self.log_root, epoch_root,
                                     phase + '.png'))
                    writer.add_figure('images ' + phase, fig, epoch)

                # FIX: logical conjunction, not bitwise '&'
                if self.save and phase == 'train':

                    print('Writing model graph...')
                    writer.add_graph(self.model, inputs)

                    print('Saving model state...')
                    # persist counters as non-trainable parameters so they
                    # travel with the state dict
                    self.model.epoch = torch.nn.Parameter(torch.tensor(epoch),
                                                          requires_grad=False)
                    self.model.iteration = torch.nn.Parameter(
                        torch.tensor(it), requires_grad=False)
                    torch.save({
                        'model_state_dict': self.model.state_dict(),
                    },
                               os.path.join(self.log_root, epoch_root,
                                            'model_state_dict'))
                    torch.save(
                        {'optimizer_state_dict': self.optimizer.state_dict()},
                        os.path.join(self.log_root, 'optimizer_state_dict'))

        print('Finished training ...')
        writer.close()
        print('Writer closed ...')

        # dictionary of accuracy metrics for tune hyperparameter optimization
        return {"val_loss_avg": epoch_loss_avg}
Example #5
0
def train_net(net,
              device,
              epochs=5,
              batch_size=1,
              lr=0.001,
              val_percent=0.1,
              save_cp=True,
              img_scale=0.5):
    """Train a U-Net-style segmentation network with periodic validation.

    Splits the dataset into train/val, optimizes with RMSprop (LR reduced on
    validation-score plateau), logs losses, histograms and images to
    TensorBoard, and optionally saves a checkpoint after every epoch.

    Args:
        net: the network; must expose ``n_channels`` and ``n_classes``.
        device: torch device to train on.
        epochs: number of training epochs.
        batch_size: training batch size.
        lr: initial learning rate.
        val_percent: fraction of the dataset held out for validation.
        save_cp: save a checkpoint to ``dir_checkpoint`` after each epoch.
        img_scale: image rescale factor passed to the dataset.
    """
    dataset = BasicDataset(dir_img, dir_mask, img_scale)
    n_val = int(len(dataset) * val_percent)
    n_train = len(dataset) - n_val
    train, val = random_split(dataset, [n_train, n_val])
    train_loader = DataLoader(train,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True)
    val_loader = DataLoader(val,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=8,
                            pin_memory=True,
                            drop_last=True)

    writer = SummaryWriter(
        comment=f'LR_{lr}_BS_{batch_size}_SCALE_{img_scale}')
    global_step = 0

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Learning rate:   {lr}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images scaling:  {img_scale}
    ''')

    optimizer = optim.RMSprop(net.parameters(),
                              lr=lr,
                              weight_decay=1e-8,
                              momentum=0.9)
    # maximize Dice for binary, minimize cross-entropy for multi-class
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min' if net.n_classes > 1 else 'max', patience=2)
    if net.n_classes > 1:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.BCEWithLogitsLoss()

    # validate ~10 times per epoch; FIX: clamp to >= 1 so small datasets
    # (n_train < 10 * batch_size) don't raise ZeroDivisionError
    val_interval = max(1, n_train // (10 * batch_size))

    for epoch in range(epochs):
        net.train()

        epoch_loss = 0
        with tqdm(total=n_train,
                  desc=f'Epoch {epoch + 1}/{epochs}',
                  unit='img') as pbar:
            for batch in train_loader:
                imgs = batch['image']
                true_masks = batch['mask']
                assert imgs.shape[1] == net.n_channels, \
                    f'Network has been defined with {net.n_channels} input channels, ' \
                    f'but loaded images have {imgs.shape[1]} channels. Please check that ' \
                    'the images are loaded correctly.'

                imgs = imgs.to(device=device, dtype=torch.float32)
                mask_type = torch.float32 if net.n_classes == 1 else torch.long
                true_masks = true_masks.to(device=device, dtype=mask_type)

                masks_pred = net(imgs)
                loss = criterion(masks_pred, true_masks)
                epoch_loss += loss.item()
                writer.add_scalar('Loss/train', loss.item(), global_step)

                pbar.set_postfix(**{'loss (batch)': loss.item()})

                optimizer.zero_grad()
                loss.backward()
                # clip gradients to stabilize training
                nn.utils.clip_grad_value_(net.parameters(), 0.1)
                optimizer.step()

                pbar.update(imgs.shape[0])
                global_step += 1
                if global_step % val_interval == 0:
                    for tag, value in net.named_parameters():
                        tag = tag.replace('.', '/')
                        writer.add_histogram('weights/' + tag,
                                             value.data.cpu().numpy(),
                                             global_step)
                        # FIX: frozen params have grad=None; skip them
                        if value.grad is not None:
                            writer.add_histogram('grads/' + tag,
                                                 value.grad.data.cpu().numpy(),
                                                 global_step)
                    val_score = eval_net(net, val_loader, device)
                    scheduler.step(val_score)
                    writer.add_scalar('learning_rate',
                                      optimizer.param_groups[0]['lr'],
                                      global_step)

                    if net.n_classes > 1:
                        logging.info(
                            'Validation cross entropy: {}'.format(val_score))
                        writer.add_scalar('Loss/test', val_score, global_step)
                    else:
                        logging.info(
                            'Validation Dice Coeff: {}'.format(val_score))
                        writer.add_scalar('Dice/test', val_score, global_step)

                    writer.add_images('images', imgs, global_step)
                    if net.n_classes == 1:
                        writer.add_images('masks/true', true_masks,
                                          global_step)
                        writer.add_images('masks/pred',
                                          torch.sigmoid(masks_pred) > 0.5,
                                          global_step)

        if save_cp:
            try:
                os.mkdir(dir_checkpoint)
                logging.info('Created checkpoint directory')
            except OSError:
                # directory already exists
                pass
            torch.save(net.state_dict(),
                       dir_checkpoint + f'CP_epoch{epoch + 1}.pth')
            logging.info(f'Checkpoint {epoch + 1} saved !')

    writer.close()
Example #6
0
            "xsinx": x * np.sin(x),
            "xcosx": x * np.cos(x)
        }, x)

    writer.close()

# ----------------------------------- 2 histogram -----------------------------------
# flag = 0
flag = 1
if flag:
    # demo: log a uniform and a normal distribution to TensorBoard and
    # draw the matching matplotlib histograms side by side
    writer = SummaryWriter(comment='test_comment',
                           filename_suffix="test_suffix")

    for step in range(2):
        np.random.seed(step)

        data_union = np.arange(100)
        data_normal = np.random.normal(size=1000)

        # same data goes to TensorBoard ...
        writer.add_histogram('distribution union', data_union, step)
        writer.add_histogram('distribution normal', data_normal, step)

        # ... and to a 1x2 matplotlib figure
        ax_left = plt.subplot(121)
        ax_left.hist(data_union, label="union")
        ax_right = plt.subplot(122)
        ax_right.hist(data_normal, label="normal")
        plt.legend()
        plt.show()

    writer.close()
Example #7
0
        elif opt.prune == 1:
            CBL_idx, _, prune_idx, shortcut_idx, _ = parse_moudle_defs1(
                model.module_defs)  # TODO 剪枝策略3
            print('shortcut sparse training')

    # tensorboard
    tb_writer = SummaryWriter()

    for epoch in range(opt.epochs):
        model.train()
        if opt.sr:
            # TODO bn可视化
            for idx in prune_idx:
                bn_weights = gather_bn_weights(model.module_list, [idx])
                tb_writer.add_histogram('bn_weight/hist',
                                        bn_weights.numpy(),
                                        epoch,
                                        bins='doane')

        start_time = time.time()
        for batch_i, (paths, imgs, targets) in enumerate(dataloader):
            batches_done = len(dataloader) * epoch + batch_i

            # TODO plot images  100次保存一次结果
            if batches_done == 0:
                fname = 'train_batch%g.jpg' % batch_i
                plot_images(imgs=imgs,
                            targets=targets,
                            paths=paths,
                            fname=fname)
                tb_writer.add_image(fname,
                                    cv2.imread(fname)[:, :, ::-1],
Example #8
0
def main():
    """Train a YOLO-style detector, optionally with distillation and sparsity.

    Configuration comes from the module-level ``opt`` (populated here via
    ``parse()``) and from the YAML hyperparameter/data files. The function
    sets up (optionally distributed) data loading, builds the student and
    optional teacher model, runs the training loop with multi-scale resizing,
    gradient accumulation, BN-sparsity regularization for pruning, and
    TensorBoard logging, then evaluates and checkpoints each epoch.

    :return: tuple of the last evaluation results
             (P, R, mAP, F1, val GIoU, val Objectness, val Classification)
    """

    global opt
    opt = parse()

    # Unpack frequently used options into locals
    arc = opt.arc
    cfg = opt.cfg
    teacher_cfg = opt.teacher_cfg
    img_size = opt.img_size
    epochs = opt.epochs
    batch_size = opt.batch_size
    accumulate = opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64
    weights = opt.weights
    teacher_weights = opt.teacher_weights
    multi_scale = opt.multi_scale
    sparsity_training = opt.st

    # When resuming, restart from the last saved checkpoint path
    opt.weights = last if opt.resume else opt.weights

    # Initial logging: only rank -1/0 logs at INFO level
    logging.basicConfig(
        format="%(message)s",
        level=logging.INFO if opt.local_rank in [-1, 0] else logging.WARN)

    # Train
    logger.info(opt)
    if opt.local_rank in [-1, 0]:
        logger.info('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
        writer = SummaryWriter()

    # Hyperparameters
    with open(opt.hyp) as f_hyp:
        hyp = yaml.safe_load(f_hyp)
    # data dict
    with open(opt.data) as f_data:
        data = yaml.safe_load(f_data)

    # Distributed training initialize
    device = select_device(opt.device)
    if opt.local_rank != -1:
        dist.init_process_group(init_method="env://", backend='nccl')
        torch.cuda.set_device(opt.local_rank)
        device = torch.device(f"cuda:{opt.local_rank}")
        # world_size = torch.distributed.get_world_size()

    init_seeds()
    cuda = device.type != 'cpu'
    torch.backends.cudnn.benchmark = True

    if multi_scale:
        # Multi-scale range is +/- 50% of the base size, in 32-pixel strides
        img_size_min = round(img_size / 32 / 1.5) + 1
        img_size_max = round(img_size / 32 * 1.5) - 1
        img_size = img_size_max * 32  # initiate with maximum multi_scale size
        logger.info(f'Using multi-scale  {img_size_min * 32} - {img_size}')

    train_path = data['train']
    num_classes = int(data['num_classes'])  # number of classes

    # Load dataset
    dataset = LoadImagesAndLabels(train_path,
                                  img_size,
                                  batch_size,
                                  augment=True,
                                  hyp=hyp,
                                  rect=opt.rect)
    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) if opt.local_rank != -1 else None
    # NOTE(review): torch.cuda.device_count() is 0 on CPU-only machines, which
    # would raise ZeroDivisionError here -- confirm CUDA is a hard requirement.
    num_worker = os.cpu_count() // torch.cuda.device_count()
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             num_workers=min([num_worker, batch_size, 8]),
                                             shuffle=not (opt.rect or train_sampler),
                                             sampler=train_sampler,
                                             pin_memory=True,
                                             collate_fn=dataset.collate_fn)

    # Load model
    model = Model(cfg, img_size, arc=arc).to(device)

    # Load teacher model
    if teacher_cfg:
        teacher_model = Model(teacher_cfg, img_size, arc).to(device)

    # optimizer parameter groups: conv weights get weight decay, the rest do not
    param_group0, param_group1 = [], []
    for key, value in model.named_parameters():
        if 'Conv2d.weight' in key:
            param_group1.append(value)
        else:
            param_group0.append(value)
    if opt.adam:
        optimizer = optim.Adam(param_group0, lr=hyp['lr0'])
    else:
        optimizer = optim.SGD(param_group0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    # add param_group1 with weight_decay
    optimizer.add_param_group({'params': param_group1, 'weight_decay': hyp['weight_decay']})
    logger.info(f'Optimizer groups: {len(param_group1)} conv.weight, {len(param_group0)} other')
    del param_group0, param_group1

    start_epoch = 0
    best_fitness = 0.
    if weights.endswith('.pt'):
        # Resume / warm-start from a PyTorch checkpoint
        checkpoint = torch.load(weights, map_location=device)
        # Keep only weights whose names and shapes match the current model
        state_dict = intersect_dicts(checkpoint['model'], model.state_dict())
        model.load_state_dict(state_dict, strict=False)
        print('loaded weights from', weights, '\n')

        # load optimizer
        if checkpoint['optimizer'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_fitness = checkpoint['best_fitness']
        # load results
        if checkpoint.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(checkpoint['training_results'])
        # resume
        if opt.resume:
            start_epoch = checkpoint['epoch'] + 1
        del checkpoint

    elif len(weights) > 0:
        # weights are 'yolov4.weights', 'darknet53.conv.74' etc.
        load_darknet_weights(model, weights)
        logger.info(f'loaded weights from {weights}\n')

    # Load teacher weights
    if teacher_cfg:
        if teacher_weights.endswith('.pt'):
            teacher_model.load_state_dict(torch.load(teacher_weights, map_location=device)['model'])
        elif teacher_weights.endswith('.weights'):
            load_darknet_weights(teacher_model, teacher_weights)
        else:
            raise Exception('pls provide proper teacher weights for knowledge distillation')
        # NOTE(review): 'mixed_precision' is a module-level flag (presumably set
        # where the apex import is attempted) -- confirm. With AMP the teacher
        # is left in train() mode here, which looks unintentional.
        if not mixed_precision:
            teacher_model.eval()
        logger.info('<......................using knowledge distillation....................>')
        logger.info(f'teacher model: {teacher_weights}\n')

    # Sparsity training
    # NOTE(review): prune_index is only assigned when opt.prune == 0; any other
    # prune mode would hit a NameError at the loop over prune_index below -- confirm.
    if opt.prune == 0:
        _, _, prune_index = parse_module_index(model.module_dicts)
        if sparsity_training:
            logger.info('normal sparse training')

    if mixed_precision:
        # apex AMP: patch model(s) and optimizer for O1 mixed precision
        if teacher_cfg:
            [model, teacher_model], optimizer = amp.initialize([model, teacher_model], optimizer,
                                                               opt_level='O1', verbosity=1)
        else:
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=1)

    # SyncBatchNorm and distributed training
    if cuda and opt.local_rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        model = model.to(device)
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[opt.local_rank])
        # Re-expose attributes hidden behind the DDP wrapper
        model.module_list = model.module.module_list
        model.yolo_layers = model.module.yolo_layers

    # Log the initial BN-weight distribution of each prunable layer
    for index in prune_index:
        bn_weights = gather_bn_weights(model.module_list, [index])
        if opt.local_rank == 0:
            writer.add_histogram('before_train_per_layer_bn_weights/hist', bn_weights.numpy(), index, bins='doane')

    # Start training
    model.num_classes = num_classes
    model.arc = opt.arc
    model.hyp = hyp
    num_batch_size = len(dataloader)
    # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    results = (0, 0, 0, 0, 0, 0, 0)
    start_train_time = time.time()
    logger.info('Image sizes %d \n Starting training for %d epochs...', img_size, epochs)

    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Running means over the epoch for the progress bar
        mean_losses = torch.zeros(4).to(device)
        mean_soft_target = torch.zeros(1).to(device)
        pbar = enumerate(dataloader)
        logger.info(('\n %10s %10s %10s %10s %10s %10s %10s %10s'), 'Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total',
                    'targets', 'img_size')
        if opt.local_rank in [-1, 0]:
            pbar = tqdm(pbar, total=num_batch_size)
        optimizer.zero_grad()

        for i, (imgs, targets, _, _) in pbar:  # batch -------------------------------------------------------------
            num_integrated_batches = i + num_batch_size * epoch

            # Adjust the learning rate
            learning_rate = adjust_learning_rate(optimizer, num_integrated_batches, num_batch_size, hyp, epoch, epochs)
            if i == 0 and opt.local_rank in [-1, 0]:
                logger.info(f'learning rate: {learning_rate}')
            imgs = imgs.to(device) / 255.0
            targets = targets.to(device)

            # Multi-Scale training
            if multi_scale:
                # NOTE(review): '/' is true division; integer '//' was probably
                # intended. The test still fires on exact multiples of
                # 10 * accumulate -- confirm.
                if num_integrated_batches / accumulate % 10 == 0:
                    img_size = random.randrange(img_size_min, img_size_max + 1) * 32
                scale_factor = img_size / max(imgs.shape[2:])
                if scale_factor != 1:
                    new_shape = [math.ceil(x * scale_factor / 32.) * 32 for x in imgs.shape[2:]]
                    imgs = F.interpolate(imgs, size=new_shape, mode='bilinear', align_corners=False)

            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model)

            # knowledge distillation
            soft_target = 0
            if teacher_cfg:
                if mixed_precision:
                    with torch.no_grad():
                        output_teacher = teacher_model(imgs)
                else:
                    _, output_teacher = teacher_model(imgs)
                soft_target = distillation_loss(pred, output_teacher, model.num_classes, imgs.size(0))
                loss += soft_target

            # Scale loss by nominal batch_size of 64
            loss *= batch_size / 64

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Sparse the BN layer that needs pruning
            if sparsity_training:
                # bn_l1_regularization(model.module_list, opt.penalty_factor, cba_index, epoch, epochs)
                bn_l1_regularization(model.module_list, opt.penalty_factor, prune_index, epoch, epochs)

            # Accumulate gradient for x batches before optimizing
            if num_integrated_batches % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()

            if opt.local_rank in [-1, 0]:
                # Incremental running means: mean_k = (mean_{k-1} * k + x_k) / (k + 1)
                mean_losses = (mean_losses * i + loss_items) / (i + 1)
                mean_soft_target = (mean_soft_target * i + soft_target) / (i + 1)
                memory = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0  # (GB)
                description = ('%10s' * 2 + '%10.3g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), '%.3gG' % memory, *mean_losses, mean_soft_target, img_size)
                pbar.set_description(description)

            # end batch ------------------------------------------------------------------------------------------------

        # Update scheduler
        # scheduler.step()

        if opt.local_rank in [-1, 0]:
            final_epoch = epoch + 1 == epochs
            # Calculate mAP
            if not (opt.notest or opt.nosave) or final_epoch:
                with torch.no_grad():
                    results, _ = test(cfg, data,
                                      batch_size=batch_size,
                                      img_size=opt.img_size,
                                      model=model,
                                      conf_thres=0.001 if final_epoch and epoch > 0 else 0.1,  # 0.1 for speed
                                      save_json=final_epoch and epoch > 0)

            # Write epoch results
            # NOTE(review): 'description' is only bound inside the batch loop;
            # an empty dataloader would raise NameError here -- confirm.
            with open(results_file, 'a') as file:
                # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
                file.write(description + '%10.3g' * 7 % results + '\n')

            # Write Tensorboard results
            if writer:
                outputs = list(mean_losses) + list(results)
                titles = ['GIoU', 'Objectness', 'Classification', 'Train loss',
                          'Precision', 'Recall', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification']
                for output, title in zip(outputs, titles):
                    writer.add_scalar(title, output, epoch)
                bn_weights = gather_bn_weights(model.module_list, prune_index)
                writer.add_histogram('bn_weights/hist', bn_weights.numpy(), epoch, bins='doane')

            # Update best mAP (results[2] is mAP)
            fitness = results[2]
            if fitness > best_fitness:
                best_fitness = fitness

            # Save training results
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save and opt.local_rank == 0:
                with open(results_file, 'r') as file:
                    # Create checkpoint
                    checkpoint = {'epoch': epoch,
                                  'best_fitness': best_fitness,
                                  'training_results': file.read(),
                                  'model': model.module.state_dict() if isinstance(
                                   model, nn.parallel.DistributedDataParallel) else model.state_dict(),
                                  'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last checkpoint
                torch.save(checkpoint, last)

                # Save best checkpoint
                if best_fitness == fitness:
                    torch.save(checkpoint, best)

                # Delete checkpoint
                del checkpoint

            # end epoch -----------------------------------------------------------------------------------------------
    # end training

    if opt.local_rank in [-1, 0]:
        if len(opt.name):
            os.rename('results.txt', 'results_%s.txt' % opt.name)
        plot_results()  # save as results.png
        print(f'{epoch - start_epoch + 1} epochs completed in {(time.time() - start_train_time) / 3600:.3f} hours.\n')
    if torch.cuda.device_count() > 1:
        dist.destroy_process_group()
    torch.cuda.empty_cache()
    return results
Example #9
0
class DDPG:

	"""Deep Deterministic Policy Gradient agent.

	Inputs: the action-space size, the physical state-space size, the radar
	feature dimension, and the type of learning (Straight/left/right).
	"""
	def __init__(self, action_space, state_space, radar_dim, type):
		self.action_space = action_space
		self.state_space = state_space
		self.radar_space = radar_dim
		# Action (steering) range used to clip policy outputs
		self.lower_bound = 0.0
		self.upper_bound = 0.6 # Steer limit
		self.epsilon = 0.8  # initial exploration rate
		self.gamma = .99  # discount factor
		self.batch_size = 128
		self.epsilon_min = .1
		self.epsilon_decay = .997

		self.critic_lr = 0.006
		self.actor_lr = 0.006

		# Custom tensorboard object
		now = time.localtime()
		self.tensorboard = ModifiedTensorBoard(log_dir=f"logs/{MODEL_NAME}-Feb_{now.tm_mday}_{now.tm_min}_{now.tm_hour}_{self.radar_space}_{self.actor_lr}_{self.batch_size}")
		self.type = type

		# Networks
		# we need to share some weights in between actor <--> critic
		# that we will do after every update
		self.actor = FeedForwardNN(self.radar_space, self.state_space, self.action_space, "actor")
		self.critic = FeedForwardNN(self.radar_space, self.state_space, 1, "critic")

		# Target model this is what we .predict against every step
		# NOTE(review): these are aliases of the online networks, NOT copies, so
		# the soft updates in update_target() mutate actor/critic directly and
		# the targets never lag behind. A deep copy was probably intended -- confirm.
		self.target_update_counter = 0
		self.target_actor = self.actor
		self.target_critic = self.critic

		# We use different np.arrays for each tuple element for replay memory
		self.buffer_capacity=50_000
		self.buffer_counter = 0;
		self.state_buffer = np.zeros((self.buffer_capacity, self.state_space))
		self.action_buffer = np.zeros((self.buffer_capacity, self.action_space))
		self.reward_buffer = np.zeros((self.buffer_capacity, 1))
		self.next_state_buffer = np.zeros((self.buffer_capacity, self.state_space))
		self.radar_buffer = np.zeros((self.buffer_capacity, self.radar_space))
		self.next_radar_buffer = np.zeros((self.buffer_capacity, self.radar_space))

		self.t_so_far = 0
		self.writer = SummaryWriter(log_dir=f"runs/Feb_{now.tm_mday}_{now.tm_min}_{now.tm_hour}_{self.radar_space}_{self.actor_lr}_{self.batch_size}")


	# Takes (s,a,r,s') obervation tuple as input
	def remember(self,radar_state, radar_state_next, state, action, reward, next_state, done=None):
		"""Store one transition in the circular replay buffer."""
		# Set index to zero if buffer_capacity is exceeded,
		# replacing old records
		index = self.buffer_counter % self.buffer_capacity

		self.radar_buffer[index] = radar_state
		self.next_radar_buffer[index] = radar_state_next
		self.state_buffer[index] = state
		self.action_buffer[index] = action
		self.reward_buffer[index] = reward
		self.next_state_buffer[index] = next_state

		self.buffer_counter += 1


	# policy() returns an action sampled from our Actor network plus
	# some noise for exploration.
	def policy(self, radar_state, physical_state):
		"""Epsilon-greedy action selection, clipped to the legal steering range."""
		# .squeeze() function returns a tensor with the same value as its first
		# argument, but a different shape. It removes dimensions whose size is one.
		if np.random.rand() <= self.epsilon:
			# Explore: random steering value in [0, 1)
			sampled_actions = torch.rand(1)

		else:
			# Exploit: deterministic action from the actor network
			sampled_actions = self.actor(radar_state, physical_state, None)
			sampled_actions = sampled_actions.detach().numpy()
			# sampled_actions = np.array([(x+1)/2 for x in sampled_actions])

		# Decay exploration on every call until epsilon_min is reached
		if self.epsilon > self.epsilon_min:
			self.epsilon *= self.epsilon_decay

		# We make sure action is within bounds
		# Clip (limit) the values in an array here b/w lower and upper bound
		legal_action = np.clip(sampled_actions, self.lower_bound, self.upper_bound)
		return np.squeeze(legal_action)


	# We compute the loss and update parameters (learn)
	def replay(self):
		"""Sample a batch from replay memory and update actor and critic.

		Returns (actor_loss, critic_loss) as numpy scalars.
		"""
		# Get sampling range
		record_range = min(self.buffer_counter, self.buffer_capacity)
		# Randomly sample indices(batch)
		batch_indices = np.random.choice(record_range, self.batch_size)

		# Convert to tensors
		state_batch = torch.tensor(self.state_buffer[batch_indices], dtype=torch.float)
		action_batch = torch.tensor(self.action_buffer[batch_indices], dtype=torch.float)
		reward_batch = torch.tensor(self.reward_buffer[batch_indices], dtype=torch.float)
		next_state_batch = torch.tensor(self.next_state_buffer[batch_indices], dtype=torch.float)
		radar_batch = torch.tensor(self.radar_buffer[batch_indices], dtype=torch.float)
		next_radar_batch = torch.tensor(self.next_radar_buffer[batch_indices], dtype=torch.float)


		"""
		``````````````````````````````````````````````````````````````````````````
		# We are missing one more step
		# We got to match some preprocess layers of actor and critic
		# Create a function and call in between the above and here too
		``````````````````````````````````````````````````````````````````````````
		"""

		# Setting the Actor and Critic common shared layer as mean of both

		tau = 0.01
		new_dict = dict(self.critic.named_parameters())
		for name, param in self.actor.named_parameters():
			if 'layer' in name:
				new_dict[name] = (tau*param.data + (1-tau)*new_dict[name])

		# old_dict = dict(self.critic.named_parameters())
		self.critic.load_state_dict(new_dict)

		new_dict = dict(self.actor.named_parameters())
		for name, param in self.critic.named_parameters():
			if 'layer' in name:
				new_dict[name] = (tau*param.data + (1-tau)*new_dict[name])

		self.actor.load_state_dict(new_dict)



		# Critic Network
		# TD target: r + gamma * Q_target(s', pi_target(s'))
		target_actions = self.target_actor(next_radar_batch, next_state_batch, None)
		y = reward_batch + self.gamma * self.target_critic(next_radar_batch, next_state_batch, target_actions)
		critic_value = self.critic(radar_batch, state_batch, action_batch)
		critic_loss = torch.mean((y-critic_value)**2)

		# NOTE(review): a fresh Adam is constructed on every replay() call, so
		# optimizer moment estimates are reset each update -- confirm intended.
		critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
		critic_optimizer.zero_grad()
		critic_loss.backward()
		critic_optimizer.step()

		# Actor Network
		# NOTE(review): critic_value is computed from action_batch (replay
		# actions), not from `actions` produced by the actor above, so no
		# gradient flows into the actor here -- standard DDPG uses
		# self.critic(radar_batch, state_batch, actions). Confirm.
		actions = self.actor(radar_batch, state_batch, None)
		critic_value = self.critic(radar_batch, state_batch, action_batch)
		actor_loss = -torch.mean(critic_value)

		actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
		actor_optimizer.zero_grad()
		actor_loss.backward()
		actor_optimizer.step()

		return actor_loss.detach().numpy(), critic_loss.detach().numpy()


	# This update target parameters slowly
	# Based on rate `tau`, which is much less than one ~0.001 order
	# This also logs the historgrams

	def update_target(self, tau, val):
		"""Soft-update (tau < 1) or hard-copy (tau >= 1) online weights into the
		target networks, and log weight histograms every 10th call."""
		if(tau<1):
			# Polyak averaging: theta_target <- tau*theta + (1-tau)*theta_target
			new_dict = dict(self.target_critic.named_parameters())
			for name, param in self.critic.named_parameters():
			    new_dict[name].data = (param.data * tau + new_dict[name].data * (1 - tau))

			self.target_critic.load_state_dict(new_dict)

			new_dict = dict(self.target_actor.named_parameters())
			for name, param in self.actor.named_parameters():
			    new_dict[name].data = (param.data * tau + new_dict[name].data * (1 - tau))

			self.target_actor.load_state_dict(new_dict)

		else:
			# Hard copy of the full state dicts
			self.target_critic.load_state_dict(self.critic.state_dict())
			self.target_actor.load_state_dict(self.actor.state_dict())


		# Log the histogram data of the Actor/Critic Network
		if(val%10==0):
			for name, param in self.actor.named_parameters():
				if 'weight' in name:
					self.writer.add_histogram("actor"+name, param.detach().numpy(), self.t_so_far)

			for name, param in self.critic.named_parameters():
				if 'weight' in name:
					self.writer.add_histogram("critic"+name, param.detach().numpy(), self.t_so_far)

			self.t_so_far += 1


	def save_model(self):
		"""Save online and target network weights for this manoeuvre type."""
		# serialize weights to HDF5
		print("---Saved modelweights to disk---")
		# Save the weights
		torch.save(self.actor.state_dict(), str(self.type) + "_DDPGactor.pth")
		torch.save(self.critic.state_dict(), str(self.type) + "_DDPGcritic.pth")

		torch.save(self.target_actor.state_dict(), str(self.type) + "_target_actor.pth")
		torch.save(self.target_critic.state_dict(), str(self.type) + "_target_critic.pth")
Example #10
0
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Calculate 'running' training accuracy
                features = data.reshape(data.shape[0], -1)
                img_grid = torchvision.utils.make_grid(data)
                _, predictions = scores.max(1)
                num_correct = (predictions == targets).sum()
                running_train_acc = float(num_correct) / float(data.shape[0])
                accuracies.append(running_train_acc)

                # Plot things to tensorboard
                class_labels = [classes[label] for label in predictions]
                writer.add_image("mnist_images", img_grid)
                writer.add_histogram("fc1", model.fc1.weight)
                writer.add_scalar("Training loss", loss, global_step=step)
                writer.add_scalar("Training Accuracy",
                                  running_train_acc,
                                  global_step=step)

                if batch_idx == 230:
                    writer.add_embedding(
                        features,
                        metadata=class_labels,
                        label_img=data,
                        global_step=batch_idx,
                    )
                step += 1

            writer.add_hparams(
Example #11
0
def train(appliance_name,
          model,
          mains,
          appliance,
          epochs,
          batch_size,
          pretrain,
          checkpoint_interval=None,
          train_patience=3):
    """Train an appliance-disaggregation model with early stopping.

    Splits mains/appliance 80/20 into train/validation sets, optimizes the
    model with Adam + MSE, saves the best-validation-loss state dict, and logs
    parameter/gradient histograms plus the validation loss to TensorBoard
    every epoch.

    :param appliance_name: prefix for saved state-dict / checkpoint filenames
    :param model: torch.nn.Module mapping mains windows to appliance power
    :param mains: numpy array of aggregate-power windows
                  (assumed (N, L, 1); permuted to (N, 1, L) -- TODO confirm)
    :param appliance: numpy array of per-appliance targets aligned with mains
    :param epochs: maximum number of epochs to train
    :param batch_size: batch size for both train and validation loaders
    :param pretrain: when False, (re-)initialize weights via `initialize`
    :param checkpoint_interval: save a full checkpoint every N epochs
                                (None disables checkpointing)
    :param train_patience: epochs without validation improvement before stopping
    """
    # Model configuration
    if USE_CUDA:
        model = model.cuda()
    if not pretrain:
        model.apply(initialize)
    # summary(model, (1, mains.shape[1])) Wrong with torchsummary API
    # Split the train and validation set (20% held out for validation)
    train_mains, valid_mains, train_appliance, valid_appliance = train_test_split(
        mains, appliance, test_size=.2, random_state=random_seed)

    # Create optimizer, loss function, and dataloader
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.MSELoss(reduction='mean')

    # Permute to channels-first layout expected by the network
    train_dataset = TensorDataset(
        torch.from_numpy(train_mains).float().permute(0, 2, 1),
        torch.from_numpy(train_appliance).float())
    train_loader = tud.DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=0,
                                  drop_last=True)

    valid_dataset = TensorDataset(
        torch.from_numpy(valid_mains).float().permute(0, 2, 1),
        torch.from_numpy(valid_appliance).float())
    valid_loader = tud.DataLoader(valid_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=0,
                                  drop_last=True)

    writer = SummaryWriter(comment='train_visual')
    patience, best_loss = 0, None

    try:
        for epoch in range(epochs):
            # Earlystopping (>= is robust even if patience overshoots)
            if patience >= train_patience:
                print(
                    "val_loss did not improve after {} Epochs, thus Earlystopping is calling"
                    .format(train_patience))
                break
            # Train the model
            model.train()

            st = time.time()
            for i, (batch_mains, batch_appliance) in enumerate(train_loader):
                if USE_CUDA:
                    batch_mains = batch_mains.cuda()
                    batch_appliance = batch_appliance.cuda()

                batch_pred = model(batch_mains)
                loss = loss_fn(batch_appliance, batch_pred)

                model.zero_grad()
                loss.backward()
                optimizer.step()
            ed = time.time()

            # Evaluate the model
            model.eval()
            with torch.no_grad():
                cnt, loss_sum = 0, 0
                for i, (batch_mains, batch_appliance) in enumerate(valid_loader):
                    if USE_CUDA:
                        batch_mains = batch_mains.cuda()
                        batch_appliance = batch_appliance.cuda()

                    batch_pred = model(batch_mains)
                    loss = loss_fn(batch_appliance, batch_pred)
                    # fix: accumulate a python float instead of a (CUDA) tensor
                    loss_sum += loss.item()
                    cnt += 1

            final_loss = loss_sum / cnt
            # Save best only
            if best_loss is None or final_loss < best_loss:
                best_loss = final_loss
                patience = 0
                net_state_dict = model.state_dict()
                path_state_dict = "./" + appliance_name + "_bilstm_best_state_dict.pt"
                torch.save(net_state_dict, path_state_dict)
            else:
                patience = patience + 1
            print("Epoch: {}, Valid_Loss: {}, Time consumption: {}s.".format(
                epoch, final_loss, ed - st))
            # For the visualization of training process
            for name, param in model.named_parameters():
                # fix: frozen/unused parameters have grad None, which would
                # crash add_histogram -- skip their gradient histogram
                if param.grad is not None:
                    writer.add_histogram(name + '_grad', param.grad, epoch)
                writer.add_histogram(name + '_data', param, epoch)
            writer.add_scalars("MSELoss", {"Valid": final_loss}, epoch)

            # Save checkpoint
            if (checkpoint_interval is not None) and ((epoch + 1) % checkpoint_interval
                                                      == 0):
                checkpoint = {
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "epoch": epoch
                }
                path_checkpoint = "./" + appliance_name + "_bilstm_checkpoint_{}_epoch.pkl".format(
                    epoch)
                torch.save(checkpoint, path_checkpoint)
    finally:
        # fix: the SummaryWriter was never closed; flush pending events and
        # release the event-file handle even if training raises
        writer.close()
Example #12
0
class RunManager:
    """
    Tracks the overall training progress across runs and epochs, mirroring
    metrics/images to TensorBoard and keeping the best model per run.
    """
    def __init__(self):
        # Bookkeeping helpers for the current epoch and run
        self.epoch = Epoch()
        self.run = Run()
        # Populated by begin_run() and cleared again by end_run()
        self.net = None
        self.images = None
        self.noisy_images = None
        self.tb = None
        self.min_val_loss = float('inf')

    def begin_run(self, hparams, net, test_images, test_noisy_images):
        """Open a new run: store the network and test batches, log references."""
        self.run.begin(hparams)

        # Wire up the network, fixed test data and a fresh SummaryWriter
        self.net = net
        self.images = test_images
        self.noisy_images = test_noisy_images
        self.tb = SummaryWriter(comment=f'-{hparams}')

        # Log clean and corrupted reference batches once per run
        clean_grid = make_grid(to_img(self.images), nrow=10)
        corrupt_grid = make_grid(to_img(self.noisy_images), nrow=10)
        self.tb.add_image('original images', clean_grid)
        self.tb.add_image('noisy images', corrupt_grid)
        self.tb.add_graph(self.net, to_img(self.images))
        self.save_img(clean_grid, 'original_images.png')
        self.save_img(corrupt_grid, 'noisy_images.png')

    def end_run(self):
        """Flush TensorBoard, drop per-run state and close out the run."""
        self.tb.flush()
        self.tb.close()
        self.net = None
        self.images = None
        self.noisy_images = None
        self.tb = None
        self.min_val_loss = float('inf')
        self.run.end(self.epoch)

    def begin_epoch(self):
        """Start the next epoch; requires an active run."""
        assert self.run.active, "Run is not active, cannot initialise epoch"
        self.epoch.begin()

    def end_epoch(self):
        """Close the epoch: log losses, reconstructions and histograms, and
        persist the model when validation loss improves."""
        run_duration = self.run.duration()
        epoch_duration, train_loss, val_loss = self.epoch.end()

        self.tb.add_scalar('Training loss', train_loss, self.epoch.count)
        self.tb.add_scalar('Validation loss', val_loss, self.epoch.count)

        # Reconstruct the fixed noisy test batch for visual progress tracking
        with torch.no_grad():
            reconstructions = to_img(self.net(self.noisy_images))
            grid = make_grid(reconstructions, nrow=10)
            self.tb.add_image('reconstructed images', grid, self.epoch.count)

        # One weight + one gradient histogram per parameter tensor
        for name, param in self.net.named_parameters():
            self.tb.add_histogram(name, param, self.epoch.count)
            self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch.count)

        if val_loss < self.min_val_loss:
            # New best validation loss: save the model and a snapshot image
            torch.save(self.net,
                       './models/best_' + str(self.run.hparams) + '.pth')
            self.min_val_loss = val_loss
            self.save_img(grid, 'epoch{0}.png'.format(self.epoch.count))

        # Collect the epoch summary in display order
        results = OrderedDict([
            ('run', self.run.count),
            ('epoch', self.epoch.count),
            ('train loss', train_loss),
            ('validation loss', val_loss),
            ('epoch duration', epoch_duration),
            ('run duration', run_duration),
        ])
        results.update(self.run.hparams._asdict())

        self.run.append_and_display_data(results)

    def track_loss(self, loss, batch_size, mode='train'):
        """Forward a batch loss to the epoch accumulator."""
        self.epoch.add_loss(loss, batch_size, mode)

    def save(self, filename):
        """Persist the collected run data."""
        self.run.save(filename)

    # Save image to local directory
    def save_img(self, grid, filename):
        """Write an image grid to ./gif/<filename>, creating the folder on demand."""
        if not os.path.exists('./gif'):
            os.mkdir('./gif')

        plt.figure(figsize=(15, 15))
        plt.imsave('./gif/' + filename, np.transpose(grid, (1, 2, 0)).numpy())
Example #13
0
class Runner(object):
    """Synchronous A2C training-loop driver.

    Wires together the environment, the actor-critic network and the
    rollout storage, then alternates `rollout_size`-step rollouts with
    one gradient update each.
    """

    def __init__(self,
                 net,
                 env,
                 num_envs,
                 n_stack,
                 rollout_size=5,
                 num_updates=2500000,
                 max_grad_norm=0.5,
                 value_coeff=0.5,
                 entropy_coeff=0.02,
                 tensorboard_log=False,
                 log_path="./log",
                 is_cuda=True,
                 seed=42):
        """
        Args:
            net: actor-critic network exposing `.a2c` and `.optimizer`.
            env: (vectorised) environment with reset()/step()/close().
            num_envs: number of parallel environments.
            n_stack: number of stacked observation frames.
            rollout_size: environment steps collected per update.
            num_updates: total number of gradient updates to run.
            max_grad_norm: gradient-clipping threshold.
            value_coeff: critic-loss weight (forwarded to the storage).
            entropy_coeff: entropy-bonus weight (forwarded to the storage).
            tensorboard_log: when True, log to TensorBoard under log_path.
            log_path: TensorBoard log directory.
            is_cuda: use CUDA when requested and available.
            seed: RNG seed (stored only; not applied here).
        """
        super().__init__()

        # constants
        self.num_envs = num_envs
        self.rollout_size = rollout_size
        self.num_updates = num_updates
        self.n_stack = n_stack
        self.seed = seed

        self.max_grad_norm = max_grad_norm

        # CUDA only if requested AND actually available on this machine.
        self.is_cuda = torch.cuda.is_available() and is_cuda

        # objects
        """Tensorboard logger"""
        self.writer = SummaryWriter(
            comment="statistics",
            log_dir=log_path) if tensorboard_log else None
        """Environment"""
        self.env = env

        self.storage = RolloutStorage(self.rollout_size,
                                      self.num_envs,
                                      self.env.observation_space.shape[0:-1],
                                      self.n_stack,
                                      is_cuda=self.is_cuda,
                                      value_coeff=value_coeff,
                                      entropy_coeff=entropy_coeff,
                                      writer=self.writer)
        """Network"""
        self.net = net
        self.net.a2c.writer = self.writer

        if self.is_cuda:
            self.net = self.net.cuda()

        # self.writer.add_graph(self.net, input_to_model=(self.storage.states[0],)) --> not working for LSTMCEll

    def train(self):
        """Run the full A2C training loop, then close the environment."""
        obs = self.env.reset()
        self.storage.states[0].copy_(self.storage.obs2tensor(obs))
        best_loss = np.inf

        for num_update in range(self.num_updates):

            final_value, entropy = self.episode_rollout()

            self.net.optimizer.zero_grad()
            """Assemble loss"""
            loss = self.storage.a2c_loss(final_value, entropy)
            loss.backward(retain_graph=False)

            # gradient clipping
            nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)

            if self.writer is not None:
                # BUGFIX: pass the update index as global_step; without it
                # every scalar was logged at the same step.
                self.writer.add_scalar("loss", loss.item(), num_update)

            self.net.optimizer.step()

            # it stores a lot of data which let's the graph
            # grow out of memory, so it is crucial to reset
            self.storage.after_update()

            if loss < best_loss:
                best_loss = loss.item()
                print("model saved with best loss: ", best_loss,
                      " at update #", num_update)
                torch.save(self.net.state_dict(), "a2c_best_loss")

            elif num_update % 10 == 0:
                print("current loss: ", loss.item(), " at update #",
                      num_update)
                self.storage.print_reward_stats()

            # BUGFIX: this used to be a third `elif`; every multiple of
            # 100 is also a multiple of 10, so that branch was
            # unreachable dead code and the periodic checkpoint never ran.
            if num_update % 100 == 0:
                torch.save(self.net.state_dict(), "a2c_time_log_no_norm")

            if self.writer is not None and len(
                    self.storage.episode_rewards) > 1:
                # BUGFIX: log the histogram at the current update step.
                self.writer.add_histogram(
                    "episode_rewards",
                    torch.tensor(self.storage.episode_rewards),
                    num_update)

        self.env.close()

    def episode_rollout(self):
        """Collect one `rollout_size`-step rollout.

        Returns:
            (final_value, episode_entropy): the critic's bootstrap value
            for the state after the rollout, and the summed policy
            entropy over the rollout.
        """
        episode_entropy = 0
        for step in range(self.rollout_size):
            """Interact with the environments """
            # call A2C
            a_t, log_p_a_t, entropy, value, a2c_features = self.net.a2c.get_action(
                self.storage.get_state(step))
            # accumulate episode entropy
            episode_entropy += entropy

            # interact
            obs, rewards, dones, infos = self.env.step(a_t.cpu().numpy())

            # save episode reward
            self.storage.log_episode_rewards(infos)

            self.storage.insert(step, rewards, obs, a_t, log_p_a_t, value,
                                dones)
            self.net.a2c.reset_recurrent_buffers(reset_indices=dones)

        # Note:
        # get the estimate of the final reward
        # that's why we have the CRITIC --> estimate final reward
        # detach, as the final value will only be used as a bootstrap
        with torch.no_grad():
            _, _, _, final_value, final_features = self.net.a2c.get_action(
                self.storage.get_state(step + 1))

        return final_value, episode_entropy
Example #14
0
                collector_reconstruction_loss.mean(),
                iteration,
            )
            writer.add_scalar("imq_mmd_average_20_obs",
                              collector_imq_mmd.mean(), iteration)
            writer.add_scalar("codes_min_over_20_obs",
                              collector_codes_min.min(), iteration)
            writer.add_scalar("codes_max_over_20_obs",
                              collector_codes_max.max(), iteration)

            if iteration % (knobs["time_to_collect"] * 4) == 0:

                it_encoder_parameters = encoder.parameters()
                for k, v in encoder.state_dict().items():
                    if k.find("bias") != -1 or k.find("weight") != -1:
                        writer.add_histogram("encoder/" + k.replace(".", "/"),
                                             v, iteration)
                        writer.add_histogram(
                            "encoder/" + k.replace(".", "/") + "/grad",
                            next(it_encoder_parameters).grad,
                            iteration,
                        )
                it_decoder_parameters = decoder.parameters()
                for k, v in decoder.state_dict().items():
                    if k.find("bias") != -1 or k.find("weight") != -1:
                        writer.add_histogram("decoder/" + k.replace(".", "/"),
                                             v, iteration)
                        writer.add_histogram(
                            "decoder/" + k.replace(".", "/") + "/grad",
                            next(it_decoder_parameters).grad,
                            iteration,
                        )
Example #15
0
# Log one training run of `network` to TensorBoard: the input grid, the
# model graph, per-epoch loss/accuracy scalars and parameter histograms.
comment = f' batch_size={batch_size} lr={lr}'
tb = SummaryWriter(comment=comment)
tb.add_image('images', grid)
tb.add_graph(network, images)

for epoch in range(1):
    total_loss = 0
    total_correct = 0
    for batch in train_loader:
        images, labels = batch  # Get Batch
        preds = network(images)  # Pass Batch
        loss = F.cross_entropy(preds, labels)  # Calculate Loss
        optimizer.zero_grad()  # Zero Gradients
        loss.backward()  # Calculate Gradients
        optimizer.step()  # Update Weights

        # BUGFIX: weight the mean batch loss by the actual batch length;
        # the last batch of an epoch can be smaller than batch_size,
        # which the original `loss.item() * batch_size` over-counted.
        total_loss += loss.item() * images.shape[0]
        total_correct += get_num_correct(preds, labels)

    tb.add_scalar('Loss', total_loss, epoch)
    tb.add_scalar('Number Correct', total_correct, epoch)
    tb.add_scalar('Accuracy', total_correct / len(train_set), epoch)

    for name, param in network.named_parameters():
        tb.add_histogram(name, param, epoch)
        # Guard: frozen/unused parameters have grad=None, which
        # add_histogram rejects.
        if param.grad is not None:
            tb.add_histogram(f'{name}.grad', param.grad, epoch)

    print("epoch", epoch, "total_correct:", total_correct, "loss:", total_loss)
tb.close()
Example #16
0
class MRTR():
    """Driver for the mask-refine + inpainting (MRTR) pipeline.

    Builds the model, metrics and datasets from *config* and exposes
    train / eval / test / sample entry points.  ``config.MODE`` 2 or 4
    selects test-only operation; any other mode builds the train/val
    datasets plus a TensorBoard writer.
    """
    def __init__(self, config):
        self.config = config
        # Latest training iteration seen; used as the TensorBoard step.
        self.iteration = 0

        self.debug = False
        self.maskpreinpaint_model = MaskInpaintModel(config).to(config.DEVICE)

        # Evaluation metrics.
        self.psnr = PSNR(255.0).to(config.DEVICE)
        self.ssim = SSIM(5, reduction='mean')
        self.mse = torch.nn.MSELoss()
        self.maskacc = EdgeAccuracy(config.EDGE_THRESHOLD).to(config.DEVICE)

        # test mode
        if self.config.MODE == 2 or self.config.MODE == 4:
            self.test_dataset = Dataset(config,
                                        config.TEST_DATA,
                                        augment=False,
                                        training=False)
        else:
            # Create tfboard summary writer
            self.val_info = None
            self.is_best = True
            self.writer = SummaryWriter(self.config.LOG_DIR)
            self.train_dataset = Dataset(config,
                                         config.TRAIN_DATA,
                                         augment=True,
                                         training=True)
            self.val_dataset = Dataset(config,
                                       config.VAL_DATA,
                                       augment=False,
                                       training=True)
            # Endless iterator over SAMPLE_SIZE-sized val batches for sample().
            self.sample_iterator = self.val_dataset.create_iterator(
                config.SAMPLE_SIZE)

        self.samples_path = os.path.join(config.MODEL_DIR, 'samples')
        self.results_path = config.TEST_DIR

        # Optional override of where test results are written.
        if config.RESULTS is not None:
            self.results_path = os.path.join(config.RESULTS)

        if config.DEBUG is not None and config.DEBUG != 0:
            self.debug = True

        self.log_file = os.path.join(config.PATH, 'log.dat')

    def load(self):
        """Restore model weights from the configured checkpoint."""
        self.maskpreinpaint_model.load()

    def save(self):
        """Write current model weights to the configured checkpoint."""
        self.maskpreinpaint_model.save()

    def train(self):
        """Main training loop: iterate the train set until MAX_ITERS.

        Alternates forward/backward passes with periodic TensorBoard
        scalar / histogram / image logging, console progress, and the
        log/sample/eval/save hooks in ``_run_steps_after_train``.
        """
        train_loader = DataLoader(dataset=self.train_dataset,
                                  batch_size=self.config.BATCH_SIZE,
                                  num_workers=4,
                                  drop_last=True,
                                  pin_memory=True,
                                  shuffle=True)
        epoch = 0
        keep_training = True
        max_iteration = int(float(self.config.MAX_ITERS))

        if len(self.train_dataset) == 0:
            print(
                'No training data was provided! Check \'TRAIN_DATA\' value in the configuration file.'
            )
            return

        iteration = 0
        while keep_training:
            epoch += 1
            progbar = Progbar(stateful_metrics=['step'])
            for items in train_loader:
                self.maskpreinpaint_model.train()
                images, images_gt, masks, masks_gt, masks_refine_gt = self.get_inputs(
                    items)
                # train
                # Probability of using the *predicted* mask grows with the
                # iteration count, capped at MASK_SWITCH_RATIO.
                prob = np.minimum(
                    self.config.MASK_SWITCH_RATIO,
                    np.ceil(iteration / self.config.MASK_SWITCH_STEP) / 10)
                use_gt_mask = False if np.random.binomial(1, prob) else True
                images_gen, pre_images_gen, masks_gen, gen_loss, dis_loss, logs = \
                    self.maskpreinpaint_model.process(images, images_gt, masks, masks_gt, masks_refine_gt,
                                                      use_gt_mask=use_gt_mask)
                # Composite: generated content inside the mask, original
                # pixels outside it (see get_complete_preinpaint).
                masks_cmp = masks_gt if use_gt_mask else masks_gen * masks
                images_cmp = self.get_complete_preinpaint(
                    masks_cmp, images, images_gen)
                pre_images_cmp = self.get_complete_preinpaint(
                    masks_cmp, images, pre_images_gen)
                # backward
                self.maskpreinpaint_model.backward(gen_loss, dis_loss)
                iteration = self.maskpreinpaint_model.iteration
                # Tensorboard record: scala
                # (SAVE_SCALR_AT_STEP is the config key's own spelling.)
                if iteration % self.config.SAVE_SCALR_AT_STEP == 0:
                    self._write_logs(logs, iteration)

                # Weight and gradient histograms, less frequently.
                if iteration % self.config.SAVE_HIST_AT_STEP == 0:
                    for name, value in self.maskpreinpaint_model.named_parameters(
                    ):
                        self.writer.add_histogram(
                            'MaskPreinpaint_weight/' + name, value, iteration)
                        if value.grad is not None:
                            self.writer.add_histogram(
                                'MaskPreinpaint_grad/' + name, value.grad.data,
                                iteration)

                # Tensorboard record: image
                if iteration % self.config.SAVE_IMAGE_AT_STEP == 0:
                    image = self.get_tensorboard_image([
                        images, images_gt,
                        self.gray2rgb(masks),
                        self.gray2rgb(masks_refine_gt),
                        self.gray2rgb(masks_gen),
                        self.gray2rgb(masks_cmp), pre_images_gen,
                        pre_images_cmp, images_gen, images_cmp
                    ])

                    self.writer.add_image('Train/', image, iteration)

                if iteration % self.config.PRINT_AT_STEP == 0:
                    logs = [
                        ("step", str(epoch) + "/" + str(iteration)),
                    ] + logs
                    progbar.print_cur(self.config.PRINT_AT_STEP, values=logs)

                self.iteration = iteration
                self._run_steps_after_train(logs)

                if iteration >= max_iteration:
                    keep_training = False
                    break
            # Per-epoch learning-rate schedule steps.
            self.maskpreinpaint_model.gen_scheduler.step()
            self.maskpreinpaint_model.dis_scheduler.step()

        self.writer.close()
        print('\nEnd training....')

    def eval(self):
        """Evaluate on the validation set (modes 1/3) or the test set.

        Computes PSNR / MAE / SSIM / MSE for both final and
        pre-inpainted outputs (raw and composited) plus mask
        precision/recall; averages are printed via Progbar and, in
        validation mode, logged to TensorBoard at ``self.iteration``.

        NOTE(review): ``logs`` is rebound each batch by process(), so
        the appended metrics are per-batch; averaging happens inside
        Progbar.  ``i_logs`` is never populated here.
        """
        if self.config.MODE == 1 or self.config.MODE == 3:
            val_loader = DataLoader(dataset=self.val_dataset,
                                    batch_size=self.config.BATCH_SIZE,
                                    drop_last=True,
                                    shuffle=True)
            total = len(self.val_dataset)
        else:
            val_loader = DataLoader(dataset=self.test_dataset,
                                    batch_size=self.config.BATCH_SIZE,
                                    drop_last=False,
                                    shuffle=False)
            total = len(self.test_dataset)
            self.config.N_EVAL = 7

        self.maskpreinpaint_model.eval()

        logs = []
        i_logs = []
        progbar = Progbar(total, stateful_metrics=['it'])
        with torch.no_grad():
            for _iteration, items in enumerate(val_loader):
                images, images_gt, masks, masks_gt, masks_refine_gt = self.get_inputs(
                    items)  # edge model
                images_gen, pre_images_gen, masks_gen, gen_loss, dis_loss, logs = \
                    self.maskpreinpaint_model.process(images, images_gt, masks, masks_gt, masks_refine_gt,
                                                      use_gt_mask=self.config.EVAL_USE_GT_MASK)
                masks_cmp = masks_gen * masks
                images_cmp = self.get_complete_preinpaint(
                    masks_cmp, images, images_gen)
                pre_images_cmp = self.get_complete_preinpaint(
                    masks_cmp, images, pre_images_gen)

                #mask_blur = mask.filter(ImageFilter.GaussianBlur(10))
                #im = Image.composite(im1, im2, mask_blur)

                # metrics (final inpainting output)
                psnr = self.psnr(self.postprocess(images_gt),
                                 self.postprocess(images_gen))
                mae = (torch.sum(torch.abs(images_gt - images_gen)) /
                       torch.sum(images_gt)).float()
                psnr_cmp = self.psnr(self.postprocess(images_gt),
                                     self.postprocess(images_cmp))
                mae_cmp = (torch.sum(torch.abs(images_gt - images_cmp)) /
                           torch.sum(images_gt)).float()
                logs.append(('psnr', psnr.item()))
                logs.append(('mae', mae.item()))
                logs.append(('psnr_cmp', psnr_cmp.item()))
                logs.append(('mae_cmp', mae_cmp.item()))

                # metrics (pre-inpainting output)
                psnr = self.psnr(self.postprocess(images_gt),
                                 self.postprocess(pre_images_gen))
                mae = (torch.sum(torch.abs(images_gt - pre_images_gen)) /
                       torch.sum(images_gt)).float()
                psnr_cmp = self.psnr(self.postprocess(images_gt),
                                     self.postprocess(pre_images_cmp))
                mae_cmp = (torch.sum(torch.abs(images_gt - pre_images_cmp)) /
                           torch.sum(images_gt)).float()
                logs.append(('pre_psnr', psnr.item()))
                logs.append(('pre_mae', mae.item()))
                logs.append(('pre_psnr_cmp', psnr_cmp.item()))
                logs.append(('pre_mae_cmp', mae_cmp.item()))

                # SSIM is reported as (1 - ssim) * 100, i.e. a dissimilarity.
                ssim = self.ssim(images_gt, images_gen)
                ssim_cmp = self.ssim(images_gt, images_cmp)
                mse = self.mse(images_gt, images_gen)
                mse_cmp = self.mse(images_gt, images_cmp)
                logs.append(('ssim', (1 - ssim.item()) * 100))
                logs.append(('ssim_cmp', (1 - ssim_cmp.item()) * 100))
                logs.append(('mse', mse.item()))
                logs.append(('mse_cmp', mse_cmp.item()))

                ssim = self.ssim(images_gt, pre_images_gen)
                ssim_cmp = self.ssim(images_gt, pre_images_cmp)
                mse = self.mse(images_gt, pre_images_gen)
                mse_cmp = self.mse(images_gt, pre_images_cmp)
                logs.append(('pre_ssim', (1 - ssim.item()) * 100))
                logs.append(('pre_ssim_cmp', (1 - ssim_cmp.item()) * 100))
                logs.append(('pre_mse', mse.item()))
                logs.append(('pre_mse_cmp', mse_cmp.item()))

                # Hack: name of edgeacc
                mask_precision, mask_recall = self.maskacc(
                    masks_refine_gt * masks, masks_cmp)
                logs.append(('M_P', mask_precision.item()))
                logs.append(('M_R', mask_recall.item()))

                logs = logs + i_logs
                # Hide loss ('l_') and diff ('d_') entries unless VERBOSE.
                progbar.add(len(images),
                            values=logs if self.config.VERBOSE else [
                                x for x in logs if not x[0].startswith('l_')
                                and not x[0].startswith('d_')
                            ])
                # print(_iteration)

                if _iteration >= self.config.N_EVAL - 1:
                    break

        # Print the average values
        progbar.print_info()

        # Grid built from the *last* evaluated batch only.
        images = self.get_tensorboard_image([
            images, images_gt,
            self.gray2rgb(masks),
            self.gray2rgb(masks_refine_gt),
            self.gray2rgb(masks_gen),
            self.gray2rgb(masks_cmp), pre_images_gen, pre_images_cmp,
            images_gen, images_cmp
        ])

        if self.config.MODE == 1 or self.config.MODE == 3:
            # Writing to tfboard summary
            _val_info = {}
            # TODO: ensure following code is correct
            for item, value in progbar.get_average_log_values().items():
                if not item.startswith('l_'):
                    self.writer.add_scalar('Validation/' + item, value,
                                           self.iteration)
                    _val_info[item] = value

            # if self.val_info is None:
            #     self.val_info = _val_info
            #     self.is_best = True
            # else:
            #     if self.config.MODEL !=1 :
            #         if _val_info['psnr_cmp'] > self.val_info['psnr_cmp']:
            #             self.val_info = _val_info
            #             self.is_best = True
            #     elif self.config.MODEL == 1 :
            #         # Hack: only looked at mask recall, this might be ugly
            #         if _val_info['M_R'] > self.val_info['M_R']:
            #             self.val_info = _val_info
            #             self.is_best = True
            #     else:
            #         raise
            # get_tensorboard_image(self, img_list)
            # images = vutils.make_grid(images[0], normalize=True, scale_each=True)
            self.writer.add_image('Validation/', images, self.iteration)

    def test(self):
        """Run inference over the test set and save outputs to disk.

        For each sample, writes the raw generated image and the
        composited one; in debug mode also the input image and the
        input/output masks.

        NOTE(review): file naming via ``name.split('.')`` assumes
        exactly one dot in the file name -- confirm against the dataset.
        """
        self.maskpreinpaint_model.eval()
        create_dir(self.results_path)

        test_loader = DataLoader(
            dataset=self.test_dataset,
            batch_size=1,
        )
        ### !!! FIX TEST
        index = 0
        for items in test_loader:
            name = self.test_dataset.load_name(index)
            images, images_gt, masks, masks_gt, masks_refine_gt = self.get_inputs(
                items)
            index += 1

            output_images, output_pre_images, output_masks = self.maskpreinpaint_model(
                images, masks)
            output_masks_cmp = output_masks
            output_images_cmp = self.get_complete_preinpaint(
                output_masks_cmp, images, output_images)
            output_pre_images_cmp = self.get_complete_preinpaint(
                output_masks_cmp, images, output_pre_images)

            outputs = self.postprocess(output_images)[0]
            outputs_cmp = self.postprocess(output_images_cmp)[0]
            path = os.path.join(self.results_path, name)
            tsplit = name.split('.')
            path_cmp = os.path.join(self.results_path,
                                    '%s_cmp.%s' % (tsplit[0], tsplit[1]))
            print(index, name)

            imsave(outputs, path)
            imsave(outputs_cmp, path_cmp)

            if self.debug:
                input_mask = self.postprocess(masks)[0]
                output_mask = self.postprocess(output_masks)[0]
                images = self.postprocess(images)[0]
                fname, fext = name.split('.')

                imsave(
                    images,
                    os.path.join(self.results_path, fname + '_input.' + fext))
                imsave(
                    input_mask,
                    os.path.join(self.results_path,
                                 fname + '_input_mask.' + fext))
                imsave(
                    output_mask,
                    os.path.join(self.results_path,
                                 fname + '_output_mask.' + fext))

        print('\nEnd test....')

    def sample(self, it=None):
        """Save a stitched sample grid from one validation batch.

        Args:
            it: optional iteration number used for the file name;
                defaults to the model's current iteration.
        """
        # do not sample when validation set is empty
        if len(self.val_dataset) == 0:
            return

        self.maskpreinpaint_model.eval()

        model = self.config.MODEL
        items = next(self.sample_iterator)

        images, images_gt, masks, masks_gt, masks_refine_gt = self.get_inputs(
            items)

        image_per_row = 1
        if self.config.SAMPLE_SIZE <= 6:
            image_per_row = 1

        # edge model
        iteration = self.maskpreinpaint_model.iteration
        output_images, output_pre_images, output_masks = self.maskpreinpaint_model(
            images, masks)
        output_masks_cmp = output_masks * masks
        output_images_cmp = self.get_complete_preinpaint(
            output_masks_cmp, images, output_images)
        output_pre_images_cmp = self.get_complete_preinpaint(
            output_masks_cmp, images, output_pre_images)

        images = stitch_images(self.postprocess(images),
                               self.postprocess(masks),
                               self.postprocess(masks_refine_gt),
                               self.postprocess(output_masks),
                               self.postprocess(output_masks_cmp),
                               self.postprocess(output_pre_images),
                               self.postprocess(output_pre_images_cmp),
                               self.postprocess(output_images),
                               self.postprocess(output_images_cmp),
                               img_per_row=image_per_row)

        if it is not None:
            iteration = it

        path = os.path.join(self.samples_path)
        name = os.path.join(path, str(iteration).zfill(5) + ".png")
        create_dir(path)
        print('\nsaving sample ' + name)
        images.save(name)

    def log(self, logs):
        """Append one space-separated row of log values to log.dat."""
        with open(self.log_file, 'a') as f:
            f.write('%s\n' % ' '.join([str(item[1]) for item in logs]))

    def cuda(self, *args):
        """Move each given tensor to the configured device (generator)."""
        return (item.to(self.config.DEVICE) for item in args)

    def postprocess(self, img):
        # [0, 1] => [0, 255]
        # Also converts NCHW to NHWC and truncates to int.
        img = img * 255.0
        img = img.permute(0, 2, 3, 1)
        return img.int()

    def gray2rgb(self, img):
        """Replicate a 1-channel image to 3 channels along dim 1."""
        return torch.cat([img] * 3, dim=1)

    def _run_steps_after_train(self, logs):
        """Periodic hooks after each training step.

        Depending on the configured intervals: appends *logs* to the
        log file, writes a sample grid, evaluates, and saves the model
        (evaluating first when no eval ran at this iteration).

        Args:
            logs: list of (name, value) pairs for the current step.

        Returns:
            None.
        """
        # log model at checkpoints
        if self.config.LOG_INTERVAL and self.iteration % self.config.LOG_INTERVAL == 0:
            self.log(logs)

        # sample model at checkpoints
        if self.config.SAMPLE_INTERVAL and self.iteration % self.config.SAMPLE_INTERVAL == 0:
            self.sample()

        is_finish_eval = False
        # evaluate model at checkpoints
        if self.config.EVAL_INTERVAL and self.iteration % self.config.EVAL_INTERVAL == 0:
            print('...Eval....\n')
            self.eval()
            print('...\n')
            is_finish_eval = True

        # # save model at checkpoints
        # if self.config.SAVE_INTERVAL and self.iteration % self.config.SAVE_INTERVAL == 0:
        #     if is_finish_eval:
        #         if self.is_best:
        #             print('...Saving model....')
        #             self.save()
        #     else:
        #         print('...Eval....\n')
        #         self.eval()
        #         print('...\n')
        #         if self.is_best:
        #             print('...Saving model....')
        #             self.save()

        # save model at checkpoints
        if self.config.SAVE_INTERVAL and self.iteration % self.config.SAVE_INTERVAL == 0:
            if is_finish_eval:
                print('...Saving model....')
                self.save()
            else:
                print('...Eval....\n')
                self.eval()
                print('...\n')
                print('...Saving model....')
                self.save()

    def get_inputs(self, items):
        """Move a dataloader batch to the configured device.

        NOTE(review): the unpack target ``masks_refine_gt`` appears
        twice, so the 4th tensor from the loader is discarded and both
        of the last two returned values are the 5th tensor.  Looks like
        ``masks_gt`` was intended for slot 4 -- confirm before changing.
        """
        # if self.config.WITH_EDGE:
        images, images_gt, masks, masks_refine_gt, masks_refine_gt = self.cuda(
            *items)
        return images, images_gt, masks, masks_refine_gt, masks_refine_gt

    def get_tensorboard_image(self, img_list):
        """Build a grid image from the first batch element of each tensor.

        Assumes each tensor in *img_list* holds 256x256, 3-channel
        images -- TODO confirm against the dataset config.
        """
        col = 5
        images = torch.cat(img_list, dim=1)
        images = images[0]
        images = images.view((len(img_list), -1, 256, 256))
        image = vutils.make_grid(images,
                                 nrow=col,
                                 normalize=False,
                                 scale_each=True)

        # import matplotlib.pyplot as plt
        # npgrid = image.cpu().detach().numpy()
        # plt.imshow(np.transpose(npgrid, (1, 2, 0)), interpolation='nearest')
        # plt.savefig('out.png')
        return image

    def _write_logs(self, logs, iteration):
        """Write (name, value) pairs to TensorBoard, routed by prefix:
        'l_' -> Train/loss/, 'd_' -> Train/diff/, otherwise Train/."""
        for item, value in logs:
            if item.startswith("l_"):
                self.writer.add_scalar('Train/loss/' + item, value, iteration)
            elif item.startswith("d_"):
                self.writer.add_scalar('Train/diff/' + item, value, iteration)
            else:
                self.writer.add_scalar('Train/' + item, value, iteration)

    def get_auxiliary_with_groundtruth(self, masks, masks_refine_gt, images,
                                       images_gt):
        """Concatenate images with masks (and GT with refined GT masks)
        along the channel dim."""
        # !!! edge, mask order should be edge, mask. Cant be switched
        auxiliary = torch.cat([images, masks], dim=1)
        auxiliary_gt = torch.cat([images_gt, masks_refine_gt], dim=1)
        return auxiliary, auxiliary_gt

    def get_auxiliary(self, masks, images):
        """Concatenate images with masks along the channel dim."""
        auxiliary = torch.cat([images, masks], dim=1)
        return auxiliary

    def get_complete_preinpaint(self, mask, input, input_gen):
        """Composite: generated pixels where mask==1, input elsewhere."""
        output_cmp = (input_gen * mask) + (input * (1 - mask))
        return output_cmp
Example #17
0
    writer.add_scalar("Scores/peak_demand", env.cost()['peak_demand'], total_numsteps)
    writer.add_scalar("Scores/net_electricity_consumption", env.cost()['net_electricity_consumption'], total_numsteps)
    writer.add_scalar("Scores/total", env.cost()['total'], total_numsteps)

    # Append the total score/reward to the list
    score_list.append(env.cost()['total'])
    reward_list.append(episode_reward)

    # Log how much storage is utilised by calculating abs sum of actions (CHECK IF WORKS WITH MULTIPLE BUILDINGS!!!)
    episode_actions = np.array(agent.action_tracker[-8759:])
    cooling = sum(abs(episode_actions[:,0]))
    writer.add_scalar("Action/Cooling", cooling, total_numsteps)
    if agent.act_size[0] == 2:
        dhw = sum(abs(episode_actions[:,1]))
        writer.add_scalar("Action/DHW", dhw, total_numsteps)
    writer.add_histogram("Action/Tracker", np.array(agent.action_tracker), total_numsteps)
            
    print("Episode: {}, total numsteps: {}, total cost: {}, reward: {}".format(i_episode, total_numsteps, round(env.cost()['total'],5), round(episode_reward, 2)))

    # Save trained Actor and Critic network periodically as a checkpoint if it's the best model achieved
    if i_episode % args.checkpoint_interval == 0:
        if env.cost()['total'] < best_reward:
            best_reward = env.cost()['total']
            print("Saving new best model to {}".format(parent_dir))
            agent.save_model(parent_dir)

    # If training episodes completed
    if i_episode > args.num_episodes - 1:
        break

env.close()
Example #18
0
from day02.net import *
from torch.utils.tensorboard import SummaryWriter
import cv2

# Load a trained NetV2 checkpoint and export its weight distributions
# to TensorBoard for inspection.
net = NetV2()
net.load_state_dict(torch.load("./checkpoint/2.t"))
summaryWriter = SummaryWriter("./logs")

# Indices 0/4/8 pick the weight-bearing layers of net.sequential;
# presumably separated by activation/pooling blocks -- confirm
# against day02.net.
layer1_weight = net.sequential[0].weight
layer2_weight = net.sequential[4].weight
layer3_weight = net.sequential[8].weight

summaryWriter.add_histogram("layer1_weight", layer1_weight)
summaryWriter.add_histogram("layer2_weight", layer2_weight)
summaryWriter.add_histogram("layer3_weight", layer3_weight)
# BUGFIX: flush and close the event file; the original never closed the
# writer, so the histograms could be lost when the process exits.
summaryWriter.close()
cv2.waitKey(0)
                loss.backward()
                optimizer.step()

                # calculate 'running' training accuracy
                _, predictions = scores.max(1)
                num_correct = (predictions == targets).sum()
                running_train_acc = float(num_correct) / float(data.shape[0])
                accuracies.append(running_train_acc)

                features = data.reshape(data.shape[0], -1)
                class_labels = [classes[label] for label in predictions]

                # visualizing data and weights of fc1 for each batch
                img_grid = torchvision.utils.make_grid(data)
                writer.add_image('mnist_images', img_grid)
                writer.add_histogram('fc1', model.fc1.weight)

                # data shape is [batch_size, 1, 28, 28]
                # plot things to tensorboard
                writer.add_scalar('Training Loss', loss, global_step=step)
                writer.add_scalar('Training Accuracy',
                                  running_train_acc,
                                  global_step=step)

                # PCA for images
                if batch_idx == 100:
                    writer.add_embedding(features,
                                         metadata=class_labels,
                                         label_img=data,
                                         global_step=batch_idx)
Example #20
0
def _grad_mean(network):
    """Return the mean over all gradient entries of *network* (0-dim tensor).

    Falls back to 0.0 when no parameter has received a gradient yet.
    """
    grads = []
    for module in network._modules.values():
        for params in module._parameters.values():
            if params.grad is not None:
                # flatten so differently-shaped gradients can be concatenated
                grads.append(torch.flatten(params.grad))
    if not grads:
        return torch.tensor(0.0)
    return torch.cat(grads).mean()


def train(train_x,
          train_y,
          validate_x,
          validate_y,
          num_epoch,
          encoder_optimizer,
          decoder_optimizer,
          learning_rate,
          num_runs,
          save_to_file=False,
          write_summary=False):
    """Train the module-global encoder/decoder seq2seq pair.

    Args:
        train_x / train_y: parallel lists of input/target sentences.
        validate_x / validate_y: held-out pairs (currently unused; kept for
            interface compatibility).
        num_epoch: number of passes over the training set.
        encoder_optimizer / decoder_optimizer: optimizers of the two nets.
        learning_rate: used only for logging and checkpoint file names.
        num_runs: logging interval, in training iterations.
        save_to_file: save encoder/decoder state dicts when training ends.
        write_summary: log scalars/histograms to TensorBoard.

    Relies on module-level globals: encoder, decoder, utils,
    x_word2idx_dict, y_word2idx_dict, y_idx2word_list.
    """
    if write_summary:
        writer = SummaryWriter()
        writer.add_scalar('train/learning_rate', learning_rate)

    running_loss = 0

    for epoch in range(num_epoch):
        # shuffle our training set
        num_train_sentences = len(train_x)
        shuffled_train_indexes = random.sample(range(num_train_sentences),
                                               num_train_sentences)

        for train_idx in range(num_train_sentences):
            pair_idx = shuffled_train_indexes[train_idx]
            # convert both input and output into vocabulary indexes
            input_vector_x = utils.convert_vector_word2idx(
                train_x[pair_idx], x_word2idx_dict)
            target_vector_y = utils.convert_vector_word2idx(
                train_y[pair_idx], y_word2idx_dict)

            input_tensor_x = torch.tensor(input_vector_x).view(
                -1,
                1)  # -> [ seq_len, input_size = 1 since it's just a number ]
            input_tensor_x = input_tensor_x.unsqueeze(0).cuda().type(
                torch.cuda.LongTensor)  # to create the batch size and length
            target_tensor_y = torch.tensor(target_vector_y).view(-1, 1)
            target_tensor_y = target_tensor_y.unsqueeze(0).cuda().type(
                torch.cuda.LongTensor)

            # BUGFIX: gradients were never reset, so they accumulated across
            # iterations; clear both optimizers before each backward pass.
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            _, (last_hidden, _) = encoder(input_tensor_x)

            total_loss, output_words = decoder(last_hidden, y=target_tensor_y)
            # Backpropagation runs on the *total* (un-normalised) loss; the
            # per-token normalisation below is for reporting only.
            total_loss.backward()

            running_loss += total_loss.item() / len(target_vector_y)
            # renamed from `iter`, which shadowed the builtin
            step = epoch * num_train_sentences + train_idx + 1
            if step % num_runs == 0:
                avg_loss = running_loss / num_runs
                print(
                    str(step / (num_epoch * num_train_sentences) * 100) +
                    " % it: " + str(step) + " avg loss: " + str(avg_loss))
                running_loss = 0

                # BUGFIX: the decoder mean used to be computed from the
                # *encoder*'s gradients (copy/paste error); both nets now go
                # through the shared _grad_mean helper.
                encoder_grad_mean = _grad_mean(encoder)
                decoder_grad_mean = _grad_mean(decoder)

                print("Input: " + str(train_x[pair_idx]) + " Gt: " + str(train_y[pair_idx]) + \
                      " Output word: " + str(utils.convert_vector_idx2word(output_words, y_idx2word_list)))

                if write_summary:
                    writer.add_scalar('train/encoder_gradient',
                                      encoder_grad_mean, step)
                    writer.add_scalar('train/decoder_gradient',
                                      decoder_grad_mean, step)
                    writer.add_scalar('train/loss', avg_loss, step)

                    writer.add_histogram(
                        'train_hist_encoder/encoder_embedding_weights',
                        encoder._modules['embedding'].weight, step)
                    writer.add_histogram(
                        'train_hist_encoder/encoder_lstm_weights_hh_l0',
                        encoder._modules['lstm'].weight_hh_l0, step)
                    writer.add_histogram(
                        'train_hist_encoder/encoder_lstm_bias_hh_l0',
                        encoder._modules['lstm'].bias_hh_l0, step)
                    writer.add_histogram(
                        'train_hist_encoder/encoder_lstm_weights_ih_l0',
                        encoder._modules['lstm'].weight_ih_l0, step)
                    writer.add_histogram(
                        'train_hist_encoder/encoder_lstm_bias_ih_l0',
                        encoder._modules['lstm'].bias_ih_l0, step)

                    writer.add_histogram(
                        'train_hist_decoder/decoder_embedding_weights',
                        decoder._modules['embedding'].weight, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_out_fc_weights',
                        decoder._modules['out_fc'].weight, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_out_fc_bias',
                        decoder._modules['out_fc'].bias, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_lstm_weights_hh_l0',
                        decoder._modules['lstm'].weight_hh_l0, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_lstm_bias_hh_l0',
                        decoder._modules['lstm'].bias_hh_l0, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_lstm_weights_ih_l0',
                        decoder._modules['lstm'].weight_ih_l0, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_lstm_bias_ih_l0',
                        decoder._modules['lstm'].bias_ih_l0, step)

            encoder_optimizer.step()
            decoder_optimizer.step()

    if save_to_file:
        torch.save(encoder.state_dict(),
                   './encoder_100k_iter_lr' + str(learning_rate) + '.pth')
        torch.save(decoder.state_dict(),
                   './decoder_100k_iter_lr' + str(learning_rate) + '.pth')
    if write_summary:
        writer.close()
Example #21
0
            optim.step()

            if i % 10 == 0:
                # keep some parameters for debugging
                phi_gmm_unpacked = model.unpack_recognition_gmm(phi_gmm)
                u_mu, u_cov = gaussian.natural_to_standard(
                    phi_gmm_unpacked[0], phi_gmm_unpacked[1])
                #wmse = weighted_mse(data, y_reconstruction[0].detach().cpu(), torch.exp(log_z_given_y_phi).detach().cpu())
                #print ("Training wmse {}".format(wmse))
                #glogliki = diagonal_gaussian_logprob(data.view(-1, 784).to(device), y_reconstruction[0].detach(), y_reconstruction[1].detach(), log_z_given_y_phi.detach())
                #print ("Training diagonal gaussian logprob {}".format(glogliki))
                #if i % 100 == 0:
                #plot_grad_flow(model.named_parameters(), global_step)
                if i % 100 == 0:
                    writer.add_histogram('logz',
                                         torch.exp(log_z_given_y_phi),
                                         global_step=global_step)
                    for index, (name,
                                kernel) in enumerate(model.named_parameters()):
                        writer.add_histogram('{}_grad'.format(name),
                                             kernel.grad,
                                             global_step=global_step)
                    writer.add_embedding(u_mu,
                                         tag='mu_phi_gmm',
                                         global_step=global_step)
                    #writer.add_embedding(u_cov,tag='cov_phi_gmm')
                    writer.add_histogram('pi_phi_gmm',
                                         torch.exp(phi_gmm_unpacked[-1]),
                                         global_step=global_step)

                    beta_k, m_k, C_k, v_k = niw.natural_to_standard(
Example #22
0
class Logger(object):
    """Dispatches train/eval metrics to console meters and, optionally, TensorBoard.

    Metric keys must be namespaced as 'train...' or 'eval...'; the prefix
    selects which MetersGroup receives the value.
    """

    def __init__(self,
                 log_dir,
                 save_tb=False,
                 log_frequency=10000,
                 agent='sac'):
        """
        Args:
            log_dir: root directory for meter files (and 'tb/' when save_tb).
            save_tb: create a SummaryWriter under log_dir/tb.
            log_frequency: default step modulus used by _should_log.
            agent: key into AGENT_TRAIN_FORMAT selecting the train columns.
        """
        self._log_dir = log_dir
        self._log_frequency = log_frequency
        if save_tb:
            tb_dir = os.path.join(log_dir, 'tb')
            if os.path.exists(tb_dir):
                try:
                    # start each run from a clean event directory
                    shutil.rmtree(tb_dir)
                except OSError:  # was a bare except; rmtree raises OSError
                    print("logger.py warning: Unable to remove tb directory")
            self._sw = SummaryWriter(tb_dir)
        else:
            self._sw = None
        # each agent has specific output format for training
        assert agent in AGENT_TRAIN_FORMAT
        train_format = COMMON_TRAIN_FORMAT + AGENT_TRAIN_FORMAT[agent]
        self._train_mg = MetersGroup(os.path.join(log_dir, 'train'),
                                     formating=train_format)
        self._eval_mg = MetersGroup(os.path.join(log_dir, 'eval'),
                                    formating=COMMON_EVAL_FORMAT)

    def _should_log(self, step, log_frequency):
        """True when `step` falls on the logging interval (default interval if None/0)."""
        log_frequency = log_frequency or self._log_frequency
        return step % log_frequency == 0

    def _try_sw_log(self, key, value, step):
        # no-op when TensorBoard logging is disabled
        if self._sw is not None:
            self._sw.add_scalar(key, value, step)

    def _try_sw_log_video(self, key, frames, step):
        if self._sw is not None:
            frames = torch.from_numpy(np.array(frames))
            frames = frames.unsqueeze(0)  # add the batch dimension add_video expects
            self._sw.add_video(key, frames, step, fps=30)

    def _try_sw_log_histogram(self, key, histogram, step):
        if self._sw is not None:
            self._sw.add_histogram(key, histogram, step)

    def log(self, key, value, step, n=1, log_frequency=1):
        """Log scalar `value` (tensor or number) under `key`; `n` is the sample count."""
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        # isinstance (not type ==) so tensor subclasses are unwrapped too
        if isinstance(value, torch.Tensor):
            value = value.item()
        self._try_sw_log(key, value / n, step)
        mg = self._train_mg if key.startswith('train') else self._eval_mg
        mg.log(key, value, n)

    def log_param(self, key, param, step, log_frequency=None):
        """Histogram a module's weight/bias values and their gradients, if present."""
        if not self._should_log(step, log_frequency):
            return
        self.log_histogram(key + '_w', param.weight.data, step)
        if hasattr(param.weight, 'grad') and param.weight.grad is not None:
            self.log_histogram(key + '_w_g', param.weight.grad.data, step)
        if hasattr(param, 'bias') and hasattr(param.bias, 'data'):
            self.log_histogram(key + '_b', param.bias.data, step)
            if hasattr(param.bias, 'grad') and param.bias.grad is not None:
                self.log_histogram(key + '_b_g', param.bias.grad.data, step)

    def log_video(self, key, frames, step, log_frequency=None):
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_video(key, frames, step)

    def log_histogram(self, key, histogram, step, log_frequency=None):
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_histogram(key, histogram, step)

    def dump(self, step, save=True, ty=None):
        """Flush accumulated meters; `ty` selects 'train', 'eval', or both (None)."""
        if ty is None:
            self._train_mg.dump(step, 'train', save)
            self._eval_mg.dump(step, 'eval', save)
        elif ty == 'eval':
            self._eval_mg.dump(step, 'eval', save)
        elif ty == 'train':
            self._train_mg.dump(step, 'train', save)
        else:
            # BUGFIX: `raise <str>` is a TypeError in Python 3 -- raise a real exception
            raise ValueError(f'invalid log type: {ty}')
Example #23
0
        for xs, ys, xn, yn in dev:

            xs, xn = model(xs, xn)

            xs = xs.exp().view(-1, len(labels))

            prediction.append(xs.argmax(1).cpu())
            prior += xs.sum(dim=0)

            dev.set_description('Epoch %d Prior %.5f' %
                                (epoch, prior.std().item()))

        prediction = torch.cat(prediction)
        prior = (prior / prediction.size(0)).log() / temperature

        writer.add_histogram('Prediction',
                             prediction[prediction != labels.blank()], epoch)
        writer.add_histogram('Prior', prior, epoch)

        for xs, ys, xn, yn in test:

            xs, xn = model(xs, xn)

            loss1 = ctc_loss(xs, ys, xn, yn).mean()

            loss2 = -(xs.exp() * xs).sum(dim=-1).mean()

            err.update(loss1.item())
            ent.update(loss2.item())

            xs = xs - prior
            xs = xs.argmax(2).t().type(torch.int)
Example #24
0
def train(model_type=ModelType.LINEAR, batch_size=128, num_epochs=2, learning_rate=0.1):
    """Train an MNIST classifier built by the `bn` module and log to TensorBoard.

    Args:
        model_type: ModelType enum member; its .value names the class in `bn`.
        batch_size: mini-batch size for both train and test loaders.
        num_epochs: number of passes over the training set.
        learning_rate: SGD learning rate.

    Returns:
        (l1_mean, l1_std, l2_mean, l2_std, loss_arr, model_name) where the
        first four are per-batch activation statistics recorded by the model.
    """
    trainset = torchvision.datasets.MNIST(
        root='./data',
        train=True,
        download=True,
        transform=transforms.ToTensor()
    )
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)

    testset = torchvision.datasets.MNIST(
        root='./data',
        train=False,
        download=True,
        transform=transforms.ToTensor()
    )
    testLoader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=True)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print('training on %s' % device)

    # resolve the model class by name from the project's `bn` module
    module = importlib.import_module("bn")
    class_ = getattr(module, model_type.value)
    model = class_(device)

    model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.SGD(model.parameters(), lr=learning_rate)

    loss_arr = []

    writer = SummaryWriter(log_dir='runs/%s_%s' % (model_type.value, datetime.now().strftime("%H:%M:%S")))

    for epoch in range(num_epochs):

        for i, data in enumerate(trainloader, 0):
            model.train()
            n_iter = (epoch * len(trainloader)) + i

            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            opt.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            opt.step()

            loss_arr.append(loss.item())

            writer.add_scalar('training/loss', loss.item(), n_iter)
            # l1_inp / l2_inp: per-layer input activations cached by the model
            # -- assumed to be CPU-convertible tensors; TODO confirm in bn.py
            writer.add_scalar('inputs/layer1/mean', model.l1_inp.cpu().numpy().mean(), n_iter)
            writer.add_scalar('inputs/layer2/mean', model.l2_inp.cpu().numpy().mean(), n_iter)
            writer.add_histogram('inputs/layer1/dist', model.l1_inp.cpu().numpy(), n_iter)
            writer.add_histogram('inputs/layer2/dist', model.l2_inp.cpu().numpy(), n_iter)

            # every 10 batches, evaluate on the full test set
            if i % 10 == 0:
                print('training loss: %0.2f' % loss.item())

                model.eval()
                test_loss = 0
                correct = 0
                with torch.no_grad():
                    for test_data, test_target in testLoader:
                        test_data = test_data.to(device)
                        test_target = test_target.to(device)
                        output = model(test_data)
                        # BUGFIX: weight each batch-mean loss by its batch size
                        # (and take .item()) so dividing by the dataset size
                        # below yields a true per-sample mean
                        test_loss += loss_fn(output, test_target).item() * test_data.size(0)
                        pred = output.argmax(dim=1, keepdim=True)
                        correct += pred.eq(test_target.view_as(pred)).sum().item()

                test_loss /= len(testLoader.dataset)

                writer.add_scalar('testing/loss', test_loss, n_iter)
                writer.add_scalar('testing/accuracy', correct/len(testLoader.dataset) * 100., n_iter)

    # BUGFIX: the writer was never closed, so trailing events could be lost
    writer.close()

    # compute summary
    l1_mean = [x[0].cpu() for x in model.l1_dist]
    l1_std = [x[1].cpu() for x in model.l1_dist]
    l2_mean = [x[0].cpu() for x in model.l2_dist]
    l2_std = [x[1].cpu() for x in model.l2_dist]

    return l1_mean, l1_std, l2_mean, l2_std, loss_arr, model_type.value
Example #25
0
class runManager():
    """Tracks per-run and per-epoch training statistics and writes them to
    TensorBoard, stdout, CSV and JSON.
    """

    def __init__(self):
        # per-epoch bookkeeping
        self.epoch_count = 0  # number of epochs completed in the current run
        self.epoch_loss = 0  # loss accumulated during the current epoch
        self.epoch_num_correct = 0  # correct predictions in the current epoch
        self.epoch_start_time = None  # wall-clock start of the current epoch

        # per-run bookkeeping (one run per hyper-parameter combination)
        self.run_params = None  # hyper-parameter values for this run
        self.run_count = 0  # index of the current run
        self.run_data = []  # one results dict per epoch, across all runs
        self.run_start_time = None  # wall-clock start of the current run

        self.network = None  # network under training
        self.loader = None  # data loader
        self.tb = None  # TensorBoard writer

    def begin_run(self, run, network, loader):
        """Start a new run; `run` is a RunBuilder namedtuple of hyper-parameters."""
        self.run_start_time = time.time()
        self.run_params = run
        self.run_count += 1
        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f'-{run}')
        # log a sample image grid and the network graph
        images, labels = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)
        self.tb.add_image('images', grid)
        self.tb.add_graph(self.network,
                          images.to(getattr(run, 'device', 'cpu')))

    def end_run(self):
        """Close the TensorBoard writer and reset the epoch counter."""
        self.tb.close()
        self.epoch_count = 0

    def begin_epoch(self):
        """Reset the per-epoch accumulators and start the epoch timer."""
        self.epoch_start_time = time.time()
        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        """Log this epoch's metrics to TensorBoard and append them to run_data."""
        epoch_duration = time.time() - self.epoch_start_time
        # note: this is cumulative across all epochs of the run
        run_duration = time.time() - self.run_start_time
        loss = self.epoch_loss
        accuracy = self.epoch_num_correct / len(self.loader.dataset)
        self.tb.add_scalar('Loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)
        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
        # collect one results row per epoch (the smallest reporting unit)
        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results['loss'] = loss
        results["accuracy"] = accuracy
        results['epoch duration'] = epoch_duration
        results['run duration'] = run_duration
        for k, v in self.run_params._asdict().items():
            results[k] = v
        self.run_data.append(results)
        # BUGFIX: loss is a float; "%d" silently truncated it to an integer
        print('runs: ' + "%d" % results["run"] + ', ' + 'epoch: ' +
              "%d" % results["epoch"] + ', ' + 'loss: ' +
              "%f" % results["loss"] + ', ' + 'accuracy: ' +
              "%f" % results["accuracy"])

    def track_loss(self, loss, batch):
        """Accumulate batch loss weighted by the batch size (batch[0].shape[0])."""
        self.epoch_loss += loss.item() * batch[0].shape[0]

    def track_num_correct(self, preds, labels):
        """Accumulate the number of correct predictions for this epoch."""
        self.epoch_num_correct += self._get_num_correct(preds, labels)

    def _get_num_correct(self, preds, labels):
        # argmax over the class dimension, compared element-wise to the labels
        return preds.argmax(dim=1).eq(labels).sum().item()

    def save(self, fileName):
        """Persist run_data as '<fileName>.csv' and '<fileName>.json'."""
        pd.DataFrame.from_dict(self.run_data,
                               orient='columns').to_csv(f'{fileName}.csv')

        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)
Example #26
0
# Fit f_net on train_loader, logging the per-epoch loss and weight
# histograms to TensorBoard, then save the final weights.
f_net.train()
for epoch in range(epochs):

    epoch_loss = 0.0

    for inputs, labels in train_loader:

        optimizer.zero_grad()
        if not is_stochastic:
            # deterministic model: explicit forward pass + external criterion
            outputs = f_net(inputs)
            loss = criterion(outputs, labels)
        else:
            # stochastic model computes its own loss internally
            loss = f_net(inputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss = epoch_loss + loss.item()

    # lr_scheduler.step()
    writer.add_scalar('Loss', epoch_loss, epoch)
    for name, weight in f_net.named_parameters():
        writer.add_histogram(name, weight, epoch)

    if epoch % 10 == 0:
        print("Epoch: ", epoch, "Running loss: ", epoch_loss)

print('Finished Training')

writer.close()
torch.save(f_net.state_dict(), PATH_f_net)
Example #27
0
def train_network():
    """Train an RNN (created via create_rnn_instance) on the data named in args.

    Loads and normalises the train/validation sets, runs the training loop with
    a warm-up (open-loop) phase, logs losses and parameter histograms to
    TensorBoard, and checkpoints the network whenever the validation loss
    reaches a new minimum.

    Returns:
        Total wall-clock time (seconds) the function took.
    """
    print('')
    print('')
    # Start measuring time - to evaluate performance of the training function
    start = timeit.default_timer()

    # Set seeds
    set_seed(args)

    # Make folders if not yet exist
    try:
        os.makedirs('save')
    except FileExistsError:
        pass

    # Save relevant arguments from a and set hardcoded arguments
    lr = args.lr  # learning rate
    batch_size = args.batch_size  # Mini-batch size
    num_epochs = args.num_epochs  # Number of epochs to train the network
    seq_len = args.seq_len

    # Network architecture:
    rnn_name = args.rnn_name
    inputs_list = args.inputs_list
    outputs_list = args.outputs_list

    load_rnn = args.load_rnn  # If specified this is the name of pretrained RNN which should be loaded
    path_save = args.path_save

    # Create rnn instance and update lists of input, outputs and its name (if pretraind net loaded)
    net, rnn_name, inputs_list, outputs_list \
        = create_rnn_instance(rnn_name, inputs_list, outputs_list, load_rnn, path_save, device)

    # Create log for this RNN and determine its full name
    rnn_full_name = create_log_file(rnn_name, inputs_list, outputs_list, path_save)
    net.rnn_full_name = rnn_full_name

    ########################################################
    # Create Dataset
    ########################################################

    train_dfs, _ = load_data(args, args.train_file_name)

    normalization_info =  calculate_normalization_info(train_dfs, args.path_save, rnn_full_name)

    test_dfs, time_axes_dev = load_data(args, args.val_file_name)

    train_dfs_norm = normalize_df(train_dfs, normalization_info)
    test_dfs_norm = normalize_df(test_dfs, normalization_info)

    del train_dfs, test_dfs

    train_set = Dataset(train_dfs_norm, args)
    dev_set = Dataset(test_dfs_norm, args, time_axes=time_axes_dev)
    print('Number of samples in training set: {}'.format(train_set.number_of_samples))
    print('The training sets sizes are: {}'.format(train_set.df_lengths))
    print('Number of samples in validation set: {}'.format(dev_set.number_of_samples))
    print('')


    plot_results(net=net, args=args, dataset=dev_set, seq_len=1024,
                 comment='This is the network at the beginning of the training',
                 inputs_list=inputs_list, outputs_list=outputs_list,
                 save=True,
                 closed_loop_enabled=True)

    # Create PyTorch dataloaders for train and dev set
    train_generator = data.DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True,
                                      num_workers=args.num_workers)
    dev_generator = data.DataLoader(dataset=dev_set, batch_size=512, shuffle=False, num_workers=args.num_workers)

    # Print parameter count
    print_parameter_count(net)  # Seems not to function well

    # Select Optimizer
    optimizer = optim.Adam(net.parameters(), amsgrad=True, lr=lr)

    # TODO: Verify if scheduler is working. Try tweaking parameters of below scheduler and try cyclic lr scheduler

    # scheduler = lr_scheduler.CyclicLR(optimizer, base_lr=lr, max_lr=0.1)
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.5)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',patience=1, verbose=True)

    # Select Loss Function
    criterion = nn.MSELoss()  # Mean square error loss function
    '''
    Init Tensorboard
    '''
    comment = f' batch_size={batch_size} lr={lr} seq_len={seq_len}'
    tb = SummaryWriter(comment=comment)
    ########################################################
    # Training
    ########################################################
    print("Starting training...")
    print('')
    time.sleep(0.001)

    # Create dictionary to store training history
    dict_history = {}
    dict_history['epoch'] = []
    dict_history['time'] = []
    dict_history['lr'] = []
    dict_history['train_loss'] = []
    dict_history['dev_loss'] = []
    dict_history['dev_gain'] = []
    dict_history['test_loss'] = []
    dev_gain = 1

    # The epoch_saved variable will indicate from which epoch is the last RNN model,
    # which was good enough to be saved
    epoch_saved = -1
    for epoch in range(num_epochs):

        ###########################################################################################################
        # Training - Iterate batches
        ###########################################################################################################
        # Set RNN in training mode
        net = net.train()
        # Define variables accumulating training loss and counting training batchs
        train_loss = 0
        train_batches = 0

        # Iterate training over available batches
        # tqdm() is just a function which displays the progress bar
        # Otherwise the line below is the same as "for batch, labels in train_generator:"
        for batch, labels in tqdm(train_generator):  # Iterate through batches

            # Reset the network (internal states of hidden layers and output history not the weights!)
            net.reset()

            # Further modifying the input and output form to fit RNN requirements
            # If GPU available we send tensors to GPU (cuda)
            if torch.cuda.is_available():
                batch = batch.float().cuda().transpose(0, 1)
                labels = labels.float().cuda()
            else:
                batch = batch.float().transpose(0, 1)
                labels = labels.float()

            # Warm-up (open loop prediction) to settle the internal state of RNN hidden layers
            net(rnn_input=batch[:args.warm_up_len, :, :])

            # Reset memory of gradients
            optimizer.zero_grad()

            # Forward propagation - These are the results from which we calculate the update to RNN weights
            # GRU Input size must be (seq_len, batch, input_size)
            net(rnn_input=batch[args.warm_up_len:, :, :])
            out = net.return_outputs_history()

            # Get loss
            loss = criterion(out[:, args.warm_up_len:, :],
                             labels[:, args.warm_up_len:, :])

            # Backward propagation
            loss.backward()

            # Gradient clipping - prevent gradient from exploding
            torch.nn.utils.clip_grad_norm_(net.parameters(), 100)

            # Update parameters
            optimizer.step()
            # scheduler.step()
            # Update variables for loss calculation
            batch_loss = loss.detach()
            train_loss += batch_loss  # Accumulate loss
            train_batches += 1  # Accumulate count so we can calculate mean later

        ###########################################################################################################
        # Validation - Iterate batches
        ###########################################################################################################

        # Set the network in evaluation mode
        net = net.eval()

        # Define variables accumulating evaluation loss and counting evaluation batches
        dev_loss = 0
        dev_batches = 0

        for (batch, labels) in tqdm(dev_generator):

            # Reset the network (internal states of hidden layers and output history not the weights!)
            net.reset()

            # Further modifying the input and output form to fit RNN requirements
            # If GPU available we send tensors to GPU (cuda)
            if torch.cuda.is_available():
                batch = batch.float().cuda().transpose(0, 1)
                labels = labels.float().cuda()
            else:
                batch = batch.float().transpose(0, 1)
                labels = labels.float()

            # Warm-up (open loop prediction) to settle the internal state of RNN hidden layers
            net(rnn_input=batch)
            out = net.return_outputs_history()


            # Get loss
            # For evaluation we always calculate loss over the whole maximal prediction period
            # This allow us to compare RNN models from different epochs
            loss = criterion(out[:, args.warm_up_len: args.seq_len],
                             labels[:, args.warm_up_len: args.seq_len])

            # Update variables for loss calculation
            batch_loss = loss.detach()
            dev_loss += batch_loss  # Accumulate loss
            dev_batches += 1  # Accumulate count so we can calculate mean later

        # Reset the network (internal states of hidden layers and output history not the weights!)
        net.reset()
        # Get current learning rate

        for param_group in optimizer.param_groups:
            lr_curr = param_group['lr']

        scheduler.step(dev_loss)
        '''
        Add data for tensorboard
        TODO : Add network graph and I/O to tensorboard
        '''
        # tb.add_graph(net)
        tb.add_scalar('Train Loss', train_loss / train_batches, epoch)
        tb.add_scalar('Dev Loss', dev_loss / dev_batches, epoch)

        for name, param in net.named_parameters():
            tb.add_histogram(name, param, epoch)
            # BUGFIX: grad can be None (e.g. parameters that never received a
            # gradient); add_histogram would raise on None
            if param.grad is not None:
                tb.add_histogram(f'{name}.grad', param.grad, epoch)

        # Write the summary information about the training for the just completed epoch to a dictionary

        dict_history['epoch'].append(epoch)
        dict_history['lr'].append(lr_curr)
        dict_history['train_loss'].append(
            train_loss.detach().cpu().numpy() / train_batches / (args.seq_len - args.warm_up_len))
        dict_history['dev_loss'].append(
            dev_loss.detach().cpu().numpy() / dev_batches / (args.seq_len - args.warm_up_len))

        # Get relative loss gain for network evaluation
        if epoch >= 1:
            dev_gain = (dict_history['dev_loss'][epoch - 1] - dict_history['dev_loss'][epoch]) / \
                       dict_history['dev_loss'][epoch - 1]
        dict_history['dev_gain'].append(dev_gain)

        # Print the summary information about the training for the just completed epoch
        print('\nEpoch: %3d of %3d | '
              'LR: %1.5f | '
              'Train-L: %6.4f | '
              'Val-L: %6.4f | '
              'Val-Gain: %3.2f |' % (dict_history['epoch'][epoch], num_epochs - 1,
                                     dict_history['lr'][epoch],
                                     dict_history['train_loss'][epoch],
                                     dict_history['dev_loss'][epoch],
                                     dict_history['dev_gain'][epoch] * 100))
        print('')

        # Save the best model with the lowest dev loss
        # Always save the model from epoch 0
        # TODO: this is a bug: you should only save the model from epoch 0 if there is no pretraind network
        if epoch == 0:
            min_dev_loss = dev_loss
        # If current loss smaller equal than minimal till now achieved loss,
        # save the current RNN model and save its loss as minimal ever achieved
        if dev_loss <= min_dev_loss:
            epoch_saved = epoch
            min_dev_loss = dev_loss
            torch.save(net.state_dict(), args.path_save + rnn_full_name + '.pt', _use_new_zipfile_serialization=False)
            print('>>> saving best model from epoch {}'.format(epoch))
            print('')

            plot_string = 'This is the network after {} training epoch'.format(epoch + 1)
            plot_results(net=net, args=args, dataset=dev_set, seq_len=1024,
                         comment=plot_string,
                         inputs_list=inputs_list, outputs_list=outputs_list, save=True,
                         closed_loop_enabled=True)
        else:
            print('>>> We keep model from epoch {}'.format(epoch_saved))
            print('')

        # Evaluate the performance of the current network
        # by checking its predictions on a randomly generated CartPole experiment
        # open_loop_prediction_experiment(net, a, val_file)

    # BUGFIX: the TensorBoard writer used to be closed inside the epoch loop;
    # close it exactly once, after all epochs have finished.
    tb.close()

    # When finished the training print the final message
    print("Training Completed...                                               ")
    print(" ")

    # Calculate the total time it took to run the function
    stop = timeit.default_timer()
    total_time = stop - start

    # Return the total time it took to run the function
    return total_time
class FlowTrainer(object):
    """Self-supervised optical-flow trainer for PyramidUNet on Sintel.

    The model predicts forward/backward flow pairs at four pyramid scales;
    each scale is supervised by warping the adjacent frames with the
    predicted flow and scoring the warps against the ground-truth frame
    with occlusion-masked triplet losses.  Scalars, histograms and
    qualitative grids go to TensorBoard; the best validation checkpoint is
    kept in ``./archive/best.pth`` alongside its metrics.
    """

    def __init__(self):
        super(FlowTrainer, self).__init__()
        # not the best model...
        self.model = PyramidUNet()

        self.epoch = 1000
        self.dataloader = SintelLoader()
        self.gpu_ids = GPUS_LIST

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01)
        # T_max = number of training batches; stepped once per epoch in train().
        self.scheduler = CosineAnnealingLR(self.optimizer, len(self.dataloader.train()))
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.writer = SummaryWriter()
        self.global_step = 0
        self.tripletloss = torch.nn.TripletMarginLoss()
        self.load_model_path = "./archive/best.pth"
        # Best validation metrics seen so far; None until the first validation.
        self.stat_cache = None

    def initialize(self):
        """Move the model to the device, wrap it for multi-GPU, and restore
        the previous best checkpoint if one exists on disk."""
        self.model.to(self.device)
        self.model = torch.nn.DataParallel(self.model, device_ids=self.gpu_ids)
        if self.load_model_path:
            # LOAD MODEL WEIGHTS HERE
            if os.path.exists(self.load_model_path):
                self.load_old_best()
        self.initialized = True

    def savemodel(self, metrics):
        """Persist the unwrapped model weights plus the metrics that earned
        them, so load_old_best() can restore both together."""
        import json
        with open('./archive/metrics.txt','w') as f:
            json.dump(metrics,f)
        torch.save(self.model.module.state_dict(), self.load_model_path)

    def warpframes(self, ff, fb, frame):
        """Warp ``frame`` with the forward (``ff``) and backward (``fb``) flows.

        Returns ``(occlusion, (ff_warped, fb_warped))``, where ``occlusion``
        is the forward/backward occlusion-map pair derived from the flows.
        """
        ff_ = self.warper(ff, frame, 'ff')
        fb_ = self.warper(fb, frame, 'fb')
        warpframe = (ff_, fb_)
        occlusion = self.occwarper(ff, fb)
        return occlusion, warpframe

    #     def warpocclusion(self, ff, fb):
    #         return self.occwarper(ff,fb)

    def train(self, nb_epoch):
        """Run one full training epoch ``nb_epoch``, log stats/histograms,
        then hand the last batch's qualitative data to train_epoch_end()."""
        trainstream = tqdm(self.dataloader.train())
        self.avg_loss = AverageMeter()
        self.avg_epe = AverageMeter()
        self.model.train()
        for i, data in enumerate(trainstream):
            self.global_step += 1
            trainstream.set_description('TRAINING')

            # GET X and Frame 2
            # wdt = data['displacement'].to(self.device)
            frame = data['frame'].to(self.device)
            flow = data['flow'].cpu()
            # frame.requires_grad = True
            flow.requires_grad = False
            """
            NOTE : THIS MUST BE ADJUSTED AT DATA LOADER SIDE 
            torch.Size([1, 2, 9, 436, 1024])    -> finalflow size
            torch.Size([1, 2, 9, 108, 256])     -> pyraflow1 size
            torch.Size([1, 2, 9, 54, 128])      -> pyraflow2 size
            torch.Size([1, 2, 9, 27, 64])       -> pyraflow3 size
            """
            pyra1_frame = data['pyra1_frame'].to(self.device)
            # pyra1_frame.requires_grad = True
            pyra2_frame = data['pyra2_frame'].to(self.device)
            # pyra2_frame.requires_grad = True
            laten_frame = data['laten_frame'].to(self.device)
            # laten_frame.requires_grad = True
            self.optimizer.zero_grad()
            # forward
            with torch.set_grad_enabled(True):
                finalflow, pyraflow1, pyraflow2, latenflow = self.model(frame)

                # pyra1_frame = F.interpolate(frame, size = (108, 256))
                # pyra2_frame = F.interpolate(frame, size = (54, 128))
                # laten_frame = F.interpolate(frame, size=(27, 64))

                # Warp + occlusion at every pyramid scale.
                occlu_final, frame_final = self.warpframes(*finalflow, frame)
                occlu_pyra1, frame_pyra1 = self.warpframes(*pyraflow1, pyra1_frame)
                occlu_pyra2, frame_pyra2 = self.warpframes(*pyraflow2, pyra2_frame)
                occlu_laten, frame_laten = self.warpframes(*latenflow, laten_frame)

                # print(occlu_final[0].shape)

                cost_final = self.getcost(*frame_final, *occlu_final, frame)
                cost_pyra1 = self.getcost(*frame_pyra1, *occlu_pyra1, pyra1_frame)
                cost_pyra2 = self.getcost(*frame_pyra2, *occlu_pyra2, pyra2_frame)
                cost_laten = self.getcost(*frame_laten, *occlu_laten, laten_frame)

                # EPE is monitored on the full-resolution backward flow only.
                eper_final = self.epe(finalflow[1].cpu().detach(), flow.cpu().detach())

                loss = cost_final + cost_pyra1 + cost_pyra2 + cost_laten

                self.avg_loss.update(loss.item(), i + 1)
                self.avg_epe.update(eper_final.item(), i + 1)

                loss.backward()

                self.optimizer.step()

                self.writer.add_scalar('Loss/train',
                                       self.avg_loss.avg, self.global_step)

                self.writer.add_scalar('EPE/train',
                                       self.avg_epe.avg, self.global_step)

                trainstream.set_postfix({'epoch': nb_epoch,
                                         'loss': self.avg_loss.avg,
                                         'epe': self.avg_epe.avg})
        # FIX: CosineAnnealingLR.step() takes no loss argument (that signature
        # belongs to ReduceLROnPlateau); passing the loss tensor here was
        # interpreted as the epoch index and corrupted the LR schedule.
        self.scheduler.step()
        trainstream.close()

        # Qualitative tensors from the last batch (backward-flow branch).
        fb_frame_final = frame_final[1]
        fb_final = finalflow[1]
        fb_occlu_final = occlu_final[1]

        self.writer.add_histogram('REAL/flow_u', flow[0,0].view(-1), nb_epoch)
        self.writer.add_histogram('REAL/flow_v', flow[0,1].view(-1), nb_epoch)

        self.writer.add_histogram('PRED/flow_u_ff', finalflow[0][0,0].view(-1), nb_epoch)
        self.writer.add_histogram('PRED/flow_v_ff', finalflow[0][0,1].view(-1), nb_epoch)

        self.writer.add_histogram('PRED/flow_u_fb', finalflow[1][0,0].view(-1), nb_epoch)
        self.writer.add_histogram('PRED/flow_v_fb', finalflow[1][0,1].view(-1), nb_epoch)

        self.writer.add_histogram('REAL/occ',data['occlusion'][0].view(-1),nb_epoch)

        self.writer.add_histogram('PRED/occ_ff',occlu_final[0][0].view(-1),nb_epoch)
        self.writer.add_histogram('PRED/occ_fb',occlu_final[1][0].view(-1),nb_epoch)

        return self.train_epoch_end({'TRloss': self.avg_loss.avg,
                                     'epoch': nb_epoch,
                                     'pred_frame': fb_frame_final[0:4],
                                     'gt_frame': frame[0:4,:3],
                                     'pred_flow': flow2rgb(fb_final[0:4], False),
                                     'gt_flow': flow2rgb(flow[0:4],False),
                                     'pred_occ': 1. - fb_occlu_final[0:4],
                                     'gt_occ': data['occlusion'][0:4]})

    def train_epoch_end(self, metrics):
        """Write a qualitative prediction-vs-ground-truth grid for the epoch,
        then run validation."""
        self.model.eval()
        with torch.no_grad():
            pred_frame = metrics.get('pred_frame')
            gt_frame = metrics.get('gt_frame')
            pred_flow = metrics.get('pred_flow')
            gt_flow = metrics.get('gt_flow')
            pred_occ = replicatechannel(metrics.get('pred_occ'))
            gt_occ = replicatechannel(metrics.get('gt_occ'))

            data = torch.cat([pred_frame.cuda(), gt_frame.cuda(), pred_flow.cuda(), gt_flow.cuda(), pred_occ.cuda(), gt_occ.cuda()],0)

            data = data.cpu()

            grid = make_grid(data, nrow=4)
            grid = ToTensor()((ToPILImage()(grid)).resize((4106//6,2630//4)))
            # FIX: 'n_batch' is never placed in the metrics dict (the step came
            # out None); use the epoch number as the TensorBoard step instead.
            self.writer.add_images('TRAIN/Results', grid.unsqueeze(0), metrics.get('epoch'))
        self.val(metrics.get('epoch'))

    def val(self, nb_epoch):
        """Validate on the whole val split, log averaged loss/EPE, then pass
        last-batch qualitative data plus averages to val_end()."""
        self.model.eval()
        # if self.val_loader is None: return self.test()
        # DO VAL STUFF HERE
        valstream = tqdm(self.dataloader.val())
        self.avg_loss = AverageMeter()
        self.avg_epe = AverageMeter()
        valstream.set_description('VALIDATING')
        with torch.no_grad():
            for i, data in enumerate(valstream):
                frame = data['frame'].to(self.device)
                flow = data['flow'].cpu()
                finalflow = self.model(frame)
                occlu_final, frame_final = self.warpframes(*finalflow, frame)
                loss = self.getcost(*frame_final, *occlu_final, frame)
                eper_final = self.epe(flow.cpu().detach(), finalflow[1].cpu().detach())
                self.avg_loss.update(loss.item(), i + 1)
                self.avg_epe.update(eper_final.item(), i + 1)

        self.writer.add_scalar('Loss/val',
                               self.avg_loss.avg, self.global_step)

        self.writer.add_scalar('EPE/val',
                               self.avg_epe.avg, self.global_step)

        fb_frame_final = frame_final[1]
        fb_final = finalflow[1]
        fb_occlu_final = occlu_final[1]

        valstream.close()

        self.val_end({'VLloss': self.avg_loss.avg,
                      'VLepe':self.avg_epe.avg,
                      'epoch': nb_epoch,
                      'pred_frame': fb_frame_final[0:4],
                      'gt_frame': frame[0:4,:3],
                      'pred_flow': flow2rgb(fb_final[0:4], False),
                      'gt_flow': flow2rgb(flow[0:4],False),
                      'pred_occ': 1. - fb_occlu_final[0:4],
                      'gt_occ': data['occlusion'][0:4]})

    def val_end(self, metrics):
        """WRITE STAT FIRST"""
        if self.stat_cache is None:
            self.stat_cache = {'VLloss': metrics.get('VLloss'),
                            'VLepe': metrics.get('VLepe')}
            self.savemodel({'VLloss': metrics.get('VLloss'),
                            'VLepe': metrics.get('VLepe')})
        else:
            # FIX: the comparison was inverted — it checkpointed when the
            # validation loss got WORSE and reloaded the old weights when it
            # improved.  Save only when the new loss beats the cached best.
            if metrics.get('VLloss') < self.stat_cache.get('VLloss'):
                self.stat_cache.update({'VLloss': metrics.get('VLloss'),
                                'VLepe': metrics.get('VLepe')})
                self.savemodel(self.stat_cache)
            else:
                # No improvement: roll the model back to the stored best.
                self.load_old_best()

        self.model.eval()
        with torch.no_grad():
            pred_frame = metrics.get('pred_frame').cpu()
            gt_frame = metrics.get('gt_frame').cpu()
            pred_flow = metrics.get('pred_flow').cpu()
            gt_flow = metrics.get('gt_flow').cpu()
            pred_occ = replicatechannel(metrics.get('pred_occ')).cpu()
            gt_occ = replicatechannel(metrics.get('gt_occ')).cpu()
            data = torch.cat([pred_frame, gt_frame, pred_flow, gt_flow, pred_occ, gt_occ], 0).cpu()
            grid = make_grid(data, nrow=3)
            grid = ToTensor()((ToPILImage()(grid)).resize((4106 // 6, 2630 // 4)))
            # FIX: 'n_batch' never exists in metrics; step by epoch instead.
            self.writer.add_images('VAL/Results', grid.unsqueeze(0), metrics.get('epoch'))
        # self.test(metrics.get('epoch'))

    def load_old_best(self):
        """Restore the best checkpoint and its metrics from ./archive/."""
        import json
        with open('./archive/metrics.txt', 'r') as f:
            self.stat_cache = json.load(f)
        self.model.module.load_state_dict(torch.load(self.load_model_path))

    def test(self, nb_epoch):
        """Evaluate on the test split (no ground-truth flow available)."""
        self.model.eval()
        teststream = tqdm(self.dataloader.test())
        self.avg_loss = AverageMeter()
        teststream.set_description('TESTING')
        with torch.no_grad():
            for i, data in enumerate(teststream):
                frame = data['frame']
                finalflow = self.model(frame)

                occlu_final, frame_final = self.warpframes(*finalflow, frame)
                loss = self.getcost(*frame_final, *occlu_final, frame)

                self.avg_loss.update(loss.item(), i + 1)

        self.writer.add_scalar('Loss/test',
                               self.avg_loss.avg, self.global_step)

        fb_frame_final = frame_final[1]
        fb_final = finalflow[1]
        fb_occlu_final = occlu_final[1]

        teststream.close()

        self.test_end({'VLloss': self.avg_loss.avg,
                       'epoch': nb_epoch,
                       'pred_frame': fb_frame_final[0, :, 0:4, :].permute(1, 0, 2, 3),
                       'gt_frame': frame[0, :, 0:4, :].permute(1, 0, 2, 3),
                       'pred_flow': flow2rgb(fb_final[0, :, 0:4, :].permute(1, 0, 2, 3),False),
                       'pred_occ': 1. - fb_occlu_final[0, :, 0:4, :].permute(1, 0, 2, 3), })

    def test_end(self, metrics):
        """Write the qualitative test grid to TensorBoard."""
        self.model.eval()
        with torch.no_grad():
            pred_frame = metrics.get('pred_frame').cpu()
            gt_frame = metrics.get('gt_frame').cpu()
            pred_flow = metrics.get('pred_flow').cpu()
            pred_occ = replicatechannel(metrics.get('pred_occ')).cpu()
            data = torch.stack([pred_frame, gt_frame, pred_flow, pred_occ], 0)
            data = data.reshape(-1, 3, data.size(3), data.size(4)).cpu()
            grid = make_grid(data, nrow=4)
            # FIX: 'n_batch' never exists in metrics; step by epoch instead.
            self.writer.add_images('Test/Results', grid.unsqueeze(0), metrics.get('epoch'))

    def loggings(self, **metrics):
        pass

    def warper(self, flows, frames, mode='ff', scaled=True, nocuda=False):
        """Warp one direction: 'ff' warps frames 0..n-1 forward, 'fb' warps
        frames 1..n backward.  Delegates to the module-level warper()."""
        if mode == 'ff':
            dframe = frames[:,:3]  # given frame from 0 to n-1 predict frame 1 to n
        elif mode == 'fb':
            dframe = frames[:, 3:]  # given frame from 1 to n predict frame 0 to n-1
        else:
            raise Exception("Mode must be flow-forwad 'ff' or flow-backward 'fb'")
        warped = warper(flows.cuda(), dframe.cuda(), scaled=True, nocuda=nocuda).cuda()
        return warped

    def occwarper(self, ff, fb):
        """Derive forward/backward occlusion maps from the flow pair."""
        ff_occ, fb_occ = computeocclusion(ff, fb)
        return ff_occ, fb_occ

    def log_triplet_loss(self, anchor, positive, negative, maskp, maskn, q=1e-4):
        """Occlusion-masked photometric terms: mean |anchor-positive|^q over
        maskp and mean |anchor-negative|^q over maskn (generalized Charbonnier
        with eps=1e-2)."""
        pos = torch.mul(torch.pow((torch.abs(anchor - positive) + 1e-2), q), maskp)
        neg = torch.mul(torch.pow((torch.abs(anchor - negative) + 1e-2), q), maskn)

        pos = pos.sum() / (maskp.sum() + 1e-10)
        neg = neg.sum() / (maskn.sum()+1e-10)

        # loss = torch.log(torch.exp(pos / (neg + 1e-10)))
        # loss = loss.sum() / (mask.sum() + 1e-10)
        return pos, neg

    def getcost(self, ff_frame, fb_frame, ff_occlu, fb_occlu, frame):
        """Total cost at one pyramid scale: two triplet losses (one per flow
        direction) plus the occlusion-masked photometric terms."""
        ff_frame = ff_frame.cuda()
        fb_frame = fb_frame.cuda()
        frame = frame.cuda()

        ff_truth, fb_truth = frame[ :, 3:], frame[:, :3]

        ff_tloss = self.tripletloss(ff_truth, ff_frame, fb_frame)
        # FIX: this term duplicated ff_tloss verbatim (copy-paste), leaving
        # fb_truth unused and double-counting the forward direction.  Mirror
        # the forward term for the backward flow instead.
        fb_tloss = self.tripletloss(fb_truth, fb_frame, ff_frame)

        f_ploss, b_ploss = self.log_triplet_loss(ff_truth, ff_frame, fb_frame, ff_occlu, fb_occlu)

        total = f_ploss + b_ploss + ff_tloss + fb_tloss

        # total = ff_tloss + fb_tloss

        return total

    def epe(self, source, target):
        """Average end-point error: per-sample L2 norm of the flattened
        (source - target) difference, averaged over the batch."""
        with torch.no_grad():
            source = source.cpu().detach()
            target = target.cpu().detach()
            # from termcolor import colored
            # print(colored(f'{source.shape, target.shape, source.max(), target.max()}','red'))
            B, C, H, W = source.size()
            diff = (source - target).reshape(-1, C * H * W)
            return torch.norm(diff, p=2, dim=1).mean()

    def run(self):
        """Entry point: initialize, train for self.epoch epochs, close writer."""
        self.initialize()
        for i in range(self.epoch):
            self.train(i)
        self.writer.close()
Example #29
0
class network:
    """Training/evaluation harness for RandLA-Net semantic segmentation on S3DIS.

    Builds the train/test dataloaders, model and optimizer; train()/run()
    drive per-epoch training, periodic evaluation, and checkpointing to
    FLAGS.log_dir/s3dis_checkpoint.tar.  Scalars/histograms are logged to
    TensorBoard and a text log under FLAGS.log_dir.
    """

    def __init__(self, FLAGS):
        self.writer = SummaryWriter('output/s3dis_tensorboard')
        self.f_out = self.mkdir_log(FLAGS.log_dir)
        self.train_dataset = S3DIS('training')
        self.test_dataset = S3DIS('validation')
        self.train_dataloader = DataLoaderX(
            self.train_dataset,
            batch_size=FLAGS.batch_size,
            shuffle=True,
            num_workers=20,
            worker_init_fn=self.worker_init,
            collate_fn=self.train_dataset.collate_fn,
            pin_memory=True)
        self.test_dataloader = DataLoaderX(
            self.test_dataset,
            batch_size=FLAGS.batch_size,
            shuffle=True,
            num_workers=20,
            worker_init_fn=self.worker_init,
            collate_fn=self.test_dataset.collate_fn,
            pin_memory=True)
        print('train dataset length:{}'.format(len(self.train_dataset)))
        print('test dataset length:{}'.format(len(self.test_dataset)))
        print('train datalodaer length:{}'.format(len(self.train_dataloader)))
        print('test dataloader length:{}'.format(len(self.test_dataloader)))
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        self.config = ConfigS3DIS
        self.net = RandLANET('S3DIS', self.config)
        self.net.to(self.device)
        # torch.cuda.set_device(1)
        # if torch.cuda.device_count() > 1:
        #     log_out("Let's use multi GPUs!", self.f_out)
        #     device_ids=[1,2,3,4]
        #     self.net = nn.DataParallel(self.net, device_ids=[1,2,3,4])
        self.optimizer = optimizer.Adam(self.net.parameters(),
                                        lr=self.config.learning_rate)

        # end_points collects loss/acc/valid logits+labels for logging.
        self.end_points = {}
        self.FLAGS = FLAGS

    def mkdir_log(self, out_path):
        """Create the log directory if needed and open the append-mode
        text log file; returns the open file handle."""
        if not os.path.exists(out_path):
            os.mkdir(out_path)
        f_out = open(os.path.join(out_path, 'log_s3dis_train.txt'), 'a')
        return f_out

    def worker_init(self, worker_id):
        """Give each dataloader worker a distinct numpy RNG seed."""
        np.random.seed(np.random.get_state()[1][0] + worker_id)

    def adjust_learning_rate(self, epoch):
        """Multiply the current LR by the per-epoch decay factor from the
        config and log the new value."""
        lr = self.optimizer.param_groups[0]['lr']
        lr = lr * self.config.lr_decays[epoch]
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        self.writer.add_scalar('learning rate', lr, epoch)

    def train_one_epoch(self, epoch_count):
        """Train for one epoch: forward/backward over all batches, log loss,
        accuracy, running means, parameter histograms and mean IoU."""
        self.stat_dict = {}  # collect statistics
        self.adjust_learning_rate(epoch_count)
        self.net.train()  # set model to training mode
        iou_calc = IoUCalculator(self.config)
        for batch_idx, batch_data in enumerate(self.train_dataloader):
            t_start = time.time()
            # Move every tensor (including tensors nested in lists) to GPU.
            for key in batch_data:
                if type(batch_data[key]) is list:
                    for i in range(len(batch_data[key])):
                        batch_data[key][i] = batch_data[key][i].cuda()
                else:
                    batch_data[key] = batch_data[key].cuda()

            xyz = batch_data['xyz']  # (batch,N,3)
            neigh_idx = batch_data['neigh_idx']  # (batch,N,16)
            sub_idx = batch_data['sub_idx']  # (batch,N/4,16)
            interp_idx = batch_data['interp_idx']  # (batch,N,1)
            features = batch_data['features']  # (batch, 3, N)
            labels = batch_data['labels']  # (batch, N)
            input_inds = batch_data['input_inds']  # (batch, N)
            cloud_inds = batch_data['cloud_inds']  # (batch, 1)

            # Forward pass
            self.optimizer.zero_grad()
            self.out = self.net(xyz, neigh_idx, sub_idx, interp_idx, features,
                                labels, input_inds, cloud_inds)

            self.loss, self.end_points['valid_logits'], self.end_points[
                'valid_labels'] = compute_loss(self.out, labels, self.config)
            self.end_points['loss'] = self.loss
            # self.writer.add_graph(self.net, input_to_model=[xyz, neigh_idx, sub_idx, interp_idx, features, labels, input_inds, cloud_inds])
            self.writer.add_scalar(
                'training loss', self.loss,
                (epoch_count * len(self.train_dataloader) + batch_idx))

            self.loss.backward()
            self.optimizer.step()

            self.acc = compute_acc(self.end_points['valid_logits'],
                                   self.end_points['valid_labels'])
            self.end_points['acc'] = self.acc
            self.writer.add_scalar(
                'training accuracy', self.acc,
                (epoch_count * len(self.train_dataloader) + batch_idx))
            iou_calc.add_data(self.end_points['valid_logits'],
                              self.end_points['valid_labels'])

            # Accumulate loss/acc/iou statistics for interval logging.
            for key in self.end_points:
                if 'loss' in key or 'acc' in key or 'iou' in key:
                    if key not in self.stat_dict:
                        self.stat_dict[key] = 0
                    self.stat_dict[key] += self.end_points[key].item()
            t_end = time.time()

            batch_interval = 10
            if (batch_idx + 1) % batch_interval == 0:
                log_out(
                    ' ----step %08d batch: %08d ----' %
                    (epoch_count * len(self.train_dataloader) + batch_idx + 1,
                     (batch_idx + 1)), self.f_out)
                for key in sorted(self.stat_dict.keys()):
                    log_out(
                        'mean %s: %f---%f ms' %
                        (key, self.stat_dict[key] / batch_interval, 1000 *
                         (t_end - t_start)), self.f_out)
                    self.writer.add_scalar(
                        'training mean {}'.format(key),
                        self.stat_dict[key] / batch_interval,
                        (epoch_count * len(self.train_dataloader) + batch_idx))
                    self.stat_dict[key] = 0

            for name, param in self.net.named_parameters():
                # FIX: guard against parameters that received no gradient
                # (frozen or unused); add_histogram(None) raises.
                if param.grad is not None:
                    self.writer.add_histogram(
                        name + '_grad', param.grad,
                        (epoch_count * len(self.train_dataloader) + batch_idx))
                self.writer.add_histogram(
                    name + '_data', param,
                    (epoch_count * len(self.train_dataloader) + batch_idx))
        mean_iou, iou_list = iou_calc.compute_iou()
        self.writer.add_scalar('training mean iou', mean_iou,
                               (epoch_count * len(self.train_dataloader)))
        log_out('training mean IoU:{:.1f}'.format(mean_iou * 100), self.f_out)
        s = 'training IoU:'
        for iou_tmp in iou_list:
            s += '{:5.2f} '.format(100 * iou_tmp)
        log_out(s, self.f_out)
        # FIX: flush, not close — the writer is reused every epoch and after
        # close() all subsequent add_scalar/add_histogram calls are lost.
        self.writer.flush()

    def evaluate_one_epoch(self, epoch_count):
        """Evaluate on the test split; returns the mean eval loss."""
        self.current_loss = None
        # FIX: start from fresh statistics — previously this reused the
        # leftover training stat_dict, contaminating the eval means and the
        # returned loss with residual training values.
        self.stat_dict = {}
        self.net.eval()  # set model to eval mode (for bn and dp)
        iou_calc = IoUCalculator(self.config)
        for batch_idx, batch_data in enumerate(self.test_dataloader):
            t_start = time.time()
            for key in batch_data:
                if type(batch_data[key]) is list:
                    for i in range(len(batch_data[key])):
                        batch_data[key][i] = batch_data[key][i].cuda()
                else:
                    batch_data[key] = batch_data[key].cuda()

            xyz = batch_data['xyz']  # (batch,N,3)
            neigh_idx = batch_data['neigh_idx']  # (batch,N,16)
            sub_idx = batch_data['sub_idx']  # (batch,N/4,16)
            interp_idx = batch_data['interp_idx']  # (batch,N,1)
            features = batch_data['features']  # (batch, 3, N)
            labels = batch_data['labels']  # (batch, N)
            input_inds = batch_data['input_inds']  # (batch, N)
            cloud_inds = batch_data['cloud_inds']  # (batch, 1)

            # Forward pass
            with torch.no_grad():
                self.out = self.net(xyz, neigh_idx, sub_idx, interp_idx,
                                    features, labels, input_inds, cloud_inds)

            self.loss, self.end_points['valid_logits'], self.end_points[
                'valid_labels'] = compute_loss(self.out, labels, self.config)
            self.end_points['loss'] = self.loss
            # self.writer.add_scalar('eval loss', self.loss, (epoch_count* len(self.test_dataloader) + batch_idx))
            self.acc = compute_acc(self.end_points['valid_logits'],
                                   self.end_points['valid_labels'])
            self.end_points['acc'] = self.acc
            # self.writer.add_scalar('eval acc', self.acc, (epoch_count* len(self.test_dataloader) + batch_idx))
            iou_calc.add_data(self.end_points['valid_logits'],
                              self.end_points['valid_labels'])

            # Accumulate statistics and print out
            for key in self.end_points:
                if 'loss' in key or 'acc' in key or 'iou' in key:
                    if key not in self.stat_dict:
                        self.stat_dict[key] = 0
                    self.stat_dict[key] += self.end_points[key].item()

            t_end = time.time()

            batch_interval = 10
            if (batch_idx + 1) % batch_interval == 0:
                log_out(
                    ' ----step %08d batch: %08d ----' %
                    (epoch_count * len(self.test_dataloader) + batch_idx + 1,
                     (batch_idx + 1)), self.f_out)

        for key in sorted(self.stat_dict.keys()):
            # FIX: the text log divided by batch_interval (10), but these are
            # full-epoch accumulations; divide by the actual number of eval
            # batches, consistent with the add_scalar below.
            log_out(
                'mean %s: %f---%f ms' %
                (key, self.stat_dict[key] / float(batch_idx + 1), 1000 *
                 (t_end - t_start)), self.f_out)
            self.writer.add_scalar(
                'eval mean {}'.format(key),
                self.stat_dict[key] / (float(batch_idx + 1)),
                (epoch_count * len(self.test_dataloader)))
        mean_iou, iou_list = iou_calc.compute_iou()
        self.writer.add_scalar('eval mean iou', mean_iou,
                               (epoch_count * len(self.test_dataloader)))
        log_out('eval mean IoU:{:.1f}'.format(mean_iou * 100), self.f_out)
        s = 'eval IoU:'
        for iou_tmp in iou_list:
            s += '{:5.2f} '.format(100 * iou_tmp)
        log_out(s, self.f_out)
        # FIX: flush, not close — the same writer is used on later epochs.
        self.writer.flush()

        current_loss = self.stat_dict['loss'] / (float(batch_idx + 1))
        return current_loss

    def train(self, start_epoch):
        """Main loop: train every epoch, evaluate on epoch 0 and every 10th
        epoch thereafter, and checkpoint after each epoch."""
        loss = 0
        min_loss = 100
        current_loss = None
        for epoch in range(start_epoch, self.FLAGS.max_epoch):
            log_out('**************** EPOCH %03d ****************' % (epoch),
                    self.f_out)
            log_out(str(datetime.datetime.now()), self.f_out)
            np.random.seed()
            self.train_one_epoch(epoch)

            if epoch == 0 or epoch % 10 == 9:
                log_out('**** EVAL EPOCH %03d START****' % (epoch), self.f_out)
                current_loss = self.evaluate_one_epoch(epoch)
                log_out('**** EVAL EPOCH %03d END****' % (epoch), self.f_out)

            save_dict = {
                'epoch': epoch +
                1,  # after training one epoch, the start_epoch should be epoch+1
                'optimizer_state_dict': self.optimizer.state_dict(),
                'loss': loss,
            }

            # net.module only exists when wrapped in DataParallel; fall back
            # to the bare module otherwise (was a bare `except:`).
            try:
                save_dict['model_state_dict'] = self.net.module.state_dict()
            except AttributeError:
                save_dict['model_state_dict'] = self.net.state_dict()

            torch.save(
                save_dict,
                os.path.join(self.FLAGS.log_dir, 's3dis_checkpoint.tar'))

    def run(self):
        """Entry point: optionally resume from FLAGS.checkpoint_path, then train."""
        it = -1
        start_epoch = 0
        checkpoint_path = self.FLAGS.checkpoint_path
        if checkpoint_path is not None and os.path.isfile(checkpoint_path):
            checkpoint = torch.load(checkpoint_path)
            self.net.load_state_dict(checkpoint['model_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            start_epoch = checkpoint['epoch']
            log_out(
                "-> loaded checkpoint %s (epoch: %d)" %
                (checkpoint_path, start_epoch), self.f_out)
        self.train(start_epoch)
Example #30
0
class Logger(object):
    """Routes scalar/image/video/histogram metrics to TensorBoard and to
    console/CSV meter groups.

    Steps passed to the ``log*``/``dump`` methods are agent steps; they are
    multiplied by ``action_repeat`` to recover environment steps before
    being written.  Keys must be namespaced ``train*``/``eval*`` (or
    ``test*`` for :meth:`test_log`).
    """

    def __init__(self,
                 log_dir,
                 save_tb=False,
                 log_frequency=10000,
                 action_repeat=1,
                 agent='drq'):
        """
        Args:
            log_dir: root directory for all logger output.
            save_tb: if True, (re)create ``<log_dir>/tb`` and write
                TensorBoard events there; otherwise TB logging is a no-op.
            log_frequency: default step modulus used by ``_should_log``.
            action_repeat: multiplier converting agent steps to env steps.
            agent: key into AGENT_TRAIN_FORMAT selecting the extra train
                columns for the console output.
        """
        self._log_dir = log_dir
        self._log_frequency = log_frequency
        self._action_repeat = action_repeat
        if save_tb:
            tb_dir = os.path.join(log_dir, 'tb')
            if os.path.exists(tb_dir):
                try:
                    shutil.rmtree(tb_dir)
                except OSError:
                    # Fixed: was a bare ``except`` that also swallowed
                    # KeyboardInterrupt/SystemExit; rmtree failures surface
                    # as OSError.  Best-effort: a stale tb dir is tolerable.
                    print("logger.py warning: Unable to remove tb directory")
            self._sw = SummaryWriter(tb_dir)
        else:
            self._sw = None
        # each agent has specific output format for training
        assert agent in AGENT_TRAIN_FORMAT
        train_format = COMMON_TRAIN_FORMAT + AGENT_TRAIN_FORMAT[agent]
        self._train_mg = MetersGroup(os.path.join(log_dir, 'train'),
                                     formating=train_format)
        self._eval_mg = MetersGroup(os.path.join(log_dir, 'eval'),
                                    formating=COMMON_EVAL_FORMAT)

    def _should_log(self, step, log_frequency):
        # None/0 falls back to the frequency given at construction time.
        log_frequency = log_frequency or self._log_frequency
        return step % log_frequency == 0

    def _update_step(self, step):
        # Convert agent steps to environment steps.
        return step * self._action_repeat

    def _try_sw_log(self, key, value, step):
        # Write a scalar to TensorBoard; no-op when TB is disabled.
        step = self._update_step(step)
        if self._sw is not None:
            self._sw.add_scalar(key, value, step)

    def _try_sw_log_image(self, key, image, step):
        # Write a CHW image (each channel shown as a grid cell) to TB.
        step = self._update_step(step)
        if self._sw is not None:
            assert image.dim() == 3
            grid = torchvision.utils.make_grid(image.unsqueeze(1))
            self._sw.add_image(key, grid, step)

    def _try_sw_log_video(self, key, frames, step):
        # ``frames``: sequence of numpy frames; wrapped into a 1-video batch.
        step = self._update_step(step)
        if self._sw is not None:
            frames = torch.from_numpy(np.array(frames))
            frames = frames.unsqueeze(0)
            self._sw.add_video(key, frames, step, fps=30)

    def _try_sw_log_histogram(self, key, histogram, step):
        step = self._update_step(step)
        if self._sw is not None:
            self._sw.add_histogram(key, histogram, step)

    def _record(self, key, value, step, n):
        # Shared body of log()/eval_log(): TB gets the mean (value / n),
        # the meter group accumulates (value, n) for later averaging.
        if isinstance(value, torch.Tensor):
            value = value.item()
        self._try_sw_log(key, value / n, step)
        mg = self._train_mg if key.startswith('train') else self._eval_mg
        mg.log(key, value, n)

    def log(self, key, value, step, n=1, log_frequency=1):
        """Log a scalar, rate-limited by ``_should_log``."""
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        self._record(key, value, step, n)

    def eval_log(self, key, value, step, n=1, log_frequency=1):
        """Same as self.log(), except we don't call self._should_log().
        In other words, we always log."""
        assert key.startswith('train') or key.startswith('eval')
        self._record(key, value, step, n)

    def test_log(self, key, value, step, n=1, log_frequency=1):
        """Just writes to TensorBoard. We handle CSV writing separately."""
        assert key.startswith('test')
        if isinstance(value, torch.Tensor):
            value = value.item()
        self._try_sw_log(key, value / n, step)

    def log_param(self, key, param, step, log_frequency=None):
        """Log weight/bias values and gradients of a module as histograms."""
        if not self._should_log(step, log_frequency):
            return
        self.log_histogram(key + '_w', param.weight.data, step)
        if hasattr(param.weight, 'grad') and param.weight.grad is not None:
            self.log_histogram(key + '_w_g', param.weight.grad.data, step)
        if hasattr(param, 'bias') and hasattr(param.bias, 'data'):
            self.log_histogram(key + '_b', param.bias.data, step)
            if hasattr(param.bias, 'grad') and param.bias.grad is not None:
                self.log_histogram(key + '_b_g', param.bias.grad.data, step)

    def log_image(self, key, image, step, log_frequency=None):
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_image(key, image, step)

    def log_video(self, key, frames, step, log_frequency=None):
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_video(key, frames, step)

    def log_histogram(self, key, histogram, step, log_frequency=None):
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_histogram(key, histogram, step)

    def dump(self, step, save=True, ty=None):
        """Flush accumulated meters for ``ty`` ('train', 'eval', or both).

        Raises:
            ValueError: for an unrecognized ``ty``.  (Fixed: the original
            raised a plain f-string, which is a TypeError at runtime since
            exceptions must derive from BaseException.)
        """
        step = self._update_step(step)
        if ty is None:
            self._train_mg.dump(step, 'train', save)
            self._eval_mg.dump(step, 'eval', save)
        elif ty == 'eval':
            self._eval_mg.dump(step, 'eval', save)
        elif ty == 'train':
            self._train_mg.dump(step, 'train', save)
        else:
            raise ValueError(f'invalid log type: {ty}')