def forward(self, data, label, pro, former_avg):
    avg_list = self.avg(data, label, pro)
    synchronize(self.num_process)
    if self.avg_choice:
        dist.all_reduce(avg_list, op=dist.ReduceOp.SUM)
        weighted_avg = avg_list[:self.dim] / avg_list[-1:]
    else:
        loss_value = avg_list[-1:].clone()
        dist.all_reduce(loss_value, op=dist.ReduceOp.MIN)
        if loss_value != avg_list[-1:]:
            avg_list[:self.dim] = torch.zeros(self.dim).double()
            avg_list[-1:] = 0.0
        else:
            avg_list[-1:] = 1.0
        synchronize(self.num_process)
        dist.all_reduce(avg_list, op=dist.ReduceOp.SUM)
        weighted_avg = avg_list[:self.dim] / avg_list[-1:]
    norm = torch.norm(weighted_avg - former_avg)
    M = weighted_avg.size(0)
    flag = bool(norm**2 / M <= self.tol)
    weighted_avg_list = weighted_avg.expand(self.num, -1)
    self.X = weighted_avg_list + (self.X - weighted_avg_list) * math.exp(-self.drift * self.timestep)
    if self.noise_choice:
        noise_term = torch.randn(self.num * self.num_process, self.dim)[self.rank * self.num : self.rank * self.num + self.num, ].double()
        # self.X += self.noise * math.sqrt(self.timestep) * (self.X - weighted_avg_list).mul(noise_term)
        bm = torch.randn(self.num * self.num_process, self.dim)[self.rank * self.num : self.rank * self.num + self.num, ].double()
        self.X += self.noise * math.sqrt(self.timestep) * bm
    flag_noise = False
    if flag:
        bm = torch.randn(self.num * self.num_process, self.dim)[self.rank * self.num : self.rank * self.num + self.num, ].double()
        self.X += self.noise * math.sqrt(self.timestep) * bm
        flag_noise = True
        if self.rank == 0:
            print("noise!")
    return weighted_avg, flag_noise
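
The `synchronize(self.num_process)` calls above act as a barrier so that every process reaches the all-reduce together. The helper itself is not part of this listing; a minimal sketch of what it typically looks like (assuming `torch.distributed` has already been initialized, e.g. with a gloo group for CPU tensors) is:

import torch.distributed as dist

def synchronize(num_processes):
    # Sketch only: no-op for a single process, otherwise wait until every
    # rank has reached this point before continuing.
    if num_processes == 1:
        return
    dist.barrier()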
Example #2
def main():
    parser = argparse.ArgumentParser(description="RetinaNet")
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument("--start_epoch", type=int, default=1)
    parser.add_argument("--dist", action="store_true", default=False)

    args = parser.parse_args()
    if args.dist:
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend="nccl", init_method="env://")
        utils.synchronize()

    train(args.dist, args.start_epoch, args.local_rank)
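
This example and the next two call `utils.synchronize()` immediately after `init_process_group`. That helper is not shown in this listing; a common pattern for it, a guarded `dist.barrier()` in the style popularized by maskrcnn-benchmark-derived code bases, looks roughly like:

import torch.distributed as dist

def synchronize():
    # Barrier across all workers; silently a no-op when distributed
    # training is unavailable, not initialized, or single-process.
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    if dist.get_world_size() == 1:
        return
    dist.barrier()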
Example #3
def main():
    # os.environ["CUDA_VISIBLE_DEVICES"]="1"
    parser = argparse.ArgumentParser(description="ATSS")
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument("--start_epoch", type=int, default=1)
    parser.add_argument("--dist", action="store_true")

    args = parser.parse_args()
    if args.dist:
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend="nccl", init_method="env://")
        utils.synchronize()

    train(args.dist, args.start_epoch, args.local_rank)
Example #4
def main():
    # os.environ["CUDA_VISIBLE_DEVICES"]="0"
    parser = argparse.ArgumentParser(description="FCOS")
    parser.add_argument("--local_rank", type=int, default=0)
    gpu_nums = torch.cuda.device_count()
    is_dist = gpu_nums > 1

    args = parser.parse_args()
    if is_dist:
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend="nccl", init_method="env://")
        utils.synchronize()

    train(is_dist, args.local_rank)
Example #5
def main_dist(uid: str, **kwargs):
    """
    uid is a unique identifier for the experiment name
    Can be kept same as a previous run, by default will start executing
    from latest saved model
    **kwargs: allows arbit arguments of cfg to be changed
    """
    cfg = conf
    num_gpus = torch.cuda.device_count()
    cfg.num_gpus = num_gpus

    if num_gpus > 1:

        if 'local_rank' in kwargs:
            # We are doing distributed parallel
            cfg.do_dist = True
            torch.cuda.set_device(kwargs['local_rank'])
            torch.distributed.init_process_group(
                backend="nccl", init_method="env://"
            )
            synchronize()
        else:
            # We are doing data parallel
            cfg.do_dist = False

    # Update the config file depending on the command line args
    cfg = update_from_dict(cfg, kwargs, key_maps)

    # Freeze the cfg, can no longer be changed
    cfg.freeze()
    # print(cfg)
    # Initialize learner
    learn = learner_init(uid, cfg)
    # Train or Test
    if not (cfg.only_val or cfg.only_test):
        learn.fit(epochs=cfg.epochs, lr=cfg.lr)
    else:
        if cfg.only_val:
            learn.testing(learn.data.valid_dl)
        if cfg.only_test:
            learn.testing(learn.data.test_dl)
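
The `update_from_dict(cfg, kwargs, key_maps)` call above merges command-line overrides into the config before it is frozen. Its implementation is not shown here; a rough sketch of the idea, assuming a yacs-style CfgNode and a hypothetical `key_maps` dict that maps short CLI names to dotted config paths, might be:

def update_from_dict(cfg, kwargs, key_maps):
    # Hypothetical sketch: turn {'bs': 16} plus key_maps={'bs': 'train.batch_size'}
    # into a flat [key, value, ...] list that yacs can merge.
    overrides = []
    for key, value in kwargs.items():
        full_key = key_maps.get(key, key)
        overrides.extend([full_key, value])
    cfg.merge_from_list(overrides)
    return cfg

A real implementation would also have to skip keys (such as local_rank) that are not part of the config.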
Example #6
def inference(model, data_loader, dataset_name, device='cuda', output_folder=None,
              expected_results=(), expected_results_sigma_tol=4):
    device = torch.device(device)
    num_devices = get_world_size()
    logger = logging.getLogger("RetinaNet.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset({} images).".format(dataset_name, len(dataset)))
    total_timer = Timer()
    inference_timer = Timer()
    total_timer.tic()
    predictions = compute_on_dataset(model, data_loader, device, inference_timer)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = total_timer.toc()
    total_time_str = get_time_str(total_time)
    logger.info(
        "Total run time: {} ({} s / img per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )

    predictions = accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    if output_folder:
        torch.save(predictions, os.path.join(output_folder, "predictions.pth"))

    extra_args = dict(
        expected_results=expected_results,
        expected_results_sigma_tol=expected_results_sigma_tol,
    )

    return evaluate(dataset=dataset,
                    predictions=predictions,
                    output_folder=output_folder,
                    **extra_args)
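
`accumulate_predictions_from_multiple_gpus` gathers each rank's prediction dict so that the main process can save and evaluate the full set. It is not reproduced in this listing; with a recent PyTorch (1.8+) the same effect can be sketched with `all_gather_object`:

import torch.distributed as dist

def accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
    # Sketch only: gather the per-rank {image_id: prediction} dicts onto
    # every process and merge them into a single dict.
    if not (dist.is_available() and dist.is_initialized()) or dist.get_world_size() == 1:
        return predictions_per_gpu
    gathered = [None] * dist.get_world_size()
    dist.all_gather_object(gathered, predictions_per_gpu)
    merged = {}
    for part in gathered:
        merged.update(part)
    return merged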
Example #7
    input_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
    ])

    data_kwargs = {
        'base_size': args.base_size,
        'crop_size': args.crop_size,
        'transform': input_transform
    }

    val_dataset = get_segmentation_dataset(args.dataset,
                                           split=args.split,
                                           mode=args.mode,
                                           **data_kwargs)
    sampler = make_data_sampler(val_dataset, False, distributed)
    batch_sampler = data.BatchSampler(sampler=sampler,
                                      batch_size=args.batch_size,
                                      drop_last=False)
    val_data = data.DataLoader(val_dataset,
                               shuffle=False,
                               batch_sampler=batch_sampler,
                               num_workers=args.num_workers)
    metric = SegmentationMetric(val_dataset.num_class)

    metric = validate(model, val_data, metric, device)
    ptutil.synchronize()
    pixAcc, mIoU = ptutil.accumulate_metric(metric)
    if ptutil.is_main_process():
        print('pixAcc: %.4f, mIoU: %.4f' % (pixAcc, mIoU))
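
`ptutil.accumulate_metric` reduces the per-process statistics so that only the main process reports the final pixAcc/mIoU, while the other ranks receive `None` (which is why the callers guard with `is_main_process()` or `is not None`). A simplified sketch of that reduction, under the assumption that the metric exposes hypothetical tensor counters `total_correct`, `total_label`, `total_inter` and `total_union`, could be:

import torch.distributed as dist

def accumulate_metric(metric):
    # Sketch only: sum the raw counters over all ranks, then let rank 0
    # turn them into pixel accuracy and mean IoU.
    if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
        for counter in (metric.total_correct, metric.total_label,
                        metric.total_inter, metric.total_union):
            dist.all_reduce(counter, op=dist.ReduceOp.SUM)
        if dist.get_rank() != 0:
            return None, None
    pix_acc = metric.total_correct / (metric.total_label + 1e-10)
    iou = metric.total_inter / (metric.total_union + 1e-10)
    return float(pix_acc), float(iou.mean())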
Example #8
File: base.py Project: JarvisLL/ACE
    def training(self):
        self.net.train()
        save_to_disk = ptutil.get_rank() == 0
        start_training_time = time.time()
        trained_time = 0
        mIoU = 0
        best_miou = 0
        tic = time.time()
        end = time.time()
        iteration, max_iter = 0, self.max_iter
        save_iter, eval_iter = self.per_iter * self.config.TRAIN.SAVE_EPOCH, self.per_iter * self.config.TRAIN.EVAL_EPOCHS
        self.logger.info("Start training, total epochs {:3d} = total iteration: {:6d}".format(self.config.TRAIN.EPOCHS, max_iter))
        for i, (image, target) in enumerate(self.train_loader):
            iteration += 1
            self.scheduler.step()
            self.optimizer.zero_grad()
            image, target = image.to(self.device, dtype=self.dtype), target.to(self.device)
            if self.config.DATASET.IMG_TRANSFORM is False:
                image = image.permute(0, 3, 1, 2)
            outputs = self.net(image)
            loss_dict = self.criterion(outputs, target)
            loss_dict_reduced = ptutil.reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss = sum(loss for loss in loss_dict.values())
            if self.config.TRAIN.MIXED_PRECISION:
                with amp.scale_loss(loss, self.optimizer) as scale_loss:
                    scale_loss.backward()
            else:
                loss.backward()

            self.optimizer.step()
            trained_time += time.time() - end
            end = time.time()
            if iteration % self.config.TRAIN.LOG_STEP == 0:
                eta_seconds = int((trained_time / iteration) * (max_iter - iteration))
                log_str = ["Iteration {:06d} , Lr: {:.5f}, Cost: {:.2f}s, Eta: {}"
                               .format(iteration, self.optimizer.param_groups[0]['lr'], time.time() - tic,
                                       str(datetime.timedelta(seconds=eta_seconds))),
                           "total_loss: {:.3f}".format(losses_reduced.item())]
                log_str = ', '.join(log_str)
                self.logger.info(log_str)
                tic = time.time()
            if save_to_disk and iteration % save_iter == 0:
                model_path = os.path.join(self.config.TRAIN.SAVE_DIR, "{}_{}_{}_iter_{:06d}.pth"
                                          .format(self.config.MODEL.NAME, self.config.TRAIN.SEG_LOSS, self.config.DATASET.NAME, iteration))
                ptutil.save_model(self.net, model_path, self.logger)
            if self.config.TRAIN.EVAL_EPOCHS > 0 and iteration % eval_iter == 0 and iteration != max_iter:
                metrics = ptutil.validate(self.net, self.valid_loader, self.metric, self.device, self.config)
                ptutil.synchronize()
                pixAcc, mIoU = ptutil.accumulate_metric(metrics)
                if mIoU is not None and mIoU >= best_miou:
                    best_miou = mIoU
                    model_path = os.path.join(self.config.TRAIN.SAVE_DIR, "{}_{}_{}_best.pth"
                                          .format(self.config.MODEL.NAME, self.config.TRAIN.SEG_LOSS, self.config.DATASET.NAME))
                    ptutil.save_model(self.net, model_path, self.logger)
                if pixAcc is not None:
                    self.logger.info('pixAcc: {:.4f}, mIoU: {:.4f}'.format(pixAcc, mIoU))
                self.net.train()
        if save_to_disk:
            model_path = os.path.join(self.config.TRAIN.SAVE_DIR, "{}_{}_{}_iter_{:06d}.pth"
                                      .format(self.config.MODEL.NAME, self.config.TRAIN.SEG_LOSS, self.config.DATASET.NAME, max_iter))
            ptutil.save_model(self.net, model_path, self.logger)
        total_training_time = int(time.time() - start_training_time)
        total_time_str = str(datetime.timedelta(seconds=total_training_time))
        self.logger.info("Total training time: {} ({:.4f} s / it)".format(total_time_str, total_training_time / max_iter))
        # eval after training
        if not self.config.TRAIN.SKIP_EVAL:
            metrics = ptutil.validate(self.net, self.valid_loader, self.metric, self.device, self.config)
            ptutil.synchronize()
            pixAcc, mIoU = ptutil.accumulate_metric(metrics)
            if pixAcc is not None:
                self.logger.info('After training, pixAcc: {:.4f}, mIoU: {:.4f}'.format(pixAcc, mIoU))
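
`ptutil.reduce_loss_dict` above is used purely for logging: it averages each loss component across processes so that the printed total_loss reflects all GPUs, while gradients still come from the local, un-reduced losses. A minimal sketch of that pattern, assuming `torch.distributed` is initialized, is:

import torch
import torch.distributed as dist

def reduce_loss_dict(loss_dict):
    # Sketch only: average every scalar loss tensor over all ranks.
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())
        values = torch.stack([loss_dict[k] for k in names])
        dist.all_reduce(values, op=dist.ReduceOp.SUM)
        values /= world_size
    return dict(zip(names, values))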
Example #9
    semart_val_dataloader = DataLoader(dataset=semart_val,
                                       batch_size=args.batch_size,
                                       drop_last=False,
                                       num_workers=4)

    wpi_dataloader = DataLoader(dataset=wpi_data,
                                batch_size=args.batch_size,
                                drop_last=False,
                                num_workers=4)

    if int(os.environ["WORLD_SIZE"]) > 1:
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
    print("world size: {}".format(os.environ["WORLD_SIZE"]))
    print("rank: {}".format(args.local_rank))
    synchronize()

    if int(os.environ["WORLD_SIZE"]) > 1:
        # move the model to this rank's GPU before wrapping it in DDP
        combined_model = CombinedModel(len(vectorizer.vocabulary_),
                                       device,
                                       args.resnet,
                                       l2_norm=True).cuda(args.local_rank)
        combined_model = torch.nn.parallel.DistributedDataParallel(
            combined_model,
            device_ids=[args.local_rank],
            output_device=args.local_rank)
    else:
        combined_model = CombinedModel(len(vectorizer.vocabulary_),
                                       device,
                                       args.resnet,
                                       l2_norm=True).to(device)
Example #10
def main(rank, args):
    """
    Parameters
    ----------
    rank : int
        Subprocess id
    args : dict
        Configuration
    """
    if rank == 0:
        t1 = time.time()

    set_random_seed(args['seed'])
    # Removing the line below will result in problems for multiprocessing
    torch.set_num_threads(1)

    # Setup dataset and data loader
    dataset = MoleculeDataset(args['dataset'],
                              args['order'], ['train', 'val'],
                              subset_id=rank,
                              n_subsets=args['num_processes'])

    # Note that currently the batch size for the loaders should only be 1.
    train_loader = DataLoader(dataset.train_set,
                              batch_size=args['batch_size'],
                              shuffle=True,
                              collate_fn=dataset.collate)
    val_loader = DataLoader(dataset.val_set,
                            batch_size=args['batch_size'],
                            shuffle=True,
                            collate_fn=dataset.collate)

    if rank == 0:
        try:
            from tensorboardX import SummaryWriter
            writer = SummaryWriter(args['log_dir'])
        except ImportError:
            print(
                'If you want to use tensorboard, install tensorboardX with pip.'
            )
            writer = None
        train_printer = Printer(args['nepochs'], len(dataset.train_set),
                                args['batch_size'], writer)
        val_printer = Printer(args['nepochs'], len(dataset.val_set),
                              args['batch_size'])
    else:
        val_printer = None

    # Initialize model
    model = DGMG(atom_types=dataset.atom_types,
                 bond_types=dataset.bond_types,
                 node_hidden_size=args['node_hidden_size'],
                 num_prop_rounds=args['num_propagation_rounds'],
                 dropout=args['dropout'])

    if args['num_processes'] == 1:
        from utils import Optimizer
        optimizer = Optimizer(args['lr'],
                              Adam(model.parameters(), lr=args['lr']))
    else:
        from utils import MultiProcessOptimizer
        optimizer = MultiProcessOptimizer(
            args['num_processes'], args['lr'],
            Adam(model.parameters(), lr=args['lr']))

    if rank == 0:
        t2 = time.time()
    best_val_prob = 0

    # Training
    for epoch in range(args['nepochs']):
        model.train()
        if rank == 0:
            print('Training')

        for i, data in enumerate(train_loader):
            log_prob = model(actions=data, compute_log_prob=True)
            prob = log_prob.detach().exp()

            loss_averaged = -log_prob
            prob_averaged = prob
            optimizer.backward_and_step(loss_averaged)
            if rank == 0:
                train_printer.update(epoch + 1, loss_averaged.item(),
                                     prob_averaged.item())

        synchronize(args['num_processes'])

        # Validation
        val_log_prob = evaluate(epoch, model, val_loader, val_printer)
        if args['num_processes'] > 1:
            dist.all_reduce(val_log_prob, op=dist.ReduceOp.SUM)
        val_log_prob /= args['num_processes']
        # Strictly speaking, the computation of probability here is different from what is
        # performed on the training set as we first take an average of log likelihood and then
        # take the exponentiation. By Jensen's inequality, the resulting value is then a
        # lower bound of the real probabilities.
        val_prob = (-val_log_prob).exp().item()
        val_log_prob = val_log_prob.item()
        if val_prob >= best_val_prob:
            if rank == 0:
                torch.save({'model_state_dict': model.state_dict()},
                           args['checkpoint_dir'])
                print(
                    'Old val prob {:.10f} | new val prob {:.10f} | model saved'
                    .format(best_val_prob, val_prob))
            best_val_prob = val_prob
        elif epoch >= args['warmup_epochs']:
            optimizer.decay_lr()

        if rank == 0:
            print('Validation')
            if writer is not None:
                writer.add_scalar('validation_log_prob', val_log_prob, epoch)
                writer.add_scalar('validation_prob', val_prob, epoch)
                writer.add_scalar('lr', optimizer.lr, epoch)
            print('Validation log prob {:.4f} | prob {:.10f}'.format(
                val_log_prob, val_prob))

        synchronize(args['num_processes'])

    if rank == 0:
        t3 = time.time()
        print('It took {} to setup.'.format(datetime.timedelta(seconds=t2 -
                                                               t1)))
        print('It took {} to finish training.'.format(
            datetime.timedelta(seconds=t3 - t2)))
        print(
            '--------------------------------------------------------------------------'
        )
        print('On average, an epoch takes {}.'.format(
            datetime.timedelta(seconds=(t3 - t2) / args['nepochs'])))
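
When args['num_processes'] > 1 the example relies on a `MultiProcessOptimizer` that must average gradients across the worker processes before stepping, since each process sees its own data subset. That class is not part of this listing; a rough sketch of how `backward_and_step` and `decay_lr` could work (assuming a gloo process group over CPU tensors) is:

import torch.distributed as dist

class MultiProcessOptimizer:
    # Hypothetical sketch of the helper assumed by the example above.
    def __init__(self, n_processes, lr, optimizer):
        self.n_processes = n_processes
        self.lr = lr
        self.optimizer = optimizer

    def backward_and_step(self, loss):
        self.optimizer.zero_grad()
        loss.backward()
        # Average gradients so every process applies the same update.
        for group in self.optimizer.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    dist.all_reduce(p.grad, op=dist.ReduceOp.SUM)
                    p.grad /= self.n_processes
        self.optimizer.step()

    def decay_lr(self, decay_rate=0.99):
        self.lr *= decay_rate
        for group in self.optimizer.param_groups:
            group['lr'] = self.lr

Example #11 uses a variant of the same idea that additionally clips the gradients (max_grad_norm) and returns the gradient norm from backward_and_step.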
Example #11
def main(rank, dev_id, args):
    set_seed()
    # Removing the line below will result in problems for multiprocessing
    if args['num_devices'] > 1:
        torch.set_num_threads(1)
    if dev_id == -1:
        args['device'] = torch.device('cpu')
    else:
        args['device'] = torch.device('cuda:{}'.format(dev_id))
        # Set current device
        torch.cuda.set_device(args['device'])

    train_set, val_set = load_dataset(args)
    get_center_subset(train_set, rank, args['num_devices'])
    train_loader = DataLoader(train_set, batch_size=args['batch_size'],
                              collate_fn=collate_center, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=args['batch_size'],
                            collate_fn=collate_center, shuffle=False)

    model = WLNReactionCenter(node_in_feats=args['node_in_feats'],
                              edge_in_feats=args['edge_in_feats'],
                              node_pair_in_feats=args['node_pair_in_feats'],
                              node_out_feats=args['node_out_feats'],
                              n_layers=args['n_layers'],
                              n_tasks=args['n_tasks']).to(args['device'])
    model.train()
    if rank == 0:
        print('# trainable parameters in the model: ', count_parameters(model))

    criterion = BCEWithLogitsLoss(reduction='sum')
    optimizer = Adam(model.parameters(), lr=args['lr'])
    if args['num_devices'] <= 1:
        from utils import Optimizer
        optimizer = Optimizer(model, args['lr'], optimizer, max_grad_norm=args['max_norm'])
    else:
        from utils import MultiProcessOptimizer
        optimizer = MultiProcessOptimizer(args['num_devices'], model, args['lr'],
                                          optimizer, max_grad_norm=args['max_norm'])

    total_iter = 0
    rank_iter = 0
    grad_norm_sum = 0
    loss_sum = 0
    dur = []

    for epoch in range(args['num_epochs']):
        t0 = time.time()
        for batch_id, batch_data in enumerate(train_loader):
            total_iter += args['num_devices']
            rank_iter += 1

            batch_reactions, batch_graph_edits, batch_mol_graphs, \
            batch_complete_graphs, batch_atom_pair_labels = batch_data
            labels = batch_atom_pair_labels.to(args['device'])
            pred, biased_pred = reaction_center_prediction(
                args['device'], model, batch_mol_graphs, batch_complete_graphs)
            loss = criterion(pred, labels) / len(batch_reactions)
            loss_sum += loss.cpu().detach().data.item()
            grad_norm_sum += optimizer.backward_and_step(loss)

            if rank_iter % args['print_every'] == 0 and rank == 0:
                progress = 'Epoch {:d}/{:d}, iter {:d}/{:d} | ' \
                           'loss {:.4f} | grad norm {:.4f}'.format(
                    epoch + 1, args['num_epochs'], batch_id + 1, len(train_loader),
                    loss_sum / args['print_every'], grad_norm_sum / args['print_every'])
                print(progress)
                grad_norm_sum = 0
                loss_sum = 0

            if total_iter % args['decay_every'] == 0:
                optimizer.decay_lr(args['lr_decay_factor'])
            if total_iter % args['decay_every'] == 0 and rank == 0:
                if epoch >= 1:
                    dur.append(time.time() - t0)
                    print('Training time per {:d} iterations: {:.4f}'.format(
                        rank_iter, np.mean(dur)))
                total_samples = total_iter * args['batch_size']
                prediction_summary = 'total samples {:d}, (epoch {:d}/{:d}, iter {:d}/{:d}) '.format(
                    total_samples, epoch + 1, args['num_epochs'], batch_id + 1, len(train_loader)) + \
                      reaction_center_final_eval(args, args['top_ks_val'], model, val_loader, easy=True)
                print(prediction_summary)
                with open(args['result_path'] + '/val_eval.txt', 'a') as f:
                    f.write(prediction_summary)
                torch.save({'model_state_dict': model.state_dict()},
                           args['result_path'] + '/model_{:d}.pkl'.format(total_samples))
                t0 = time.time()
                model.train()
        synchronize(args['num_devices'])
Example #12
def main(rank, args):
    set_random_seed(args['seed'])
    torch.set_num_threads(1)

    device = 'cuda' if (torch.cuda.is_available() and args['gpu']) else 'cpu'

    if rank == 0:
        print("Preparing data...")
    if args['dataset'] == "mnist":
        transform = transforms.Compose(
            [transforms.ToTensor(),
             transforms.Normalize((0.5, ), (1.0, ))])
        data_train = datasets.MNIST(root="./data_cache/",
                                    transform=transform,
                                    train=True,
                                    download=True)
        data_test = datasets.MNIST(root="./data_cache/",
                                   transform=transform,
                                   train=False)
    else:
        dataset = Data(args['dataset'], args['repeat_time'], args['seed'],
                       args['n_dim'], args['sparsity'], rank,
                       args['reg_fista'], args['iteration'],
                       args['tolerance_fista'])
    res_loss = torch.zeros(args['repeat_time']).double()
    res_acc = torch.zeros(args['repeat_time']).double()
    log_loss = list()
    log_loss_ave = list()
    log_acc = list()
    log_loss_train = list()
    log_loss_ave_train = list()
    log_acc_train = list()
    '''
        gisette: sparse logistic regression on the Gisette datasets,
        picture_reconstruction: optimizing the angles of the Radon Transformation of the picture of Lenna,
        compressive_sensing: the same numerical experiment as the one in New PIHT paper by Zhang Xiaoqun,
        mnist: numerical experiments with optimizing neural networks on the MNIST dataset,
        ras: numerical experiments with rastrigin function optimization problem
    '''
    if args['dataset'] == "gisette":
        # preparing data...
        data_valid, label_valid = dataset.data_valid, dataset.label_valid
        data_valid, label_valid = data_valid.to(device), label_valid.to(device)
        data_train, label_train = dataset.data, dataset.label
        data_train, label_train = data_train.to(device), label_train.to(device)
        feature_dim = dataset.feature_dim
        trainloader = DataLoader(dataset,
                                 batch_size=args['batch_loss'],
                                 shuffle=True)
        weighted_avg = torch.zeros(1, feature_dim + 1).double().to(device)

        # initialize CBO optimizer
        if args['num_processes'] == 1 or device == 'cuda':
            optimizer = CBO_optimizer(num=args['num_particle'],
                                      dim=feature_dim + 1,
                                      drift=args['drift'],
                                      noise=args['noise'],
                                      temp=args['temperature'],
                                      timestep=args['timestep'],
                                      tol=args['tolerance'],
                                      seed=args['seed'],
                                      batch_avg=args['batch_avg'],
                                      avg_choice=args['avg_choice'],
                                      noise_choice=args['noise_choice'],
                                      device=device,
                                      lam_reg=args['reg'],
                                      batch_loss=args['batch_loss'])
        else:
            optimizer = MultiprocessCBO(num_process=args['num_processes'],
                                        rank=rank,
                                        num=args['num_particle'],
                                        dim=feature_dim + 1,
                                        drift=args['drift'],
                                        noise=args['noise'],
                                        temp=args['temperature'],
                                        timestep=args['timestep'],
                                        tol=args['tolerance'],
                                        seed=args['seed'],
                                        batch_avg=args['batch_avg'],
                                        avg_choice=args['avg_choice'],
                                        noise_choice=args['noise_choice'],
                                        device=device,
                                        lam_reg=args['reg'],
                                        batch_loss=args['batch_loss'])

        # optimizing...
        if rank == 0:
            print("Training...")
        epo_flag = False
        time_start = time.time()
        for i in range(args['epoch']):
            for train_batch, batch in enumerate(trainloader):
                if rank == 0:
                    t1 = time.time()
                data, label = batch_loader(batch)
                data, label = data.to(device), label.to(device)
                former = copy.deepcopy(weighted_avg)
                weighted_avg, noise_flag = optimizer.forward(
                    data, label, args['problem'], weighted_avg)

                # synchronize(args['num_processes'])
                if rank == 0:
                    t2 = time.time()

                # noise_flag: whether to introduce BM in CBO
                # log recent results
                '''
                if noise_flag and rank == 0:
                    write_res('./results/train_loss_NCBO_logistic_log_loss_' + str(args['num_particle']) + '.npy', log_loss_train)
                    write_res('./results/train_loss_NCBO_logistic_log_acc_' + str(args['num_particle']) + '.npy', log_acc_train)
                    write_res('./results/loss_NCBO_logistic_log_loss_' + str(args['num_particle']) + '.npy', log_loss)
                    write_res('./results/loss_NCBO_ori_logistic_log_acc_' + str(args['num_particle']) + '.npy', log_acc)
                '''

                # validation
                u = weighted_avg[:-1]
                v = weighted_avg[-1:]
                pred_train = batch_logistic_class(data_train, u, v)
                pred_train_cnt = (pred_train == label_train).sum().double()
                loss_train = test_logistic_l0(data_train, label_train, u, v,
                                              args['reg'])
                log_loss_train.append(loss_train.item())
                log_acc_train.append(pred_train_cnt.item() /
                                     data_train.size(0))
                pred = batch_logistic_class(data_valid, u, v)
                pred_cnt = (pred == label_valid).sum().double()
                loss = test_logistic_l0(data_valid, label_valid, u, v,
                                        args['reg'])
                log_loss.append(loss.item())
                log_acc.append(pred_cnt.item() / data_valid.size(0))
                if rank == 0:
                    print(
                        "epoch: {:d} | iter: {:d} | loss: {:f} | validation precision: {:f}"
                        .format(i, train_batch, loss.item(),
                                pred_cnt.item() / data_valid.size(0)))

                synchronize(args['num_processes'])

                if rank == 0:
                    t3 = time.time()
                    print("training time: {:f}".format(t2 - t1))
                    print("test time: {:f}".format(t3 - t2))

                # check whether to reach the stopping criterion
                delta_error = torch.norm(weighted_avg - former)
                err = delta_error * delta_error / weighted_avg.size(0)
                if err <= args['tolerance_stop']:
                    # print(former_avg)
                    # average, variance = var(former_avg)
                    # print(np.linalg.norm(former_avg), average, variance)
                    if rank == 0:
                        print("Consensus!")
                    epo_flag = True
                    break
            if epo_flag:
                optimizer._initialize_particles()
                break
        time_end = time.time()
        if rank == 0:
            print("total time: {:f}".format(time_end - time_start))
            print(min(log_loss_train))
            print(max(log_acc_train))
            print(min(log_loss))
            print(max(log_acc))
            write_res(
                './results/train_loss_RCBO_logistic_log_loss_' +
                str(args['num_particle']) + '.npy', log_loss_train)
            write_res(
                './results/train_loss_RCBO_logistic_log_acc_' +
                str(args['num_particle']) + '.npy', log_acc_train)
            write_res(
                './results/loss_RCBO_logistic_log_loss_' +
                str(args['num_particle']) + '.npy', log_loss)
            write_res(
                './results/loss_RCBO_logistic_log_acc_' +
                str(args['num_particle']) + '.npy', log_acc)
            print("Results written!")
        sys.exit("break!")
    elif args['dataset'] == "picture_reconstruction":
        data, label, theta = dataset.data, dataset.label, dataset.theta
        weighted_avg = torch.zeros(theta.size(0)).double()

        # initialize CBO optimizer
        if args['num_processes'] == 1:
            optimizer = CBO_optimizer(num=args['num_particle'],
                                      dim=theta.size(0),
                                      drift=args['drift'],
                                      noise=args['noise'],
                                      temp=args['temperature'],
                                      timestep=args['timestep'],
                                      tol=args['tolerance'],
                                      seed=args['seed'],
                                      batch_avg=args['batch_avg'],
                                      avg_choice=args['avg_choice'],
                                      noise_choice=args['noise_choice'],
                                      device=device,
                                      lam_reg=args['reg'],
                                      batch_loss=args['batch_loss'])
        else:
            optimizer = MultiprocessCBO(num_process=args['num_processes'],
                                        rank=rank,
                                        num=args['num_particle'],
                                        dim=theta.size(0),
                                        drift=args['drift'],
                                        noise=args['noise'],
                                        temp=args['temperature'],
                                        timestep=args['timestep'],
                                        tol=args['tolerance'],
                                        seed=args['seed'],
                                        batch_avg=args['batch_avg'],
                                        avg_choice=args['avg_choice'],
                                        noise_choice=args['noise_choice'],
                                        device=device,
                                        lam_reg=args['reg'],
                                        batch_loss=args['batch_loss'])
        optimizer._normalize_particles()
        optimizer.X *= 179.0

        # optimizing...
        if rank == 0:
            print("Training...")
        for i in range(args['epoch']):
            if rank == 0:
                t1 = time.time()
            former = copy.deepcopy(weighted_avg)
            weighted_avg, noise_flag = optimizer.forward(
                data, label, args['problem'], weighted_avg)

            synchronize(args['num_processes'])
            if rank == 0:
                t2 = time.time()

            # validation
            acc = theta_acc(weighted_avg, theta)
            loss = loss_recons(weighted_avg, data, label)
            log_loss.append(loss)
            log_acc.append(acc)
            if rank == 0:
                print(
                    "epoch: {:d} | loss: {:f} | validation error: {:f}".format(
                        i, loss, acc))

            synchronize(args['num_processes'])

            if rank == 0:
                t3 = time.time()
                print("training time: {:f}".format(t2 - t1))
                print("test time: {:f}".format(t3 - t2))

            # check whether to reach the stopping criterion
            delta_error = torch.norm(weighted_avg - former)
            err = delta_error**2 / weighted_avg.size(0)
            print("error: ", err)
            if err <= args['tolerance_stop']:
                optimizer._initialize_particles()
                # print(former_avg)
                # average, variance = var(former_avg)
                # print(np.linalg.norm(former_avg), average, variance)
                if rank == 0:
                    print("Consensus!")
                break
        if rank == 0:
            write_res(
                './results/pic_loss_' + str(args['num_particle']) + '.npy',
                log_loss)
            write_res(
                './results/pic_acc_' + str(args['num_particle']) + '.npy',
                log_acc)
            print("Results written!")
        sys.exit("break!")
    elif args['dataset'] == "compressive_sensing":
        # data for compressive sensing
        data, label, observe, initial, L = dataset.data, dataset.label, dataset.observe, dataset.initial, dataset.L
        n_dim = dataset.sig_dim

        # initialize CBO optimizer
        if rank == 0:
            print("initialize CBO optimizer...")
        if args['num_processes'] == 1:
            optimizer = CBO_optimizer(num=args['num_particle'],
                                      dim=n_dim,
                                      drift=args['drift'],
                                      noise=args['noise'],
                                      temp=args['temperature'],
                                      timestep=args['timestep'],
                                      tol=args['tolerance'],
                                      seed=args['seed'],
                                      batch_avg=args['batch_avg'],
                                      avg_choice=args['avg_choice'],
                                      noise_choice=args['noise_choice'],
                                      device=device,
                                      lam_reg=args['reg'],
                                      batch_loss=args['batch_loss'])
        else:
            optimizer = MultiprocessCBO(num_process=args['num_processes'],
                                        rank=rank,
                                        num=args['num_particle'],
                                        dim=n_dim,
                                        drift=args['drift'],
                                        noise=args['noise'],
                                        temp=args['temperature'],
                                        timestep=args['timestep'],
                                        tol=args['tolerance'],
                                        seed=args['seed'],
                                        batch_avg=args['batch_avg'],
                                        avg_choice=args['avg_choice'],
                                        noise_choice=args['noise_choice'],
                                        device=device,
                                        lam_reg=args['reg'],
                                        batch_loss=args['batch_loss'])

        # start training
        if rank == 0:
            print("start training...")
        for j in range(args['repeat_time']):
            A = data[j]
            sig = label[j]
            b = observe[j]

            # for initial value
            optimizer._initialize_particles(j)
            X_0 = initial[j]
            gamma = math.sqrt(2.0 * args['reg'] / (L[j] + args['timestep']))
            print("L: ", L[j])
            print("gamma: ", gamma)
            loss = loss_cs(X_0, A, b, args['reg'])
            acc = acc_cs(X_0, sig)
            if rank == 0:
                print("FISTA done!")
                print("FISTA loss: ", loss)
                print("FISTA accuracy: ", acc)
            # sys.exit("sys exit: FISTA done!")

            # customize initialization for CBO
            # optimizer._custom_initialization(X_0, args['cs_std'])
            # weighted_avg = X_0.squeeze(1)
            weighted_avg = torch.zeros(X_0.size(0)).double()
            # synchronize(args['num_processes'])

            # optimizing...
            if rank == 0:
                print("training... | idx: {:d}".format(j))
            for epoch in range(args['epoch']):
                # CBO descent
                former = copy.deepcopy(weighted_avg)
                weighted_avg, noise_flag = optimizer.forward(
                    A, b, args['problem'], weighted_avg)

                # try projection operation after several CBO steps to preserve the structure
                if epoch > 0 and epoch % 5 == 0:
                    # projection
                    proj_index = (weighted_avg >= gamma).double()
                    weighted_avg = weighted_avg * proj_index
                    # optimizer.X = optimizer.X * proj_index
                    print("projection!", (weighted_avg < gamma).sum().item())

                # log recent results
                if noise_flag and rank == 0:
                    write_res(
                        './results/cs_log_loss_' + str(args['num_particle']) +
                        '.npy', log_loss)
                    write_res(
                        './results/cs_log_acc_' + str(args['num_particle']) +
                        '.npy', log_acc)

                # validation
                loss = loss_cs(weighted_avg.unsqueeze(1), A, b, args['reg'])
                acc = acc_cs(weighted_avg.unsqueeze(1), sig)
                log_loss.append(loss)
                log_acc.append(acc)
                if rank == 0:
                    print("loss: ", loss)
                    print("idx: {:d} | epoch: {:d} | error: {:f}".format(
                        j, epoch, acc))

                # check whether to reach the stopping criterion
                delta_error = torch.norm(weighted_avg - former)
                err = delta_error**2 / weighted_avg.size(0)
                # print("delta error: ", delta_error)
                if err <= args['tolerance_stop']:
                    if rank == 0:
                        print("Consensus!")
                    break

                synchronize(args['num_processes'])

            if rank == 0:
                print("idx / n_data : {:d} / {:d} is done".format(
                    j, args['repeat_time']))
            optimizer._initialize_particles(j + 1)
            synchronize(args['num_processes'])
        sys.exit("sys exit: break!")
    elif args['dataset'] == "mnist":
        # preparing data...
        trainloader = DataLoader(data_train,
                                 batch_size=args['batch_loss'],
                                 shuffle=True)
        testloader = DataLoader(data_test,
                                batch_size=args['batch_loss'],
                                shuffle=False)
        if args['problem'] == 'one_module':
            feature_dim = 784 * 10 + 10
        elif args['problem'] == 'two_module':
            feature_dim = 784 * 50 + 50 + 50 * 10 + 10
        weighted_avg = torch.zeros(1, feature_dim).double().to(device)

        # initialize CBO optimizer
        if args['num_processes'] == 1 or device == 'cuda':
            optimizer = CBO_optimizer(num=args['num_particle'],
                                      dim=feature_dim,
                                      drift=args['drift'],
                                      noise=args['noise'],
                                      temp=args['temperature'],
                                      timestep=args['timestep'],
                                      tol=args['tolerance'],
                                      seed=args['seed'],
                                      batch_avg=args['batch_avg'],
                                      avg_choice=args['avg_choice'],
                                      noise_choice=args['noise_choice'],
                                      device=device,
                                      lam_reg=args['reg'],
                                      batch_loss=args['batch_loss'])
        else:
            optimizer = MultiprocessCBO(num_process=args['num_processes'],
                                        rank=rank,
                                        num=args['num_particle'],
                                        dim=feature_dim,
                                        drift=args['drift'],
                                        noise=args['noise'],
                                        temp=args['temperature'],
                                        timestep=args['timestep'],
                                        tol=args['tolerance'],
                                        seed=args['seed'],
                                        batch_avg=args['batch_avg'],
                                        avg_choice=args['avg_choice'],
                                        noise_choice=args['noise_choice'],
                                        device=device,
                                        lam_reg=args['reg'],
                                        batch_loss=args['batch_loss'])

        # same initialization as weight initialization in PyTorch
        if args['problem'] == 'two_module':
            optimizer._kaiming_uniform_initialization()
        elif args['problem'] == 'one_module':
            optimizer._kaiming_1_uniform_initialization()

        # optimizing...
        if rank == 0:
            print("Training...")
        epo_flag = False
        time_start = time.time()
        for i in range(args['epoch']):
            for train_batch, (data, label) in enumerate(trainloader):
                if rank == 0:
                    t1 = time.time()
                former = copy.deepcopy(weighted_avg)
                data, label = data.to(device), label.to(device)
                weighted_avg, noise_flag = optimizer.forward(
                    data, label, args['problem'], weighted_avg)

                # log recent results
                if (noise_flag and rank == 0) or (i % 5 == 0):
                    write_res(
                        './results_mnist/log_loss_train_' +
                        str(args['num_particle']) + '.npy', log_loss_train)
                    # write_res('./results_mnist/log_loss_ave_train_' + str(args['num_particle']) + '.npy', log_loss_ave_train)
                    write_res(
                        './results_mnist/log_acc_train' +
                        str(args['num_particle']) + '.npy', log_acc_train)
                    write_res(
                        './results_mnist/nnew_noise_TM_log_loss_' +
                        str(args['num_particle']) + '.npy', log_loss)
                    write_res(
                        './results_mnist/nnew_noise_TM_log_acc_' +
                        str(args['num_particle']) + '.npy', log_acc)

                # synchronize(args['num_processes'])
                # print train loss & acc
                if args['problem'] == 'one_module':
                    loss_train, acc_train = OneModule_test(
                        data, label, weighted_avg, device)
                elif args['problem'] == 'two_module':
                    loss_train, acc_train = TwoModule_test(
                        data, label, weighted_avg, device)
                log_loss_train.append(loss_train)
                # log_loss_ave_train.append(train_loss_ave)
                log_acc_train.append(acc_train)
                if rank == 0:
                    print(
                        "epoch: {:d} | iter: {:d} | train loss: {:f} | train precision: {:f}"
                        .format(i, train_batch, loss_train.item(),
                                acc_train.item()))
                    t2 = time.time()
                    print("training time: {:f}".format(t2 - t1))

                # validation
                loss_sum = 0
                acc_sum = 0
                loss_ave = 0
                t3 = time.time()
                for val_batch, (data_val, label_val) in enumerate(testloader):
                    data_val, label_val = data_val.to(device), label_val.to(
                        device)
                    if args['problem'] == 'one_module':
                        loss_val, acc_val = OneModule_test(
                            data_val, label_val, weighted_avg, device)
                    elif args['problem'] == 'two_module':
                        loss_val, acc_val = TwoModule_test(
                            data_val, label_val, weighted_avg, device)
                    loss_sum += loss_val.item() * label_val.size(0)
                    acc_sum += acc_val.item() * label_val.size(0)
                loss_sum /= len(testloader.dataset)
                acc_sum /= len(testloader.dataset)
                log_loss.append(loss_sum)
                log_acc.append(acc_sum)
                t4 = time.time()
                if rank == 0:
                    # print("loss: ", loss)
                    print(
                        "epoch: {:d} | loss: {:f} | validation precision: {:f}"
                        .format(i, loss_sum, acc_sum))
                    print("test time: {:f}".format(t3 - t2))

                synchronize(args['num_processes'])

                # check whether to reach the stopping criterion
                delta_error = torch.norm(weighted_avg - former)
                err = delta_error**2 / weighted_avg.size(0)
                if err <= args['tolerance_stop']:
                    res_loss[0] = loss_sum
                    res_acc[0] = acc_sum
                    # print(former_avg)
                    # average, variance = var(former_avg)
                    # print(np.linalg.norm(former_avg), average, variance)
                    if rank == 0:
                        print("Consensus!")
                    epo_flag = True
                    break

            synchronize(args['num_processes'])

        if rank == 0:
            time_end = time.time()
            print('epoch time', time_end - time_start)
            # sys.exit("time!")
            write_res(
                './results_mnist/log_loss_train_' + str(args['num_particle']) +
                '.npy', log_loss_train)
            write_res(
                './results_mnist/log_acc_train' + str(args['num_particle']) +
                '.npy', log_acc_train)
            write_res(
                './results_mnist/nnew_noise_TM_log_loss_' +
                str(args['num_particle']) + '.npy', log_loss)
            write_res(
                './results_mnist/nnew_noise_TM_log_acc_' +
                str(args['num_particle']) + '.npy', log_acc)
            print("Results written!")
        sys.exit("break!")
    elif args['dataset'] == 'ras':
        # initialize...
        B = torch.tensor(args['B']).double()
        C = torch.tensor(args['C']).double()
        weighted_avg = torch.zeros(1, args['ras_dim']).double()

        # initialize CBO optimizer
        if args['num_processes'] == 1:
            optimizer = CBO_optimizer(num=args['num_particle'],
                                      dim=args['ras_dim'],
                                      drift=args['drift'],
                                      noise=args['noise'],
                                      temp=args['temperature'],
                                      timestep=args['timestep'],
                                      tol=args['tolerance'],
                                      seed=args['seed'],
                                      batch_avg=args['batch_avg'],
                                      avg_choice=args['avg_choice'],
                                      noise_choice=args['noise_choice'],
                                      device=device,
                                      lam_reg=args['reg'],
                                      batch_loss=args['batch_loss'])
        else:
            optimizer = MultiprocessCBO(num_process=args['num_processes'],
                                        rank=rank,
                                        num=args['num_particle'],
                                        dim=args['ras_dim'],
                                        drift=args['drift'],
                                        noise=args['noise'],
                                        temp=args['temperature'],
                                        timestep=args['timestep'],
                                        tol=args['tolerance'],
                                        seed=args['seed'],
                                        batch_avg=args['batch_avg'],
                                        avg_choice=args['avg_choice'],
                                        noise_choice=args['noise_choice'],
                                        device=device,
                                        lam_reg=args['reg'],
                                        batch_loss=args['batch_loss'])

        # optimizing...
        if rank == 0:
            print("Training...")
        log_gap_list = list()
        min_gap_list = list()
        success_sum = 0
        for times in range(args['repeat_time']):
            log_gap = list()
            set_random_seed(times)
            optimizer._uniform_initialization_particles(times)
            acc_flag = False
            min_gap = 1e5
            for i in range(args['epoch']):
                if rank == 0:
                    t1 = time.time()
                former = copy.deepcopy(weighted_avg)
                weighted_avg, noise_flag = optimizer.forward(
                    B, C, args['problem'], weighted_avg)
                flag_acc = 0

                if rank == 0:
                    t2 = time.time()

                # validation
                pred = rastrigin(weighted_avg, B, C)
                gap = torch.log(abs(pred - C))
                log_gap.append(gap)
                min_gap = min(min_gap, torch.norm(weighted_avg - B)**2)
                if (abs(weighted_avg - B) < 0.25).all():
                    flag_acc = 1
                    acc_flag = True
                if rank == 0:
                    # print("loss: ", loss)
                    print(
                        "time: {:d} | epoch: {:d} | loss: {:f} | success: {:d}"
                        .format(times, i, gap.item(), flag_acc))

                synchronize(args['num_processes'])

                if rank == 0:
                    t3 = time.time()
                    print("training time: {:f}".format(t2 - t1))
                    print("test time: {:f}".format(t3 - t2))
                # if pred <= 0 or flag_acc:
                #     break
                #     print(weighted_avg)

            log_gap_list.append(log_gap)
            min_gap_list.append(min_gap)
            success_sum += flag_acc
            if rank == 0:
                if acc_flag:
                    print("success!")
                else:
                    print("failed!")
                write_res(
                    './results/NNCBO_rastrigin_log_loss_' +
                    str(args['num_particle']) + '_' + str(args['ras_dim']) +
                    '.npy', log_gap_list)
                write_res(
                    './results/NNCBO_rastrigin_min_gap_' +
                    str(args['num_particle']) + '_' + str(args['ras_dim']) +
                    '.npy', min_gap_list)
                print("Results written!")
        print(success_sum)
        sys.exit("break!")
Example #13
async def main() -> None:
    semaphore = asyncio.Semaphore(value=3)
    await asyncio.gather(
        *(synchronize(semaphore)(fetch)(file, checkpoint=10000)
          for file in glob.glob(_path("../data/*.pgn"))))
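
Unlike the distributed-training examples, this one uses `synchronize` as an asyncio concurrency limiter: `synchronize(semaphore)(fetch)` returns a wrapped coroutine function that acquires the semaphore before running, so at most three fetches run at once. A minimal sketch of such a decorator factory is:

import asyncio
import functools

def synchronize(semaphore: asyncio.Semaphore):
    # Sketch only: limit how many wrapped coroutines run concurrently.
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            async with semaphore:
                return await func(*args, **kwargs)
        return wrapper
    return decorator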
Example #14
    def training(self):
        self.seg_net.train()
        self.generator.train()
        self.feature_extracted.eval()
        for param in self.feature_extracted.parameters():
            param.requires_grad = False

        save_to_disk = ptutil.get_rank() == 0
        start_training_time = time.time()
        trained_time = 0
        best_miou = 0
        mean = torch.tensor([0.485, 0.456,
                             0.406]).float().cuda().view(1, 3, 1, 1)
        std = torch.tensor([0.229, 0.224,
                            0.225]).float().cuda().view(1, 3, 1, 1)
        tic = time.time()
        end = time.time()
        iteration, max_iter = 0, self.max_iter
        save_iter, eval_iter = self.per_iter * self.config.TRAIN.SAVE_EPOCH, self.per_iter * self.config.TRAIN.EVAL_EPOCH
        # save_iter, eval_iter = 10, 10
        self.logger.info(
            "Start training, total epochs {:3d} = total iteration: {:6d}".
            format(self.config.TRAIN.EPOCHS, max_iter))
        for i, (source_image, label) in enumerate(self.train_loader):
            iteration += 1
            self.scheduler.step()
            # self.optimizer.zero_grad()
            self.gen_scheduler.step()
            # self.gen_optimizer.zero_grad()
            source_image, label = source_image.to(self.device,
                                                  dtype=self.dtype), label.to(
                                                      self.device)
            try:
                _, batch = next(self.target_trainloader_iter)
            except StopIteration:
                self.target_trainloader_iter = enumerate(self.target_loader)
                _, batch = next(self.target_trainloader_iter)
            target_image = batch.to(self.device, dtype=self.dtype)
            if self.config.DATASET.IMG_TRANSFORM is False:
                source_image = source_image.permute(0, 3, 1, 2)
                target_image = target_image.permute(0, 3, 1, 2)
                source_image_norm = (((source_image / 255) - mean) / std)
                target_image_norm = (((target_image / 255) - mean) / std)
            else:
                source_image_norm = source_image
                target_image_norm = target_image
            source_feature = self.feature_extracted(source_image_norm)
            target_feature = self.feature_extracted(target_image_norm)

            target_feature_mean = torch.mean(target_feature, (2, 3),
                                             keepdim=True)
            target_feature_var = torch.std(target_feature, (2, 3),
                                           keepdim=True)
            source_feature_mean = torch.mean(source_feature, (2, 3),
                                             keepdim=True)
            source_feature_var = torch.std(source_feature, (2, 3),
                                           keepdim=True)

            adain_feature = (
                (source_feature - source_feature_mean) /
                (source_feature_var + 0.00001)) * (
                    target_feature_var + 0.00001) + target_feature_mean
            gen_image_norm = self.generator(adain_feature)
            gen_image = ((gen_image_norm * std) + mean) * 255

            gen_image_feature = self.feature_extracted(gen_image_norm)
            gen_image_feature_mean = torch.mean(gen_image_feature, (2, 3),
                                                keepdim=True)
            gen_image_feature_var = torch.std(gen_image_feature, (2, 3),
                                              keepdim=True)
            # gen_image_feature is matched to adain_feature, and its per-channel
            # mean/std are matched to the target statistics
            # (a standalone AdaIN sketch follows this example)
            loss_feature_dict = self.gen_criterion(gen_image_feature,
                                                   adain_feature)
            loss_mean_dict = self.gen_criterion(gen_image_feature_mean,
                                                target_feature_mean)
            loss_var_dict = self.gen_criterion(gen_image_feature_var,
                                               target_feature_var)

            loss_feature = sum(loss for loss in loss_feature_dict.values())
            loss_feature_dict_reduced = ptutil.reduce_loss_dict(
                loss_feature_dict)
            loss_feature_reduced = sum(
                loss for loss in loss_feature_dict_reduced.values())

            loss_mean = sum(loss for loss in loss_mean_dict.values())
            loss_mean_dict_reduced = ptutil.reduce_loss_dict(loss_mean_dict)
            loss_mean_reduced = sum(
                loss for loss in loss_mean_dict_reduced.values())

            loss_var = sum(loss for loss in loss_var_dict.values())
            loss_var_dict_reduced = ptutil.reduce_loss_dict(loss_var_dict)
            loss_var_reduced = sum(loss
                                   for loss in loss_var_dict_reduced.values())

            loss_gen = loss_feature + loss_mean + loss_var
            # train source image
            outputs = self.seg_net(source_image)
            source_seg_loss_dict = self.criterion(outputs, label)
            # train gen image
            gen_outputs = self.seg_net(gen_image)
            gen_seg_loss_dict = self.criterion(gen_outputs, label)
            outputs = outputs.detach()
            kl_loss_dict = self.kl_criterion(gen_outputs, outputs)

            # reduce losses over all GPUs for logging purposes
            source_seg_loss_dict_reduced = ptutil.reduce_loss_dict(
                source_seg_loss_dict)
            # print(type(loss_dict_reduced))
            source_seg_losses_reduced = sum(
                loss for loss in source_seg_loss_dict_reduced.values())
            source_seg_loss = sum(loss
                                  for loss in source_seg_loss_dict.values())
            # source_seg_loss.backward()
            gen_seg_loss_dict_reduced = ptutil.reduce_loss_dict(
                gen_seg_loss_dict)
            gen_seg_losses_reduced = sum(
                loss for loss in gen_seg_loss_dict_reduced.values())
            gen_seg_loss = sum(loss for loss in gen_seg_loss_dict.values())
            kl_loss_dict_reduced = ptutil.reduce_loss_dict(kl_loss_dict)
            kl_losses_reduced = sum(loss
                                    for loss in kl_loss_dict_reduced.values())
            kl_loss = sum(loss for loss in kl_loss_dict.values())
            loss_seg = source_seg_loss + gen_seg_loss + kl_loss * 10
            # loss_seg.backward(retain_graph=True)
            # loss = loss_gen + loss_seg
            # loss.backward()
            if config.TRAIN.MIXED_PRECISION:
                with amp.scale_loss(loss_gen, self.gen_optimizer,
                                    loss_id=1) as errGen_scale:
                    errGen_scale.backward()
                with amp.scale_loss(loss_seg, self.optimizer,
                                    loss_id=2) as errSeg_scale:
                    errSeg_scale.backward()
            else:
                loss = loss_gen + loss_seg
                loss.backward()

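            # accumulate gradients for 8 iterations before stepping and resetting both optimizers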
            if iteration % 8 == 0:
                self.optimizer.step()
                self.gen_optimizer.step()
                self.optimizer.zero_grad()
                self.gen_optimizer.zero_grad()
            trained_time += time.time() - end
            end = time.time()
            if iteration % self.config.TRAIN.LOG_STEP == 0:
                eta_seconds = int(
                    (trained_time / iteration) * (max_iter - iteration))
                log_str = [
                    "Iteration {:06d} , Lr: {:.5f}, Cost: {:.2f}s, Eta: {}".
                    format(iteration, self.optimizer.param_groups[0]['lr'],
                           time.time() - tic,
                           str(datetime.timedelta(seconds=eta_seconds))),
                    "source_seg_loss: {:.6f}, gen_seg_loss:{:.6f}, kl_loss:{:.6f}"
                    .format(source_seg_losses_reduced.item(),
                            gen_seg_losses_reduced.item(),
                            kl_losses_reduced.item() * 10),
                    "feature_loss:{:.6f}, mean_loss:{:.6f}, var_loss:{:.6f}".
                    format(loss_feature_reduced.item(),
                           loss_mean_reduced.item(), loss_var_reduced.item())
                ]
                log_str = ', '.join(log_str)
                self.logger.info(log_str)
                tic = time.time()
            if save_to_disk and iteration % save_iter == 0:
                model_path = os.path.join(
                    self.seg_dir, "{}_{}_{}_iter_{:06d}.pth".format(
                        self.config.MODEL.SEG_NET, self.config.TRAIN.SEG_LOSS,
                        self.config.DATASET.NAME, iteration))
                # self.save_model(model_path)
                ptutil.save_model(self.seg_net, model_path, self.logger)
                generator_path = os.path.join(
                    self.generator_dir, '{}_{}_{}_iter_{:06d}.pth'.format(
                        self.config.MODEL.TARGET_GENERATOR,
                        self.config.TRAIN.SEG_LOSS, self.config.DATASET.NAME,
                        iteration))
                # self.save_model_generator(generator_path)
                ptutil.save_model(self.generator, generator_path, self.logger)
            # Evaluate during training to track metric changes and see whether performance improved
            if self.config.TRAIN.EVAL_EPOCH > 0 and iteration % eval_iter == 0 and not iteration == max_iter:
                metrics = ptutil.validate(self.seg_net, self.valid_loader,
                                          self.metric, self.device,
                                          self.config)
                ptutil.synchronize()
                pixAcc, mIoU = ptutil.accumulate_metric(metrics)
                if mIoU is not None and mIoU >= best_miou:
                    best_miou = mIoU
                    model_path = os.path.join(
                        self.seg_dir,
                        "{}_{}_{}_best.pth".format(self.config.MODEL.SEG_NET,
                                                   self.config.TRAIN.SEG_LOSS,
                                                   self.config.DATASET.NAME))
                    ptutil.save_model(self.seg_net, model_path, self.logger)
                    generator_path = os.path.join(
                        self.generator_dir, '{}_{}_{}_best.pth'.format(
                            self.config.MODEL.TARGET_GENERATOR,
                            self.config.TRAIN.SEG_LOSS,
                            self.config.DATASET.NAME))
                    ptutil.save_model(self.generator, generator_path,
                                      self.logger)
                if pixAcc is not None:
                    self.logger.info('pixAcc: {:.4f}, mIoU: {:.4f}'.format(
                        pixAcc, mIoU))
                self.seg_net.train()
        if save_to_disk:
            model_path = os.path.join(
                self.seg_dir,
                "{}_{}_{}_iter_{:06d}.pth".format(self.config.TRAIN.SEG_NET,
                                                  self.config.TRAIN.SEG_LOSS,
                                                  self.config.DATASET.NAME,
                                                  max_iter))
            ptutil.save_model(self.seg_net, model_path, self.logger)
            generator_path = os.path.join(
                self.generator_dir, '{}_{}_{}_iter_{:06d}.pth'.format(
                    self.config.MODEL.TARGET_GENERATOR,
                    self.config.TRAIN.SEG_LOSS, self.config.DATASET.NAME,
                    max_iter))
            ptutil.save_model(self.generator, generator_path, self.logger)
        # compute training time
        total_training_time = int(time.time() - start_training_time)
        total_time_str = str(datetime.timedelta(seconds=total_training_time))
        self.logger.info("Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / max_iter))
        # eval after training
        if not self.config.TRAIN.SKIP_EVAL:
            metrics = ptutil.validate(self.seg_net, self.valid_loader,
                                      self.metric, self.device, self.config)
            ptutil.synchronize()
            pixAcc, mIoU = ptutil.accumulate_metric(metrics)
            if pixAcc is not None:
                self.logger.info(
                    'After training, pixAcc: {:.4f}, mIoU: {:.4f}'.format(
                        pixAcc, mIoU))
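The feature alignment above is the AdaIN recipe: whiten the source features with their own per-channel statistics, then re-color them with the target statistics. A self-contained sketch of that step (the 1e-5 epsilon mirrors the constant used above):

import torch

def adain(source_feature: torch.Tensor, target_feature: torch.Tensor,
          eps: float = 1e-5) -> torch.Tensor:
    # per-channel statistics over the spatial dimensions
    src_mean = source_feature.mean(dim=(2, 3), keepdim=True)
    src_std = source_feature.std(dim=(2, 3), keepdim=True)
    tgt_mean = target_feature.mean(dim=(2, 3), keepdim=True)
    tgt_std = target_feature.std(dim=(2, 3), keepdim=True)
    normalized = (source_feature - src_mean) / (src_std + eps)
    return normalized * (tgt_std + eps) + tgt_mean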
Example #15
0
    def training(self):
        self.net.train()
        save_to_disk = ptutil.get_rank() == 0
        start_training_time = time.time()
        trained_time = 0
        tic = time.time()
        end = time.time()
        iteration, max_iter = 0, self.args.max_iter
        save_iter, eval_iter = self.args.per_iter * self.args.save_epoch, self.args.per_iter * self.args.eval_epoch
        # save_iter, eval_iter = self.args.per_iter * self.args.save_epoch, 10

        logger.info("Start training, total epochs {:3d} = total iteration: {:6d}".format(self.args.epochs, max_iter))

        # TODO: add mixup
        for i, batch in enumerate(self.train_loader):
            iteration += 1
            self.scheduler.step()
            image = batch[0].to(self.device)
            fixed_targets = [batch[it].to(self.device) for it in range(1, 6)]
            gt_boxes = batch[6].to(self.device)

            self.optimizer.zero_grad()
            loss_dict = self.net(image, gt_boxes, *fixed_targets)
            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = ptutil.reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss = sum(loss for loss in loss_dict.values())
            loss.backward()
            self.optimizer.step()
            trained_time += time.time() - end
            end = time.time()
            if iteration % self.args.log_step == 0:
                eta_seconds = int((trained_time / iteration) * (max_iter - iteration))
                log_str = ["Iteration {:06d} , Lr: {:.5f}, Cost: {:.2f}s, Eta: {}"
                               .format(iteration, self.optimizer.param_groups[0]['lr'], time.time() - tic,
                                       str(datetime.timedelta(seconds=eta_seconds))),
                           "total_loss: {:.3f}".format(losses_reduced.item())]
                for loss_name, loss_item in loss_dict_reduced.items():
                    log_str.append("{}: {:.3f}".format(loss_name, loss_item.item()))
                log_str = ', '.join(log_str)
                logger.info(log_str)
                tic = time.time()
            if save_to_disk and iteration % save_iter == 0:
                model_path = os.path.join(self.args.save_dir, "{}_iter_{:06d}.pth"
                                          .format(self.save_prefix, iteration))
                self.save_model(model_path)
            # Evaluate during training to track mAP changes and see whether performance improved
            if self.args.eval_epoch > 0 and iteration % eval_iter == 0 and not iteration == max_iter:
                metrics = self.validate()
                ptutil.synchronize()
                names, values = ptutil.accumulate_metric(metrics)
                if names is not None:
                    log_str = ['{}: {:.5f}'.format(k, v) for k, v in zip(names, values)]
                    log_str = '\n'.join(log_str)
                    logger.info(log_str)
                self.net.train()
        if save_to_disk:
            model_path = os.path.join(self.args.save_dir, "{}_iter_{:06d}.pth"
                                      .format(self.save_prefix, max_iter))
            self.save_model(model_path)

        # compute training time
        total_training_time = int(time.time() - start_training_time)
        total_time_str = str(datetime.timedelta(seconds=total_training_time))
        logger.info(
            "Total training time: {} ({:.4f} s / it)".format(total_time_str, total_training_time / max_iter))
Example #16
0
    def training(self):
        self.net.train()
        save_to_disk = ptutil.get_rank() == 0
        start_training_time = time.time()
        trained_time = 0
        tic = time.time()
        end = time.time()
        iteration, max_iter = 0, self.args.max_iter
        save_iter, eval_iter = self.args.per_iter * self.args.save_epoch, self.args.per_iter * self.args.eval_epochs
        # save_iter, eval_iter = 10, 10

        logger.info(
            "Start training, total epochs {:3d} = total iteration: {:6d}".
            format(self.args.epochs, max_iter))

        for i, (image, target) in enumerate(self.train_loader):
            iteration += 1
            self.scheduler.step()
            self.optimizer.zero_grad()
            image, target = image.to(self.device), target.to(self.device)
            outputs = self.net(image)
            loss_dict = self.criterion(outputs, target)
            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = ptutil.reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss = sum(loss for loss in loss_dict.values())
            loss.backward()
            self.optimizer.step()
            trained_time += time.time() - end
            end = time.time()
            if iteration % self.args.log_step == 0:
                eta_seconds = int(
                    (trained_time / iteration) * (max_iter - iteration))
                log_str = [
                    "Iteration {:06d} , Lr: {:.5f}, Cost: {:.2f}s, Eta: {}".
                    format(iteration, self.optimizer.param_groups[0]['lr'],
                           time.time() - tic,
                           str(datetime.timedelta(seconds=eta_seconds))),
                    "total_loss: {:.3f}".format(losses_reduced.item())
                ]
                log_str = ', '.join(log_str)
                logger.info(log_str)
                tic = time.time()
            if save_to_disk and iteration % save_iter == 0:
                model_path = os.path.join(
                    self.args.save_dir,
                    "{}_iter_{:06d}.pth".format('LEDNet', iteration))
                self.save_model(model_path)
            # Evaluate during training to track metric changes and see whether performance improved
            if self.args.eval_epochs > 0 and iteration % eval_iter == 0 and not iteration == max_iter:
                metrics = self.validate()
                ptutil.synchronize()
                pixAcc, mIoU = ptutil.accumulate_metric(metrics)
                if pixAcc is not None:
                    logger.info('pixAcc: {:.4f}, mIoU: {:.4f}'.format(
                        pixAcc, mIoU))
                self.net.train()
        if save_to_disk:
            model_path = os.path.join(
                self.args.save_dir,
                "{}_iter_{:06d}.pth".format('LEDNet', max_iter))
            self.save_model(model_path)
        # compute training time
        total_training_time = int(time.time() - start_training_time)
        total_time_str = str(datetime.timedelta(seconds=total_training_time))
        logger.info("Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / max_iter))
        # eval after training
        if not self.args.skip_eval:
            metrics = self.validate()
            ptutil.synchronize()
            pixAcc, mIoU = ptutil.accumulate_metric(metrics)
            if pixAcc is not None:
                logger.info(
                    'After training, pixAcc: {:.4f}, mIoU: {:.4f}'.format(
                        pixAcc, mIoU))
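The ptutil.synchronize / utils.synchronize calls in these trainers are assumed to be a plain distributed barrier that is a no-op outside multi-process runs. A minimal sketch:

import torch.distributed as dist

def synchronize():
    # do nothing when not running under torch.distributed
    if not dist.is_available() or not dist.is_initialized():
        return
    if dist.get_world_size() == 1:
        return
    dist.barrier()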
Example #17
0
def train(is_dist, start_epoch, local_rank):
    transforms = transform.build_transforms()
    coco_dataset = dataset.COCODataset(is_train=True, transforms=transforms)
    if (is_dist):
        sampler = distributedGroupSampler(coco_dataset)
    else:
        sampler = groupSampler(coco_dataset)
    dataloader = build_dataloader(coco_dataset, sampler)

    batch_time_meter = utils.AverageMeter()
    cls_loss_meter = utils.AverageMeter()
    reg_loss_meter = utils.AverageMeter()
    losses_meter = utils.AverageMeter()

    model = retinanet(is_train=True)
    if (start_epoch == 1):
        model.resnet.load_pretrained(pretrained_path[cfg.resnet_depth])
    else:
        utils.load_model(model, start_epoch - 1)
    model = model.cuda()

    if is_dist:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[
                local_rank,
            ],
            output_device=local_rank,
            broadcast_buffers=False)
    optimizer = solver.build_optimizer(model)
    scheduler = solver.scheduler(optimizer)

    model.train()
    logs = []

    for epoch in range(start_epoch, cfg.max_epochs + 1):
        if is_dist:
            dataloader.sampler.set_epoch(epoch - 1)
        scheduler.lr_decay(epoch)

        end_time = time.time()
        for iteration, datas in enumerate(dataloader, 1):
            scheduler.linear_warmup(epoch, iteration - 1)
            images = datas["images"]
            bboxes = datas["bboxes"]
            labels = datas["labels"]
            res_img_shape = datas["res_img_shape"]
            pad_img_shape = datas["pad_img_shape"]

            images = images.cuda()
            bboxes = [bbox.cuda() for bbox in bboxes]
            labels = [label.cuda() for label in labels]

            loss_dict = model(images,
                              gt_bboxes=bboxes,
                              gt_labels=labels,
                              res_img_shape=res_img_shape,
                              pad_img_shape=pad_img_shape)
            cls_loss = loss_dict["cls_loss"]
            reg_loss = loss_dict["reg_loss"]

            losses = cls_loss + reg_loss
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            batch_time_meter.update(time.time() - end_time)
            end_time = time.time()

            cls_loss_meter.update(cls_loss.item())
            reg_loss_meter.update(reg_loss.item())
            losses_meter.update(losses.item())

            if (iteration % 50 == 0):
                if (local_rank == 0):
                    res = "\t".join([
                        "Epoch: [%d/%d]" % (epoch, cfg.max_epochs),
                        "Iter: [%d/%d]" % (iteration, len(dataloader)),
                        "Time: %.3f (%.3f)" %
                        (batch_time_meter.val, batch_time_meter.avg),
                        "cls_loss: %.4f (%.4f)" %
                        (cls_loss_meter.val, cls_loss_meter.avg),
                        "reg_loss: %.4f (%.4f)" %
                        (reg_loss_meter.val, reg_loss_meter.avg),
                        "Loss: %.4f (%.4f)" %
                        (losses_meter.val, losses_meter.avg),
                        "lr: %.6f" % (optimizer.param_groups[0]["lr"]),
                    ])
                    print(res)
                    logs.append(res)
                batch_time_meter.reset()
                cls_loss_meter.reset()
                reg_loss_meter.reset()
                losses_meter.reset()
        if (local_rank == 0):
            utils.save_model(model, epoch)
        if (is_dist):
            utils.synchronize()

    if (local_rank == 0):
        with open("logs.txt", "w") as f:
            for i in logs:
                f.write(i + "\n")
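utils.AverageMeter is assumed to track the latest value and the running average of a scalar such as a loss. A minimal sketch:

class AverageMeter:
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # record the latest value and update the running average
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count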