def runExperiment():
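    # Full training run: seed RNGs from the model tag, build the data loader, model,
    # optimizer, and scheduler, optionally resume, then train and evaluate every epoch,
    # checkpointing each epoch and copying to *_best.pt when the pivot metric improves.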
    seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset['train'])
    data_loader = make_data_loader(dataset)
    model = eval('models.{}().to(cfg["device"])'.format(cfg['model_name']))
    optimizer = make_optimizer(model)
    scheduler = make_scheduler(optimizer)
    if cfg['resume_mode'] == 1:
        last_epoch, model, optimizer, scheduler, logger = resume(
            model, cfg['model_tag'], optimizer, scheduler)
    elif cfg['resume_mode'] == 2:
        last_epoch = 1
        _, model, _, _, _ = resume(model, cfg['model_tag'])
        current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
        logger_path = 'output/runs/{}_{}'.format(cfg['model_tag'],
                                                 current_time)
        logger = Logger(logger_path)
    else:
        last_epoch = 1
        current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
        logger_path = 'output/runs/train_{}_{}'.format(cfg['model_tag'],
                                                       current_time)
        logger = Logger(logger_path)
    if cfg['world_size'] > 1:
        model = torch.nn.DataParallel(model,
                                      device_ids=list(range(
                                          cfg['world_size'])))
    for epoch in range(last_epoch, cfg['num_epochs'] + 1):
        logger.safe(True)
        train(data_loader['train'], model, optimizer, logger, epoch)
        test(data_loader['train'], model, logger, epoch)
        if cfg['scheduler_name'] == 'ReduceLROnPlateau':
            scheduler.step(
                metrics=logger.mean['test/{}'.format(cfg['pivot_metric'])])
        else:
            scheduler.step()
        logger.safe(False)
        model_state_dict = model.module.state_dict(
        ) if cfg['world_size'] > 1 else model.state_dict()
        save_result = {
            'cfg': cfg,
            'epoch': epoch + 1,
            'model_dict': model_state_dict,
            'optimizer_dict': optimizer.state_dict(),
            'scheduler_dict': scheduler.state_dict(),
            'logger': logger
        }
        save(save_result,
             './output/model/{}_checkpoint.pt'.format(cfg['model_tag']))
        if cfg['pivot'] > logger.mean['test/{}'.format(cfg['pivot_metric'])]:
            cfg['pivot'] = logger.mean['test/{}'.format(cfg['pivot_metric'])]
            shutil.copy(
                './output/model/{}_checkpoint.pt'.format(cfg['model_tag']),
                './output/model/{}_best.pt'.format(cfg['model_tag']))
        logger.reset()
    logger.safe(False)
    return
Example #2
def runExperiment():
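    # Evaluation-only run: match the train batch size to the test batch size, load the
    # best checkpoint, run test(), then bundle the train and test loggers into a result file.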
    cfg['batch_size']['train'] = cfg['batch_size']['test']
    seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset)
    model = eval(
        'models.{}(model_rate=cfg["global_model_rate"]).to(cfg["device"])'
        .format(cfg['model_name']))
    last_epoch, data_split, label_split, model, _, _, _ = resume(
        model, cfg['model_tag'], load_tag='best', strict=False)
    current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
    logger_path = 'output/runs/test_{}_{}'.format(cfg['model_tag'],
                                                  current_time)
    test_logger = Logger(logger_path)
    test_logger.safe(True)
    test(dataset['test'], model, test_logger, last_epoch)
    test_logger.safe(False)
    _, _, _, _, _, _, train_logger = resume(model,
                                            cfg['model_tag'],
                                            load_tag='checkpoint',
                                            strict=False)
    save_result = {
        'cfg': cfg,
        'epoch': last_epoch,
        'logger': {
            'train': train_logger,
            'test': test_logger
        }
    }
    save(save_result, './output/result/{}.pt'.format(cfg['model_tag']))
    return
Example #3
def runExperiment():
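    # Federated training run: split the dataset across cfg['num_users'] clients, wrap the
    # global parameters in a Federation, then alternate local training and global testing,
    # checkpointing after each global epoch.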
    seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset)
    model = eval('models.{}(model_rate=cfg["global_model_rate"]).to(cfg["device"])'.format(cfg['model_name']))
    optimizer = make_optimizer(model, cfg['lr'])
    scheduler = make_scheduler(optimizer)
    if cfg['resume_mode'] == 1:
        last_epoch, data_split, label_split, model, optimizer, scheduler, logger = resume(model, cfg['model_tag'],
                                                                                          optimizer, scheduler)
    elif cfg['resume_mode'] == 2:
        last_epoch = 1
        _, data_split, label_split, model, _, _, _ = resume(model, cfg['model_tag'])
        current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
        logger_path = 'output/runs/{}_{}'.format(cfg['model_tag'], current_time)
        logger = Logger(logger_path)
    else:
        last_epoch = 1
        data_split, label_split = split_dataset(dataset, cfg['num_users'], cfg['data_split_mode'])
        current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
        logger_path = 'output/runs/train_{}_{}'.format(cfg['model_tag'], current_time)
        logger = Logger(logger_path)
    if data_split is None:
        data_split, label_split = split_dataset(dataset, cfg['num_users'], cfg['data_split_mode'])
    global_parameters = model.state_dict()
    federation = Federation(global_parameters, cfg['model_rate'], label_split)
    for epoch in range(last_epoch, cfg['num_epochs']['global'] + 1):
        logger.safe(True)
        train(dataset['train'], data_split['train'], label_split, federation, model, optimizer, logger, epoch)
        test_model = stats(dataset['train'], model)
        test(dataset['test'], data_split['test'], label_split, test_model, logger, epoch)
        if cfg['scheduler_name'] == 'ReduceLROnPlateau':
            scheduler.step(metrics=logger.mean['train/{}'.format(cfg['pivot_metric'])])
        else:
            scheduler.step()
        logger.safe(False)
        model_state_dict = model.state_dict()
        save_result = {
            'cfg': cfg, 'epoch': epoch + 1, 'data_split': data_split, 'label_split': label_split,
            'model_dict': model_state_dict, 'optimizer_dict': optimizer.state_dict(),
            'scheduler_dict': scheduler.state_dict(), 'logger': logger}
        save(save_result, './output/model/{}_checkpoint.pt'.format(cfg['model_tag']))
        if cfg['pivot'] < logger.mean['test/{}'.format(cfg['pivot_metric'])]:
            cfg['pivot'] = logger.mean['test/{}'.format(cfg['pivot_metric'])]
            shutil.copy('./output/model/{}_checkpoint.pt'.format(cfg['model_tag']),
                        './output/model/{}_best.pt'.format(cfg['model_tag']))
        logger.reset()
    logger.safe(False)
    return
Example #4
def runExperiment():
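    # Generation run: load the best model checkpoint (plus a pretrained autoencoder for
    # PixelCNN models) and call generate().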
    seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset['train'])
    if 'pixelcnn' in cfg['model_name']:
        ae = eval('models.{}().to(cfg["device"])'.format(cfg['ae_name']))
        _, ae, _, _, _ = resume(ae, cfg['ae_tag'], load_tag='best')
    else:
        ae = None
    model = eval('models.{}().to(cfg["device"])'.format(cfg['model_name']))
    _, model, _, _, _ = resume(model, cfg['model_tag'], load_tag='best')
    generate(model, ae)
    return
Example #5
def run_test(args_dict):
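    # Test-only entry point: choose CPU/GPU, rebuild the model, resume the checkpoint, and
    # run validation on the test data loader.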

    print("Start test of model...")

    # Set up device
    if torch.cuda.is_available():
        args_dict.device = torch.device("cuda:0")  # you can continue going on here, like cuda:1 cuda:2....etc.
        print("Running on the GPU")
    else:
        args_dict.device = torch.device("cpu")
        print("Running on the CPU")

    args_dict.resume = True
    # Define model
    if args_dict.model == "covidnet":
        model = CovidNet(args_dict.n_classes)
    else:
        model = ResNet(args_dict.n_classes)

    model.to(args_dict.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args_dict.lr)

    best_sensit, model, optimizer = utils.resume(args_dict, model, optimizer)

    dl_test = calculateDataLoaderTest(args_dict)

    valEpoch(args_dict, dl_test, model)
Example #6
def resume(id):
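    # Copy the checkpoint paths returned by utils.resume(id) into the global config.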
    fpath = utils.resume(id)
    config["g_dir"] = fpath["g_dir"]
    config["goptim_dir"] = fpath["goptim_dir"]
    config["a_dir"] = fpath["a_dir"]
    config["aoptim_dir"] = fpath["aoptim_dir"]
    config["b_dir"] = fpath["b_dir"]
    config["boptim_dir"] = fpath["boptim_dir"]
Example #7
def runExperiment():
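    # Load the best checkpoint and hand the model to transit().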
    seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset['train'])
    model = eval('models.{}().to(cfg["device"])'.format(cfg['model_name']))
    _, model, _, _, _ = resume(model, cfg['model_tag'], load_tag='best')
    transit(model)
    return
Example #8
def runExperiment():
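    # Evaluate a trained model together with its autoencoder: load the best checkpoints for
    # both, run test() on the train loader, and save the logger as a result file.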
    seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset['train'])
    data_loader = make_data_loader(dataset)
    ae = eval('models.{}().to(cfg["device"])'.format(cfg['ae_name']))
    _, ae, _, _, _ = resume(ae, cfg['ae_tag'], load_tag='best')
    model = eval('models.{}().to(cfg["device"])'.format(cfg['model_name']))
    load_tag = 'best'
    last_epoch, model, _, _, _ = resume(model, cfg['model_tag'], load_tag=load_tag)
    current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
    logger_path = 'output/runs/train_{}_{}'.format(cfg['model_tag'], current_time)
    logger = Logger(logger_path)
    logger.safe(True)
    test(data_loader['train'], ae, model, logger, last_epoch)
    logger.safe(False)
    save_result = {'cfg': cfg, 'epoch': last_epoch, 'logger': logger}
    save(save_result, './output/result/{}.pt'.format(cfg['model_tag']))
    return
Example #9
def run_gradcam(
    args_dict
):  # call this function to get the gradcam pictures and output - only for one picture
    print("Start test of model...")
    args_dict.batch = 1

    # Set up device
    if torch.cuda.is_available():
        args_dict.device = torch.device(
            "cuda:0"
        )  # use cuda:1, cuda:2, etc. to target other GPUs
        print("Running on the GPU")
    else:
        args_dict.device = torch.device("cpu")
        print("Running on the CPU")

    args_dict.resume = True
    # Define model
    if args_dict.model == "covidnet":
        model = CovidNet(args_dict.n_classes)
    else:
        model = ResNet(args_dict.n_classes)

    model.to(args_dict.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args_dict.lr)

    best_sensit, model, optimizer = utils.resume(args_dict, model, optimizer)

    dl_test = eval.calculateDataLoaderTest(args_dict)
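    # advance the loader to batch index 2 and keep that batch for the Grad-CAM visualization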
    for batch_idx, (x_batch, y_batch, _) in enumerate(dl_test):
        x_batch, y_batch = x_batch.to(args_dict.device), y_batch.to(
            args_dict.device)
        if batch_idx == 2:
            break

    output = model(x_batch)
    pred = np.argmax(output.cpu().data.numpy(), axis=1)

    if args_dict.model == 'resnet':
        heatmap, image = grad_cam(model, x_batch)
    elif args_dict.model == 'covidnet':
        heatmap, image = grad_cam_covid(model, x_batch, output, pred)

    # plt.imshow(image, interpolation='nearest')
    print(heatmap.shape)
    print(image.shape)
    plt.imshow(heatmap)
    plt.show()

    return heatmap, image, output
Example #10
def run_calibration(args_dict):
    """
    Apply temperature scaling for calibration and save the new model.
    """

    print("Start calibration of model...")

    args_dict.resume = True

    # Define model
    if args_dict.model == "covidnet":
        model = CovidNet(args_dict.n_classes)
    elif args_dict.model == "resnet":
        model = ResNet(args_dict.n_classes)

    # Set up device
    if torch.cuda.is_available():
        args_dict.device = torch.device(
            "cuda:0"
        )  # use cuda:1, cuda:2, etc. to target other GPUs
        print("Running on the GPU")
    else:
        args_dict.device = torch.device("cpu")
        print("Running on the CPU")

    model.to(args_dict.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args_dict.lr)

    dl_test = eval.calculateDataLoaderTest(args_dict)

    _, model, _ = utils.resume(args_dict, model, optimizer)
    # model.eval()

    scaled_model = ModelWithTemperature(model)
    scaled_model.set_temperature(dl_test, args_dict.device)

    print("saving calibrated model")
    utils.save_model(
        args_dict, {
            'epoch': args_dict.start_epoch,
            'state_dict': scaled_model.state_dict(),
            'optimizer': optimizer.state_dict()
        })

    plot_calibration(args_dict)
Example #11
def main_train_loop(save_dir, model, args):
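    # Full training loop: resume from the latest checkpoint if present, build loaders and
    # the LR scheduler, train, periodically checkpoint and evaluate, then visualize
    # reconstructions and reload the best checkpoint for a final evaluation.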

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_class = len(args.cates)
    # resume checkpoint
    start_epoch = 0
    optimizer = initilize_optimizer(model, args)
    if args.resume_checkpoint is None and os.path.exists(
            os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
    if args.resume_checkpoint is not None:
        if args.resume_optimizer:
            model, optimizer, start_epoch = resume(
                args.resume_checkpoint,
                model,
                optimizer,
                strict=(not args.resume_non_strict))
        else:
            model, _, start_epoch = resume(args.resume_checkpoint,
                                           model,
                                           optimizer=None,
                                           strict=(not args.resume_non_strict))
        print('Resumed from: ' + args.resume_checkpoint)

    # initialize datasets and loaders
    tr_dataset, te_dataset = get_datasets(args)

    train_sampler = None  # for non-distributed training

    train_loader = torch.utils.data.DataLoader(dataset=tr_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               worker_init_fn=np.random.seed(
                                                   args.seed))
    test_loader = torch.utils.data.DataLoader(dataset=te_dataset,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True,
                                              drop_last=False,
                                              worker_init_fn=np.random.seed(
                                                  args.seed))

    #initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=args.epochs // 2,
                                              gamma=0.1)
    elif args.scheduler == 'linear':

        def lambda_rule(ep):
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(
                0.5 * args.epochs)
            return lr_l

        scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                lr_lambda=lambda_rule)
    else:
        assert 0, "args.schedulers should be either 'exponential' or 'linear'"

    #training starts from here
    tot_nelbo = []
    tot_kl_loss = []
    tot_x_reconst = []

    best_eval_metric = float('+inf')

    for epoch in range(start_epoch, args.epochs):
        # adjust the learning rate
        if (epoch + 1) % args.exp_decay_freq == 0:
            scheduler.step(epoch=epoch)
        #train for one epoch
        model.train()
        for bidx, data in enumerate(train_loader):
            idx_batch, tr_batch, te_batch = data['idx'], data[
                'train_points'], data['test_points']
            obj_type = data['cate_idx']
            y_one_hot = obj_type.new(
                np.eye(n_class)[obj_type]).to(device).float()
            step = bidx + len(train_loader) * epoch

            if args.random_rotate:
                tr_batch, _, _ = apply_random_rotation(
                    tr_batch, rot_axis=train_loader.dataset.gravity_axis)

            inputs = tr_batch.to(device)
            y_one_hot = y_one_hot.to(device)
            optimizer.zero_grad()
            inputs_dict = {'x': inputs, 'y_class': y_one_hot}
            ret = model(inputs_dict)
            loss, nelbo, kl_loss, x_reconst, cl_loss = ret['loss'], ret[
                'nelbo'], ret['kl_loss'], ret['x_reconst'], ret['cl_loss']
            loss.backward()
            optimizer.step()

            cur_loss = loss.cpu().item()
            cur_nelbo = nelbo.cpu().item()
            cur_kl_loss = kl_loss.cpu().item()
            cur_x_reconst = x_reconst.cpu().item()
            cur_cl_loss = cl_loss.cpu().item()
            tot_nelbo.append(cur_nelbo)
            tot_kl_loss.append(cur_kl_loss)
            tot_x_reconst.append(cur_x_reconst)
            if step % args.log_freq == 0:
                print(
                    "Epoch {0:6d} Step {1:12d} Loss {2:12.6f} Nelbo {3:12.6f} KL Loss {4:12.6f} Reconst Loss {5:12.6f} CL_Loss{6:12.6f}"
                    .format(epoch, step, cur_loss, cur_nelbo, cur_kl_loss,
                            cur_x_reconst, cur_cl_loss))

        #save checkpoint
        if (epoch + 1) % args.save_freq == 0:
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-latest.pt'))

            eval_metric = evaluate_model(model, te_dataset, args)
            train_metric = evaluate_model(model, tr_dataset, args)

            print('Checkpoint: Dev Reconst Loss:{0}, Train Reconst Loss:{1}'.
                  format(eval_metric, train_metric))
            if eval_metric < best_eval_metric:
                best_eval_metric = eval_metric
                save(model, optimizer, epoch + 1,
                     os.path.join(save_dir, 'checkpoint-best.pt'))
                print('new best model found!')

    save(model, optimizer, args.epochs,
         os.path.join(save_dir, 'checkpoint-latest.pt'))
    # save a final visualization of 5 reconstructed samples
    model.eval()
    with torch.no_grad():
        samples_A = model.reconstruct_input(inputs)  #sample_point(5)
        results = []
        for idx in range(5):
            res = visualize_point_clouds(
                samples_A[idx],
                tr_batch[idx],
                idx,
                pert_order=train_loader.dataset.display_axis_order)
            results.append(res)
        res = np.concatenate(results, axis=1)
        imsave(os.path.join(save_dir, 'images', '_epoch%d.png' % (epoch)),
               res.transpose((1, 2, 0)))

    #load the best model and compute eval metric:
    best_model_path = os.path.join(save_dir, 'checkpoint-best.pt')
    ckpt = torch.load(best_model_path)
    model.load_state_dict(ckpt['model'], strict=True)
    eval_metric = evaluate_model(model, te_dataset, args)
    train_metric = evaluate_model(model, tr_dataset, args)
    print(
        'Best model at epoch:{2} Dev Reconst Loss:{0}, Train Reconst Loss:{1}'.
        format(eval_metric, train_metric, ckpt['epoch']))
Example #12
def main(
        batch_size,
        nworkers,
        outdir,
        num_epochs,
        snapshot,
        finetune,
        lr,
        lradapt,
        experiment,
        labelimage,
        smoketest=False,
        trainpath=None,
        validpath=None
    ):
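    # Segmentation training driver: build Houston train/val loaders, pick a network by
    # experiment id, optionally resume from a snapshot or finetune checkpoint, and delegate
    # the training loop to Trainer.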

    np.random.seed(0)
    torch.manual_seed(0)

    # Visdom environment
    visdom_environment = experiment + "_" + labelimage.replace(".tif", "")
    outdir = os.path.join(outdir, visdom_environment)

    if validpath is None:
        validpath = os.environ[VALIDATA_ENVIRONMENT_VARIABLE]
    if trainpath is None:
        trainpath = os.environ[TRAINDATA_ENVIRONMENT_VARIABLE]

    train = train_houston_data_loader(trainpath, batch_size=batch_size, num_workers=nworkers,
                                      shuffle=True, validation=False, labelimage=labelimage)
    val = val_houston_data_loader(validpath, batch_size=batch_size, num_workers=nworkers,
                                  shuffle=True, validation=True, labelimage=labelimage)

    if experiment == "vhr":
        network = pspnet_10m()
    elif experiment == "s1":
        network = input_keep_res_net_34_s1_all()
    elif experiment == "s2":
        network = input_keep_res_net_34_s2_all()
    elif experiment == "vhrs1":
        network = pspnet_fused_s1_10m()
    elif experiment == "vhrs2":
        network = pspnet_fused_s2_10m()
    elif experiment == "s1s2":
        network = psp34_sentinel1_and_sentinel2()
    elif experiment == "vhrs1s2":
        network = pspnet_fused_s1s2_10m()
    else:
        raise ValueError("Please insert a valid experiment id. Valid experiments are 'vhr', 's1', 's2', 'vhrs1, 'vhrs2', 'vhrs1s2'")

    network = nn.DataParallel(network)
    if torch.cuda.is_available():
        network = network.cuda()

    if finetune or snapshot:
        resume(finetune or snapshot, network, None)
    optimizer = optim.Adam(network.parameters(), lr=lr)

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=lradapt)

    if snapshot:
        state = resume(snapshot, None, optimizer)
        train.iterations = state['iteration']

    loss = nn.NLLLoss2d()
    if torch.cuda.is_available():
        loss = loss.cuda()

    trainer = Trainer(
        network, optimizer, scheduler, loss, train, val,
        outdir, visdom_environment, smoketest
    )
    trainer.train(num_epochs, start_epoch=0)
Example #13
def main():
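    # Point-cloud flow training with all hyper-parameters set inline; optionally resumes
    # from args.resume_path, then trains with a StepLR schedule and saves milestones.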
    torch.backends.cudnn.benchmark = True

    # initialize hyper-parameters
    args = dictobj()
    args.gpu = torch.device('cuda:%d' % (6))
    timestamp = '%d-%d-%d-%d-%d-%d-%d-%d-%d' % time.localtime(time.time())
    args.log_name = '%s-pointflow' % timestamp
    writer = SummaryWriter(comment=args.log_name)

    args.use_latent_flow, args.prior_w, args.entropy_w, args.recon_w = True, 1., 1., 1.
    args.fin, args.fz = 3, 128
    args.use_deterministic_encoder = True
    args.distributed = False
    args.optimizer = optim.Adam
    args.batch_size = 16
    args.lr, args.beta1, args.beta2, args.weight_decay = 1e-3, 0.9, 0.999, 1e-4
    args.T, args.train_T, args.atol, args.rtol = 1., False, 1e-5, 1e-5
    args.layer_type = diffop.CoScaleLinear
    args.solver = 'dopri5'
    args.use_adjoint, args.bn = True, False
    args.dims, args.num_blocks = (512, 512), 1  # originally (512 * 3)
    args.latent_dims, args.latent_num_blocks = (256, 256), 1

    args.resume, args.resume_path = False, None
    args.end_epoch = 2000
    args.scheduler, args.scheduler_step_size = optim.lr_scheduler.StepLR, 20
    args.random_rotation = True
    args.save_freq = 10

    args.dataset_type = 'shapenet15k'
    args.cates = ['airplane']  # 'all' for all categories training
    args.tr_max_sample_points, args.te_max_sample_points = 2048, 2048
    args.dataset_scale = 1.0
    args.normalize_per_shape = False
    args.normalize_std_per_axis = False
    args.num_workers = 4
    args.data_dir = "/data/ShapeNetCore.v2.PC15k"

    torch.cuda.set_device(args.gpu)
    model = PointFlow(**args).cuda(args.gpu)

    # load milestone
    epoch = 0
    optimizer = model.get_optimizer(**args)
    if args.resume:
        model, optimizer, epoch = resume(args.resume_path,
                                         model,
                                         optimizer,
                                         strict=True)
        print("Loaded model from %s" % args.resume_path)

    # load data
    train_dataset, test_dataset = get_datasets(args)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               pin_memory=True,
                                               sampler=None,
                                               drop_last=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              sampler=None,
                                              drop_last=False)

    if args.scheduler == optim.lr_scheduler.StepLR:
        scheduler = optim.lr_scheduler.StepLR(
            optimizer, step_size=args.scheduler_step_size, gamma=0.65)
    else:
        raise NotImplementedError("Only StepLR supported")

    ent_rec, latent_rec, recon_rec = Averager(), Averager(), Averager()
    for e in trange(epoch, args.end_epoch):
        # record lr
        if writer is not None:
            writer.add_scalar('lr/optimizer', scheduler.get_lr()[0], e)

        # feed a batch, train
        for idx, data in enumerate(tqdm(train_loader)):
            idx_batch, tr_batch, te_batch = data['idx'], data[
                'train_points'], data['test_points']
            model.train()
            if args.random_rotation:
                # raise NotImplementedError('Random Rotation Augmentation not implemented yet')
                tr_batch, _, _ = apply_random_rotation(
                    tr_batch, rot_axis=train_loader.dataset.gravity_axis)
            inputs = tr_batch.cuda(args.gpu, non_blocking=True)
            step = idx + len(train_loader) * e  # batch step
            out = model(inputs, optimizer, step, writer, sample_gpu=args.gpu)
            entropy, prior_nats, recon_nats = out['entropy'], out[
                'prior_nats'], out['recon_nats']
            ent_rec.update(entropy)
            recon_rec.update(recon_nats)
            latent_rec.update(prior_nats)

        # update lr
        scheduler.step(epoch=e)

        # save milestones
        if e % args.save_freq == 0 and e != 0:
            save(model, optimizer, e, path='milestone-%d.save' % e)
            save(model, optimizer, e,
                 path='milestone-latest.save')  # save as latest model
Example #14
    options = interface.gui()

input_files = arguments.get_input_files(options.input)
stoichiometry = None
if options.stoichiometry is not None:
    stoichiometry = arguments.parse_stoichiometry(options.stoichiometry)
if options.verbose:
    utils.options = options
    sys.stderr.write("Input correctly parsed.\nFiles used as input:\n")
    for file in input_files:
        sys.stderr.write("\t" + file + "\n")
    sys.stderr.write("\n")

# Step 2: get possible structures for macrocomplex construction and skip others
if options.resume:
    (chains, pairs, similar_chains, structures) = utils.resume(options)
else:
    (chains, pairs, similar_chains,
     structures) = utils.get_information(input_files, options)

complexes_found = []
if options.verbose:
    sys.stderr.write("\n# Beginning to construct the complex\n\n")


# STEP4: Begin Macrocomplex reconstruction!
def construct_complex(current_complex_real, similar_chains, stoichiometry,
                      structures, used_pairs_real, clashing_real,
                      old_complex_real):
    # bruteforce ending!
    current_complex = copy.deepcopy(current_complex_real)
Example #15
def main_worker(gpu, save_dir, ngpus_per_node, init_data, args):
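    # Per-process worker for (optionally distributed) training: resume from a checkpoint or
    # initialize Actnorm on init_data, then train, validate, checkpoint, and save
    # point-cloud visualizations.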
    # basic setup
    cudnn.benchmark = True
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # resume training!!!
    #################################
    if args.resume_checkpoint is None and os.path.exists(os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
        print('Checkpoint is set to the latest one.')
    #################################

    # multi-GPU setup
    model = SoftPointFlow(args)
    if args.distributed:  # Multiple processes, single GPU per process
        if args.gpu is not None:
            def _transform_(m):
                return nn.parallel.DistributedDataParallel(
                    m, device_ids=[args.gpu], output_device=args.gpu, check_reduction=True)

            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            model.multi_gpu_wrapper(_transform_)
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = 0
        else:
            assert 0, "DistributedDataParallel constructor should always set the single device scope"
    else:  # Single process, multiple GPUs per process
        def _transform_(m):
            return nn.DataParallel(m)
        model = model.cuda()
        model.multi_gpu_wrapper(_transform_)

    start_epoch = 1
    valid_loss_best = 987654321
    optimizer = model.make_optimizer(args)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
    if args.resume_checkpoint is not None:
        model, optimizer, scheduler, start_epoch, valid_loss_best, log_dir = resume(
            args.resume_checkpoint, model, optimizer, scheduler)
        model.set_initialized(True)
        print('Resumed from: ' + args.resume_checkpoint)

    else:
        log_dir = save_dir + "/runs/" + str(time.strftime('%Y-%m-%d_%H:%M:%S'))
        with torch.no_grad():
            inputs, inputs_noisy, std_in = init_data
            inputs = inputs.to(args.gpu, non_blocking=True)
            inputs_noisy = inputs_noisy.to(args.gpu, non_blocking=True)
            std_in = std_in.to(args.gpu, non_blocking=True)
            _ = model(inputs, inputs_noisy, std_in, optimizer,  None, None, init=True)
        del inputs, inputs_noisy, std_in
        print('Actnorm is initialized')

    if not args.distributed or (args.rank % ngpus_per_node == 0):
        writer = SummaryWriter(logdir=log_dir)
    else:
        writer = None

    # initialize datasets and loaders
    tr_dataset = get_trainset(args)
    te_dataset = get_testset(args)
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(tr_dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(te_dataset)
    else:
        train_sampler = None
        test_sampler = None
        
    train_loader = torch.utils.data.DataLoader(
        dataset=tr_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=0, pin_memory=True, sampler=train_sampler, drop_last=True,
        worker_init_fn=init_np_seed)

    test_loader = torch.utils.data.DataLoader(
        dataset=te_dataset, batch_size=args.batch_size, shuffle=(test_sampler is None),
        num_workers=0, pin_memory=True, sampler=test_sampler, drop_last=True,
        worker_init_fn=init_np_seed)

    # save dataset statistics
    if not args.distributed or (args.rank % ngpus_per_node == 0):
        np.save(os.path.join(save_dir, "train_set_mean.npy"), tr_dataset.all_points_mean)
        np.save(os.path.join(save_dir, "train_set_std.npy"), tr_dataset.all_points_std)
        np.save(os.path.join(save_dir, "train_set_idx.npy"), np.array(tr_dataset.shuffle_idx))
    
    # main training loop
    if args.distributed:
        print("[Rank %d] World size : %d" % (args.rank, dist.get_world_size()))

    seen_inputs = next(iter(train_loader))['train_points'].cuda(args.gpu, non_blocking=True)
    unseen_inputs = next(iter(test_loader))['test_points'].cuda(args.gpu, non_blocking=True)
    del test_loader

    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs+1):
        start_time = time.time()
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if writer is not None:
            writer.add_scalar('lr/optimizer', scheduler.get_lr()[0], epoch)

        model.train()
        # train for one epoch
        
        for bidx, data in enumerate(train_loader):
            step = bidx + len(train_loader) * (epoch - 1)
            tr_batch = data['train_points']
            if args.random_rotate:
                tr_batch, _, _ = apply_random_rotation(
                    tr_batch, rot_axis=train_loader.dataset.gravity_axis)

            inputs = tr_batch.cuda(args.gpu, non_blocking=True)
            B, N, D = inputs.shape
            std = (args.std_max - args.std_min) * torch.rand_like(inputs[:,:,0]).view(B,N,1) + args.std_min

            eps = torch.randn_like(inputs) * std
            std_in = std / args.std_max * args.std_scale
            inputs_noisy = inputs + eps
            out = model(inputs, inputs_noisy, std_in, optimizer, step, writer)
            entropy, prior_nats, recon_nats, loss = out['entropy'], out['prior_nats'], out['recon_nats'], out['loss']
            if step % args.log_freq == 0:
                duration = time.time() - start_time
                start_time = time.time()
                if writer is not None:
                    writer.add_scalar('train/avg_time', duration, step)
                print("[Rank %d] Epoch %d Batch [%2d/%2d] Time [%3.2fs] Entropy %2.5f LatentNats %2.5f PointNats %2.5f loss %2.5f"
                      % (args.rank, epoch, bidx, len(train_loader), duration, entropy,
                         prior_nats, recon_nats, loss))
            del inputs, inputs_noisy, std_in, out, eps
            gc.collect()

        if epoch < args.stop_scheduler:
            scheduler.step()

        if epoch % args.valid_freq == 0:
            with torch.no_grad():
                model.eval()
                valid_loss = 0.0
                valid_entropy = 0.0
                valid_prior = 0.0
                valid_prior_nats = 0.0
                valid_recon = 0.0
                valid_recon_nats = 0.0
                for bidx, data in enumerate(train_loader):
                    step = bidx + len(train_loader) * epoch
                    tr_batch = data['test_points']
                    if args.random_rotate:
                        tr_batch, _, _ = apply_random_rotation(
                            tr_batch, rot_axis=train_loader.dataset.gravity_axis)

                    inputs = tr_batch.cuda(args.gpu, non_blocking=True)
                    B, N, D = inputs.shape
                    std = (args.std_max - args.std_min) * torch.rand_like(inputs[:,:,0]).view(B,N,1) + args.std_min

                    eps = torch.randn_like(inputs) * std
                    std_in = std / args.std_max * args.std_scale
                    inputs_noisy = inputs + eps
                    out = model(inputs, inputs_noisy, std_in, optimizer, step, writer, valid=True)
                    valid_loss += out['loss'] / len(train_loader)
                    valid_entropy += out['entropy'] / len(train_loader)
                    valid_prior += out['prior'] / len(train_loader)
                    valid_prior_nats += out['prior_nats'] / len(train_loader)
                    valid_recon += out['recon'] / len(train_loader)
                    valid_recon_nats += out['recon_nats'] / len(train_loader)
                    del inputs, inputs_noisy, std_in, out, eps
                    gc.collect()

                if writer is not None:
                    writer.add_scalar('valid/entropy', valid_entropy, epoch)
                    writer.add_scalar('valid/prior', valid_prior, epoch)
                    writer.add_scalar('valid/prior(nats)', valid_prior_nats, epoch)
                    writer.add_scalar('valid/recon', valid_recon, epoch)
                    writer.add_scalar('valid/recon(nats)', valid_recon_nats, epoch)
                    writer.add_scalar('valid/loss', valid_loss, epoch)
                
                duration = time.time() - start_time
                start_time = time.time()
                print("[Valid] Epoch %d Time [%3.2fs] Entropy %2.5f LatentNats %2.5f PointNats %2.5f loss %2.5f loss_best %2.5f"
                    % (epoch, duration, valid_entropy, valid_prior_nats, valid_recon_nats, valid_loss, valid_loss_best))
                if valid_loss < valid_loss_best:
                    valid_loss_best = valid_loss
                    if not args.distributed or (args.rank % ngpus_per_node == 0):
                        save(model, optimizer, epoch + 1, scheduler, valid_loss_best, log_dir,
                            os.path.join(save_dir, 'checkpoint-best.pt'))
                        print('best model saved!')

        if epoch % args.save_freq == 0 and (not args.distributed or (args.rank % ngpus_per_node == 0)):
            save(model, optimizer, epoch + 1, scheduler, valid_loss_best, log_dir,
                os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
            save(model, optimizer, epoch + 1, scheduler, valid_loss_best, log_dir,
                os.path.join(save_dir, 'checkpoint-latest.pt'))
            print('model saved!')

        # save visualizations
        if epoch % args.viz_freq == 0:
            with torch.no_grad():
                # reconstructions
                model.eval()
                samples = model.reconstruct(unseen_inputs)
                results = []
                for idx in range(min(16, unseen_inputs.size(0))):
                    res = visualize_point_clouds(samples[idx], unseen_inputs[idx], idx,
                                                pert_order=train_loader.dataset.display_axis_order)

                    results.append(res)
                res = np.concatenate(results, axis=1)
                imageio.imwrite(os.path.join(save_dir, 'images', 'SPF_epoch%d-gpu%s_recon_unseen.png' % (epoch, args.gpu)),
                                res.transpose(1, 2, 0))
                if writer is not None:
                    writer.add_image('tr_vis/conditioned', torch.as_tensor(res), epoch)

                samples = model.reconstruct(seen_inputs)
                results = []
                for idx in range(min(16, seen_inputs.size(0))):
                    res = visualize_point_clouds(samples[idx], seen_inputs[idx], idx,
                                                pert_order=train_loader.dataset.display_axis_order)

                    results.append(res)
                res = np.concatenate(results, axis=1)
                imageio.imwrite(os.path.join(save_dir, 'images', 'SPF_epoch%d-gpu%s_recon_seen.png' % (epoch, args.gpu)),
                                res.transpose(1, 2, 0))
                if writer is not None:
                    writer.add_image('tr_vis/conditioned', torch.as_tensor(res), epoch)

                num_samples = min(16, unseen_inputs.size(0))
                num_points = unseen_inputs.size(1)
                _, samples = model.sample(num_samples, num_points)
                results = []
                for idx in range(num_samples):
                    res = visualize_point_clouds(samples[idx], unseen_inputs[idx], idx,
                                                pert_order=train_loader.dataset.display_axis_order)
                    results.append(res)
                res = np.concatenate(results, axis=1)
                imageio.imwrite(os.path.join(save_dir, 'images', 'SPF_epoch%d-gpu%s_sample.png' % (epoch, args.gpu)),
                                res.transpose((1, 2, 0)))
                if writer is not None:
                    writer.add_image('tr_vis/sampled', torch.as_tensor(res), epoch)
                
                print('image saved!')
Example #16
def main_worker(gpu, save_dir, ngpus_per_node, args):
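    # Per-process worker: set up single- or multi-GPU training, resume the latest checkpoint
    # if present, train, and periodically save visualizations and checkpoints.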
    # basic setup
    cudnn.benchmark = True
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.distributed:
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    if args.log_name is not None:
        log_dir = "runs/%s" % args.log_name
    else:
        log_dir = "runs/time-%d" % time.time()

    if not args.distributed or (args.rank % ngpus_per_node == 0):
        writer = SummaryWriter(logdir=log_dir)
    else:
        writer = None

    if not args.use_latent_flow:  # auto-encoder only
        args.prior_weight = 0
        args.entropy_weight = 0

    # multi-GPU setup
    model = PointFlow(args)
    if args.distributed:  # Multiple processes, single GPU per process
        if args.gpu is not None:

            def _transform_(m):
                return nn.parallel.DistributedDataParallel(
                    m,
                    device_ids=[args.gpu],
                    output_device=args.gpu,
                    check_reduction=True)

            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            model.multi_gpu_wrapper(_transform_)
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = 0
        else:
            assert 0, "DistributedDataParallel constructor should always set the single device scope"
    elif args.gpu is not None:  # Single process, single GPU per process
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:  # Single process, multiple GPUs per process

        def _transform_(m):
            return nn.DataParallel(m)

        model = model.cuda()
        model.multi_gpu_wrapper(_transform_)

    # resume checkpoints
    start_epoch = 0
    optimizer = model.make_optimizer(args)
    if args.resume_checkpoint is None and os.path.exists(
            os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
    if args.resume_checkpoint is not None:
        if args.resume_optimizer:
            model, optimizer, start_epoch = resume(
                args.resume_checkpoint,
                model,
                optimizer,
                strict=(not args.resume_non_strict))
        else:
            model, _, start_epoch = resume(args.resume_checkpoint,
                                           model,
                                           optimizer=None,
                                           strict=(not args.resume_non_strict))
        print('Resumed from: ' + args.resume_checkpoint)

    # initialize datasets and loaders
    tr_dataset = MyDataset(args.data_dir, istest=False)
    te_dataset = MyDataset(args.data_dir, istest=True)
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            tr_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(dataset=tr_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               worker_init_fn=init_np_seed)
    test_loader = torch.utils.data.DataLoader(dataset=te_dataset,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True,
                                              drop_last=False,
                                              worker_init_fn=init_np_seed)

    # save dataset statistics
    # if not args.distributed or (args.rank % ngpus_per_node == 0):
    #     np.save(os.path.join(save_dir, "train_set_mean.npy"), tr_dataset.all_points_mean)
    #     np.save(os.path.join(save_dir, "train_set_std.npy"), tr_dataset.all_points_std)
    #     np.save(os.path.join(save_dir, "train_set_idx.npy"), np.array(tr_dataset.shuffle_idx))
    #     np.save(os.path.join(save_dir, "val_set_mean.npy"), te_dataset.all_points_mean)
    #     np.save(os.path.join(save_dir, "val_set_std.npy"), te_dataset.all_points_std)
    #     np.save(os.path.join(save_dir, "val_set_idx.npy"), np.array(te_dataset.shuffle_idx))

    # load classification dataset if needed
    if args.eval_classification:
        from datasets import get_clf_datasets

        def _make_data_loader_(dataset):
            return torch.utils.data.DataLoader(dataset=dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               drop_last=False,
                                               worker_init_fn=init_np_seed)

        clf_datasets = get_clf_datasets(args)
        clf_loaders = {
            k: [_make_data_loader_(ds) for ds in ds_lst]
            for k, ds_lst in clf_datasets.items()
        }
    else:
        clf_loaders = None

    # initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=args.epochs // 2,
                                              gamma=0.1)
    elif args.scheduler == 'linear':

        def lambda_rule(ep):
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(
                0.5 * args.epochs)
            return lr_l

        scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                lr_lambda=lambda_rule)
    else:
        assert 0, "args.schedulers should be either 'exponential' or 'linear'"

    # main training loop
    start_time = time.time()
    entropy_avg_meter = AverageValueMeter()
    latent_nats_avg_meter = AverageValueMeter()
    point_nats_avg_meter = AverageValueMeter()
    if args.distributed:
        print("[Rank %d] World size : %d" % (args.rank, dist.get_world_size()))

    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # adjust the learning rate
        if (epoch + 1) % args.exp_decay_freq == 0:
            scheduler.step(epoch=epoch)
            if writer is not None:
                writer.add_scalar('lr/optimizer', scheduler.get_lr()[0], epoch)

        # train for one epoch
        for bidx, data in enumerate(train_loader):
            idx_batch, tr_batch, te_batch = data['idx'], data[
                'train_points'], data['test_points']
            step = bidx + len(train_loader) * epoch
            model.train()
            inputs = tr_batch.cuda(args.gpu, non_blocking=True)
            out = model(inputs, optimizer, step, writer)
            entropy, prior_nats, recon_nats = out['entropy'], out[
                'prior_nats'], out['recon_nats']
            entropy_avg_meter.update(entropy)
            point_nats_avg_meter.update(recon_nats)
            latent_nats_avg_meter.update(prior_nats)
            if step % args.log_freq == 0:
                duration = time.time() - start_time
                start_time = time.time()
                print(
                    "[Rank %d] Epoch %d Batch [%2d/%2d] Time [%3.2fs] Entropy %2.5f LatentNats %2.5f PointNats %2.5f"
                    % (args.rank, epoch, bidx, len(train_loader), duration,
                       entropy_avg_meter.avg, latent_nats_avg_meter.avg,
                       point_nats_avg_meter.avg))

        # evaluate on the validation set
        # if not args.no_validation and (epoch + 1) % args.val_freq == 0:
        #     from utils import validate
        #     validate(test_loader, model, epoch, writer, save_dir, args, clf_loaders=clf_loaders)

        # save visualizations
        if (epoch + 1) % args.viz_freq == 0:
            # reconstructions
            model.eval()
            samples = model.reconstruct(inputs)
            results = []
            for idx in range(min(10, inputs.size(0))):
                res = visualize_point_clouds(samples[idx], inputs[idx], idx)
                results.append(res)
            res = np.concatenate(results, axis=1)
            scipy.misc.imsave(
                os.path.join(
                    save_dir, 'images',
                    'tr_vis_conditioned_epoch%d-gpu%s.png' %
                    (epoch, args.gpu)), res.transpose((1, 2, 0)))
            if writer is not None:
                writer.add_image('tr_vis/conditioned', torch.as_tensor(res),
                                 epoch)

            # samples
            if args.use_latent_flow:
                num_samples = min(10, inputs.size(0))
                num_points = inputs.size(1)
                _, samples = model.sample(num_samples, num_points)
                results = []
                for idx in range(num_samples):
                    res = visualize_point_clouds(samples[idx], inputs[idx],
                                                 idx)
                    results.append(res)
                res = np.concatenate(results, axis=1)
                scipy.misc.imsave(
                    os.path.join(
                        save_dir, 'images',
                        'tr_vis_conditioned_epoch%d-gpu%s.png' %
                        (epoch, args.gpu)), res.transpose((1, 2, 0)))
                if writer is not None:
                    writer.add_image('tr_vis/sampled', torch.as_tensor(res),
                                     epoch)

        # save checkpoints
        if not args.distributed or (args.rank % ngpus_per_node == 0):
            if (epoch + 1) % args.save_freq == 0:
                save(model, optimizer, epoch + 1,
                     os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
                save(model, optimizer, epoch + 1,
                     os.path.join(save_dir, 'checkpoint-latest.pt'))
Example #17
def read_main_(config_file,logger,kfold_index=None,return_uAUG=True):
    """
    Parse the config, build the model and data loaders (optionally resuming), and
    return a dict with keys 'popen', 'model', and 'loader_ls'.
    """
    POPEN = Auto_popen( config_file)

    POPEN.kfold_index =  kfold_index
    if POPEN.kfold_cv:
        if  kfold_index is None:
            raise NotImplementedError("please specify the kfold index to perform K fold cross validation")
        POPEN.vae_log_path = POPEN.vae_log_path.replace(".log","_cv%d.log"% kfold_index)
        POPEN.vae_pth_path = POPEN.vae_pth_path.replace(".pth","_cv%d.pth"% kfold_index)

    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # cuda2 = torch.device('cuda:2')

    # Run name
    if POPEN.run_name is None:
        run_name = POPEN.model_type + time.strftime("__%Y_%m_%d_%H:%M")
    else:
        run_name = POPEN.run_name

    # log dir
    # logger = utils.setup_logs(POPEN.vae_log_path)

    # build the model dir or check for a resumable experiment
    POPEN.check_experiment(logger)


    #                               |=====================================|
    #                               |===========   setup  part  ==========|
    #                               |=====================================|

    # read data
    if return_uAUG:
        POPEN.aux_task_columns += ['with_uAUG']
    loader_ls = reader.get_dataloader(POPEN)

    # ===========  setup model  ===========
    # train_iter = iter(train_loader)
    # X,Y  = next(train_iter)
    # -- pretrain -- 
    if POPEN.pretrain_pth is not None:
        # load the pretrained model
        pretrain_popen = Auto_popen(POPEN.pretrain_pth)
        try:
            pretrain_model = pretrain_popen.Model_Class(*pretrain_popen.model_args)

            utils.load_model(pretrain_popen,pretrain_model,logger)
        except:
            pretrain_model = torch.load(pretrain_popen.vae_pth_path)['state_dict']

        # DL_models.LSTM_AE
        if POPEN.Model_Class == pretrain_popen.Model_Class:
            # if not POPEN.Resumable:
            #     # we only load pre-train for the first time 
            #     # later we can resume 
            model = pretrain_model
            del pretrain_model
        else:
            downstream_model = POPEN.Model_Class(*POPEN.model_args)

            # merge 
            model = MTL_models.Enc_n_Down(pretrain_model,downstream_model)

    # -- end2end -- 
    elif POPEN.path_category == "CrossStitch":
        backbone = {}
        for t in POPEN.tasks:
            task_popen = Auto_popen(POPEN.backbone_config[t])
            task_model = task_popen.Model_Class(*task_popen.model_args)
            load_model(task_popen,task_model,logger)
            backbone[t] = task_model
        POPEN.model_args = [backbone] + POPEN.model_args
        model = POPEN.Model_Class(*POPEN.model_args)
    else:
        Model_Class = POPEN.Model_Class  # DL_models.LSTM_AE
        model = Model_Class(*POPEN.model_args)
    # =========== set optimizer ===========
    if POPEN.optimizer == 'Schedule':
        optimizer = ScheduledOptim(optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 
                                          betas=(0.9, 0.98), 
                                          eps=1e-09, 
                                          weight_decay=1e-4, 
                                          amsgrad=True),
                               n_warmup_steps=20)
    elif type(POPEN.optimizer) == dict:
        optimizer = eval(scheduleoptim_dict_str.format(**POPEN.optimizer))
    else:
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 
                                lr=POPEN.lr,
                                betas=(0.9, 0.98), 
                                eps=1e-09, 
                                weight_decay=POPEN.l2)

    if POPEN.loss_schema == 'DTP':
        POPEN.loss_schedualer = Dynamic_Task_Priority(POPEN.tasks,POPEN.gamma,POPEN.chimerla_weight)
    elif POPEN.loss_schema == 'DWA':
        POPEN.loss_schedualer = Dynamic_Weight_Averaging(POPEN.tasks,POPEN.tau,POPEN.chimerla_weight)
    # =========== resume ===========
    best_loss = np.inf
    best_acc = 0
    best_epoch = 0
    previous_epoch = 0
    if POPEN.Resumable:
        model,previous_epoch,best_loss,best_acc = resume(POPEN,model,optimizer,logger)


    # =========== fix parameters ===========
    if POPEN.modual_to_fix in dir(model):
        model = fix_parameter(model,POPEN.modual_to_fix)
        logger.info(' \t \t ==============<<< %s part is fixed>>>============== \t \t \n'%POPEN.modual_to_fix)
    
    return {"popen":POPEN,"model":model,"loader_ls":loader_ls}
Example #18
def main(
    batch_size,
    num_mini_batches,
    nworkers,
    datadir,
    outdir,
    num_epochs,
    snapshot,
    finetune,
    lr,
    n_classes,
    loadvgg,
    network_type,
    fusion,
    data,
):
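    # Evaluation script: pick a damage-estimation network by type, load finetune/snapshot
    # weights via resume(), then run the validation loader and report metrics per input.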

    n_classes = 2
    tile_size = 960
    channel_basis = {
        'pre_img10': 3,
        'post_img10': 3,
        'pre_sar': 1,
        'post_sar': 1,
        'vhr': 3
    }
    channel_dict = dict()
    np.random.seed(0)
    network_type = 'baseline_vhr'

    for item in data:
        channel_dict['{}'.format(item)] = channel_basis[item]

    if network_type == 'baseline_vhr':
        network = damage_net_vhr(n_classes=n_classes)
        network.load_state_dict(
            model_zoo.load_url(
                'https://download.pytorch.org/models/resnet50-19c8e357.pth'))
    elif network_type == 'baseline_s1':
        network = damage_net_s1(n_classes=n_classes)
        network.load_state_dict(
            model_zoo.load_url(
                'https://download.pytorch.org/models/resnet50-19c8e357.pth'))
    elif network_type == 'baseline_s2':
        network = damage_net_s2(n_classes=n_classes)
        network.load_state_dict(
            model_zoo.load_url(
                'https://download.pytorch.org/models/resnet50-19c8e357.pth'))
    elif network_type == 'damagenet_fusion_simple':
        network = damage_net_vhr_fusion_simple(n_classes=n_classes)
        network.load_state_dict(
            model_zoo.load_url(
                'https://download.pytorch.org/models/resnet50-19c8e357.pth'))

    if pwd.getpwuid(os.getuid())[0] == "jf330":
        finetune = "/Users/jf330/Downloads/results/epoch_{:02}_classes_{:02}.pth".format(
            num_epochs, n_classes)
    elif pwd.getpwuid(os.getuid())[0] == "timrudner":
        finetune = "/Volumes/Data/Google_Drive/AYA_Google_Drive/Git/fdl-eo/code/damage-density-estimation/src/results/epoch_{:02}_classes_{:02}.pth".format(
            num_epochs, n_classes)
    else:
        finetune = "/results/epoch_{:02}_classes_{:02}.pth".format(
            num_epochs, n_classes)

    val = val_houston_data_loader(batch_size=batch_size,
                                  num_workers=nworkers,
                                  channels=channel_dict,
                                  tile_size=tile_size,
                                  n_classes=n_classes,
                                  shuffle=False,
                                  validation=True)
    metric = classmetric()

    if torch.cuda.is_available():
        network = network.cuda()

    if loadvgg:
        network.load_vgg16_weights()

    if torch.cuda.is_available():
        network = nn.DataParallel(network).cuda()
    # else:
    #    network = nn.DataParallel(network)

    param_groups = [{'params': network.parameters(), 'lr': lr}]

    if finetune or snapshot:
        state = resume(finetune or snapshot, network, None)
    loss_str_list = []

    network.eval()
    for iteration, data in enumerate(val):

        input = data[1]
        input_id = data[0]

        upsample = nn.Upsample(size=(int(tile_size / 1.25),
                                     int(tile_size / 1.25)),
                               mode='bilinear',
                               align_corners=True)  # Harvey
        target = upsample(data[2]["label"])

        if torch.cuda.is_available():
            target = Variable(target.float()).cuda()
        else:
            target = Variable(target.float())

        output_raw = network.forward(input)

        # Normalize
        if n_classes == 1:
            output = output_raw
        else:
            soft = nn.Softmax2d()
            output = soft(output_raw)

        output = upsample(output)

        train_metric = metric(target, output)
        loss_str_list.append("Input ID: {}; Metric: {} ".format(
            input_id, str(train_metric)))

        # convert to H x W x C (channel-last)
        if torch.cuda.is_available():
            prediction = output.data.cuda()[0].permute(1, 2, 0)
            target = target.data.cuda()[0].permute(1, 2, 0)
        else:
            prediction = output.data[0].permute(1, 2, 0)
            target = target.data[0].permute(1, 2, 0)

        if not os.path.exists(RESULTS_PATH + "/img"):
            os.makedirs(RESULTS_PATH + "/img")

        # Remove extra dim
        if n_classes == 1:
            prediction_img = prediction.cpu().numpy()
        else:
            # argmax over the channel axis (the last axis after the permute above)
            prediction_img = prediction.argmax(dim=2).cpu().numpy()

        target_img = target.cpu().numpy()

        # Write input image (only first 3 bands)
        # input_img = input.squeeze(0).cpu().numpy()
        #
        # if input_img[:, 0, 0].size >= 3:
        #     input_img = cv2.merge((input_img[0], input_img[1], input_img[2]))
        # else:
        #     input_img = input_img[0]

        #upsample = nn.Upsample(size=(int(tile_size/1.25), int(tile_size/1.25)), mode='bilinear', align_corners=True)  # Harvey

        # cv2.imwrite(RESULTS_PATH+"/img/{}_input_class_{:02}.png".format(iteration, n_classes), input_img*255)
        cv2.imwrite(
            RESULTS_PATH +
            "/img/{}_prediction_class_{:02}.png".format(iteration, n_classes),
            prediction_img * 255)
        cv2.imwrite(
            RESULTS_PATH +
            "/img/{}_target_class_{:02}.png".format(iteration, n_classes),
            target_img * 255)
        #exit(0)

        # rewrite the running metric log; fresh names avoid shadowing the
        # 'output' tensor and the 'val' loader defined above
        with open(RESULTS_PATH + "/MSEloss.csv", "w") as csv_file:
            writer = csv.writer(csv_file, delimiter=';', lineterminator='\n')
            for row in loss_str_list:
                writer.writerow([row])
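The evaluation loop above normalizes the raw logits with Softmax2d and resizes them with a bilinear Upsample module. A functionally equivalent sketch with torch.nn.functional, using illustrative shapes (960-pixel tiles downscaled by 1.25, as in the loop):

import torch
import torch.nn.functional as F

logits = torch.randn(1, 2, 960, 960)              # assumed (N, C, H, W) network output
probs = F.softmax(logits, dim=1)                  # per-pixel class probabilities
probs = F.interpolate(probs, size=(768, 768),     # 960 / 1.25 = 768, as in the loop above
                      mode='bilinear', align_corners=True)
pred = probs.argmax(dim=1)                        # (N, H, W) predicted class map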
Example #19
def main_worker(save_dir, args):
    # basic setup
    cudnn.benchmark = True

    if args.log_name is not None:
        log_dir = "runs/%s" % args.log_name
    else:
        log_dir = f"runs/{datetime.datetime.now().strftime('%m-%d-%H-%M-%S')}"

    if args.local_rank == 0:
        logger = SummaryWriter(log_dir)
    else:
        logger = None

    deepspeed.init_distributed(dist_backend='nccl')
    torch.cuda.set_device(args.local_rank)

    model = SetVAE(args)
    parameters = model.parameters()

    n_parameters = sum(p.numel() for p in parameters if p.requires_grad)
    print(f'number of params: {n_parameters}')
    try:
        n_gen_parameters = sum(p.numel() for p in model.init_set.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.pre_decoder.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.decoder.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.post_decoder.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.output.parameters() if p.requires_grad)
        print(f'number of generator params: {n_gen_parameters}')
    except AttributeError:
        pass

    optimizer, criterion = model.make_optimizer(args)

    # initialize datasets and loaders
    train_dataset, val_dataset, train_loader, val_loader = get_datasets(args)

    # initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(
            optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=args.epochs // 2,
                                                    gamma=0.1)
    elif args.scheduler == 'linear':

        def lambda_rule(ep):
            lr_w = min(1., ep /
                       args.warmup_epochs) if (args.warmup_epochs > 0) else 1.
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(
                0.5 * args.epochs)
            return lr_l * lr_w

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lambda_rule)
    elif args.scheduler == 'cosine':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.epochs)
    else:
        # Fake SCHEDULER
        def lambda_rule(ep):
            return 1.0

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lambda_rule)

    # extract collate_fn
    if args.distributed:
        collate_fn = deepcopy(train_loader.collate_fn)
        model, optimizer, train_loader, scheduler = deepspeed.initialize(
            args=args,
            model=model,
            optimizer=optimizer,
            model_parameters=parameters,
            training_data=train_dataset,
            collate_fn=collate_fn,
            lr_scheduler=scheduler)

    # resume checkpoints
    start_epoch = 0
    if args.resume_checkpoint is None and (
            Path(save_dir) / 'checkpoint-latest.pt').exists():
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
        print('Resumed from: ' + args.resume_checkpoint)
    if args.resume_checkpoint is not None:
        if args.distributed:
            if args.resume_optimizer:
                model.module, model.optimizer, model.lr_scheduler, start_epoch = resume(
                    args.resume_checkpoint,
                    model.module,
                    model.optimizer,
                    scheduler=model.lr_scheduler,
                    strict=(not args.resume_non_strict))
            else:
                model.module, _, _, start_epoch = resume(
                    args.resume_checkpoint,
                    model.module,
                    optimizer=None,
                    strict=(not args.resume_non_strict))
        else:
            if args.resume_optimizer:
                model, optimizer, scheduler, start_epoch = resume(
                    args.resume_checkpoint,
                    model,
                    optimizer,
                    scheduler=scheduler,
                    strict=(not args.resume_non_strict))
            else:
                model, _, _, start_epoch = resume(
                    args.resume_checkpoint,
                    model,
                    optimizer=None,
                    strict=(not args.resume_non_strict))

    # save dataset statistics
    if args.local_rank == 0:
        train_dataset.save_statistics(save_dir)
        val_dataset.save_statistics(save_dir)

    # main training loop
    avg_meters = {
        'kl_avg_meter': AverageValueMeter(),
        'l2_avg_meter': AverageValueMeter()
    }

    assert args.distributed

    epoch = start_epoch
    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs):
        if args.local_rank == 0:
            # evaluate on the validation set
            if epoch % args.val_freq == 0 and epoch != 0:
                model.eval()
                with torch.no_grad():
                    val_res = validate(model.module, args, val_loader, epoch,
                                       logger, save_dir)
                    for k, v in val_res.items():
                        v = v.cpu().detach().item()
                        send_slack(f'{k}:{v}, Epoch {epoch - 1}')
                        if logger is not None and v is not None:
                            logger.add_scalar(f'val_sample/{k}', v, epoch - 1)

        # train for one epoch
        train_one_epoch(epoch, model, criterion, optimizer, args, train_loader,
                        avg_meters, logger)

        # Only on the rank-0 (main) process
        if args.local_rank == 0:
            # save checkpoints
            if (epoch + 1) % args.save_freq == 0:
                if args.eval:
                    validate_reconstruct_l2(epoch, val_loader, model,
                                            criterion, args, logger)
                save(model.module, model.optimizer, model.lr_scheduler,
                     epoch + 1,
                     Path(save_dir) / f'checkpoint-{epoch}.pt')
                save(model.module, model.optimizer, model.lr_scheduler,
                     epoch + 1,
                     Path(save_dir) / 'checkpoint-latest.pt')

            # save visualizations
            if (epoch + 1) % args.viz_freq == 0:
                with torch.no_grad():
                    visualize(model.module, args, val_loader, epoch, logger)

        # adjust the learning rate
        model.lr_scheduler.step()
        if logger is not None and args.local_rank == 0:
            logger.add_scalar('train lr',
                              model.lr_scheduler.get_last_lr()[0], epoch)

    model.eval()
    if args.local_rank == 0:
        with torch.no_grad():
            val_res = validate(model.module, args, val_loader, epoch, logger,
                               save_dir)
            for k, v in val_res.items():
                v = v.cpu().detach().item()
                send_slack(f'{k}:{v}, Epoch {epoch}')
                if logger is not None and v is not None:
                    logger.add_scalar(f'val_sample/{k}', v, epoch)

    if logger is not None:
        logger.flush()
        logger.close()
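The 'linear' branch above combines a linear warmup with a linear decay over the second half of training. A standalone sketch of the same LambdaLR factor, with illustrative epoch counts and a placeholder model:

import torch

warmup_epochs, epochs = 5, 100                    # illustrative values

def lambda_rule(ep):
    warm = min(1.0, ep / warmup_epochs) if warmup_epochs > 0 else 1.0
    decay = 1.0 - max(0, ep - 0.5 * epochs) / (0.5 * epochs)
    return warm * decay

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
for epoch in range(epochs):
    optimizer.step()                              # training step(s) would go here
    scheduler.step()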
def main_worker(gpu, save_dir, args):
    # basic setup
    cudnn.benchmark = True
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    model = HyperRegression(args)

    torch.cuda.set_device(args.gpu)
    model = model.cuda(args.gpu)
    start_epoch = 0
    optimizer = model.make_optimizer(args)
    if args.resume_checkpoint is None and os.path.exists(
            os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
    if args.resume_checkpoint is not None:
        if args.resume_optimizer:
            model, optimizer, start_epoch = resume(
                args.resume_checkpoint,
                model,
                optimizer,
                strict=(not args.resume_non_strict))
        else:
            model, _, start_epoch = resume(args.resume_checkpoint,
                                           model,
                                           optimizer=None,
                                           strict=(not args.resume_non_strict))
        print('Resumed from: ' + args.resume_checkpoint)

    # main training loop
    start_time = time.time()
    point_nats_avg_meter = AverageValueMeter()
    if args.distributed:
        print("[Rank %d] World size : %d" % (args.rank, dist.get_world_size()))

    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs):
        print("Epoch starts:")
        data = ExampleData()
        train_loader = torch.utils.data.DataLoader(dataset=data,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=0,
                                                   pin_memory=True)
        for bidx, data in enumerate(train_loader):
            x, y = data
            x = x.float().to(args.gpu).unsqueeze(1)
            y = y.float().to(args.gpu).unsqueeze(1).unsqueeze(2)
            step = bidx + len(train_loader) * epoch
            model.train()
            recon_nats = model(x, y, optimizer, step, None)
            point_nats_avg_meter.update(recon_nats.item())
            if step % args.log_freq == 0:
                duration = time.time() - start_time
                start_time = time.time()
                print(
                    "[Rank %d] Epoch %d Batch [%2d/%2d] Time [%3.2fs] PointNats %2.5f"
                    % (args.rank, epoch, bidx, len(train_loader), duration,
                       point_nats_avg_meter.avg))
        # save visualizations
        kk = 3
        if (epoch + 1) % args.viz_freq == 0:
            # reconstructions
            model.eval()
            x = torch.from_numpy(np.linspace(0, kk, num=100)).float().to(
                args.gpu).unsqueeze(1)
            _, y = model.decode(x, 100)
            x = x.cpu().detach().numpy()
            y = y.cpu().detach().numpy()
            x = np.expand_dims(x, 1).repeat(100, axis=1).flatten()
            y = y.flatten()
            figs, axs = plt.subplots(1, 1, figsize=(12, 12))
            plt.xlim([0, kk])
            plt.ylim([-2, 2])
            plt.scatter(x, y)
            plt.savefig(
                os.path.join(
                    save_dir, 'images',
                    'tr_vis_sampled_epoch%d-gpu%s.png' % (epoch, args.gpu)))
            plt.clf()
        if (epoch + 1) % args.save_freq == 0:
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-latest.pt'))
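The save() calls above write both a per-epoch checkpoint and a rolling 'checkpoint-latest.pt'. A minimal sketch of a matching helper; the keys in the saved dict are assumptions, not the project's actual checkpoint format.

import torch

def save(model, optimizer, epoch, path):
    # Write a self-contained checkpoint; key names are illustrative.
    torch.save({'epoch': epoch,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict()}, path)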
Example #21
def main_worker(gpu, save_dir, ngpus_per_node, args):
    # basic setup
    cudnn.benchmark = True
    normalize = False
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    model = HyperRegression(args)

    torch.cuda.set_device(args.gpu)
    model = model.cuda(args.gpu)
    start_epoch = 0
    optimizer = model.make_optimizer(args)
    if args.resume_checkpoint is None and os.path.exists(
            os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
    if args.resume_checkpoint is not None:
        if args.resume_optimizer:
            model, optimizer, start_epoch = resume(
                args.resume_checkpoint,
                model,
                optimizer,
                strict=(not args.resume_non_strict))
        else:
            model, _, start_epoch = resume(args.resume_checkpoint,
                                           model,
                                           optimizer=None,
                                           strict=(not args.resume_non_strict))
        print('Resumed from: ' + args.resume_checkpoint)

    # initialize datasets and loaders

    # initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=args.epochs // 2,
                                              gamma=0.1)
    elif args.scheduler == 'linear':

        def lambda_rule(ep):
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(
                0.5 * args.epochs)
            return lr_l

        scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                lr_lambda=lambda_rule)
    else:
        assert 0, "args.scheduler should be 'exponential', 'step', or 'linear'"

    # main training loop
    start_time = time.time()
    entropy_avg_meter = AverageValueMeter()
    latent_nats_avg_meter = AverageValueMeter()
    point_nats_avg_meter = AverageValueMeter()
    if args.distributed:
        print("[Rank %d] World size : %d" % (args.rank, dist.get_world_size()))

    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    data = SDDData(split='train', normalize=normalize, root=args.data_dir)
    data_test = SDDData(split='test', normalize=normalize, root=args.data_dir)
    train_loader = torch.utils.data.DataLoader(dataset=data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)
    test_loader = torch.utils.data.DataLoader(dataset=data_test,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True)
    for epoch in range(start_epoch, args.epochs):
        # adjust the learning rate
        if (epoch + 1) % args.exp_decay_freq == 0:
            scheduler.step(epoch=epoch)

        # train for one epoch
        print("Epoch starts:")
        for bidx, data in enumerate(train_loader):
            # if bidx < 2:
            x, y = data
            #y = y.float().to(args.gpu).unsqueeze(1).repeat(1, 10).unsqueeze(2)
            x = x.float().to(args.gpu)
            y = y.float().to(args.gpu).unsqueeze(1)
            y = y.repeat(1, 20, 1)
            y += torch.randn(y.shape[0], y.shape[1], y.shape[2]).to(args.gpu)
            step = bidx + len(train_loader) * epoch
            model.train()
            recon_nats = model(x, y, optimizer, step, None)
            point_nats_avg_meter.update(recon_nats.item())
            if step % args.log_freq == 0:
                duration = time.time() - start_time
                start_time = time.time()
                print(
                    "[Rank %d] Epoch %d Batch [%2d/%2d] Time [%3.2fs] PointNats %2.5f"
                    % (args.rank, epoch, bidx, len(train_loader), duration,
                       point_nats_avg_meter.avg))
                # print("Memory")
                # print(process.memory_info().rss / (1024.0 ** 3))
        # save visualizations
        if (epoch + 1) % args.viz_freq == 0:
            # reconstructions
            model.eval()
            for bidx, data in enumerate(test_loader):
                x, _ = data
                x = x.float().to(args.gpu)
                _, y_pred = model.decode(x, 100)
                y_pred = y_pred.cpu().detach().numpy().squeeze()
                # y_pred[y_pred < 0] = 0
                # y_pred[y_pred >= 0.98] = 0.98
                testing_sequence = data_test.dataset.scenes[
                    data_test.test_id].sequences[bidx]
                objects_list = []
                for k in range(3):
                    objects_list.append(
                        decode_obj(testing_sequence.objects[k],
                                   testing_sequence.id))
                objects = np.stack(objects_list, axis=0)
                gt_object = decode_obj(testing_sequence.objects[-1],
                                       testing_sequence.id)
                drawn_img_hyps = draw_hyps(testing_sequence.imgs[-1], y_pred,
                                           gt_object, objects, normalize)
                cv2.imwrite(
                    os.path.join(save_dir, 'images',
                                 str(bidx) + '-' + str(epoch) + '-hyps.jpg'),
                    drawn_img_hyps)
        if (epoch + 1) % args.save_freq == 0:
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-latest.pt'))
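Both main_worker variants forward strict=(not args.resume_non_strict) into resume(). The strict flag ultimately controls load_state_dict; here is a minimal sketch of a tolerant load under that assumption (checkpoint key names are illustrative):

import torch

def load_weights(model, path, strict=True):
    ckpt = torch.load(path, map_location='cpu')
    state = ckpt.get('model', ckpt)               # tolerate raw state_dict files
    missing, unexpected = model.load_state_dict(state, strict=strict)
    if not strict and (missing or unexpected):
        print('missing keys:', missing)
        print('unexpected keys:', unexpected)
    return model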
Example #22
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 
                            lr=POPEN.lr,
                            betas=(0.9, 0.98), 
                            eps=1e-09, 
                            weight_decay=POPEN.l2)
if POPEN.loss_schema == 'DTP':
    POPEN.loss_schedualer = Dynamic_Task_Priority(POPEN.tasks,POPEN.gamma,POPEN.chimerla_weight)
elif POPEN.loss_schema == 'DWA':
    POPEN.loss_schedualer = Dynamic_Weight_Averaging(POPEN.tasks,POPEN.tau,POPEN.chimerla_weight)
# =========== resume ===========
best_loss = np.inf
best_acc = 0
best_epoch = 0
previous_epoch = 0
if POPEN.Resumable:
    previous_epoch,best_loss,best_acc = utils.resume(POPEN, optimizer,logger)
    

#                               |=====================================|
#                               |==========  training  part ==========|
#                               |=====================================|
for epoch in range(POPEN.max_epoch-previous_epoch+1):
    epoch += previous_epoch
    
    #          
    logger.info("===============================|    epoch {}   |===============================".format(epoch))


    train_val.iter_train(loader_set,model=model,optimizer=optimizer,popen=POPEN,epoch=epoch)

#              -----------| validate |-----------   
def train_model(args_dict):

    # Define model
    if args_dict.model == "covidnet":
        model = CovidNet(args_dict.n_classes)
    elif args_dict.model == "resnet":
        model = ResNet(args_dict.n_classes)

    print("model selected: {}".format(args_dict.model))

    model.to(args_dict.device)

    # Loss and optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args_dict.lr)
    criterion = nn.CrossEntropyLoss(
        weight=torch.Tensor(args_dict.class_weights).to(args_dict.device))

    # Resume training if needed
    best_sensit, model, optimizer = utils.resume(args_dict, model, optimizer)
    scheduler = ReduceLROnPlateau(optimizer,
                                  factor=args_dict.factor,
                                  patience=args_dict.patience,
                                  verbose=True)

    # Load data
    dl_non_covid, dl_covid = calculateDataLoaderTrain(args_dict)

    # Data loading for test
    dl_test = eval.calculateDataLoaderTest(args_dict)

    # Now, let's start the training process!
    print('Start training...')
    pat_track = 0
    for epoch in range(args_dict.epochs):

        # Compute a training epoch
        trainEpoch(args_dict, dl_non_covid, dl_covid, model, criterion,
                   optimizer, epoch)

        # Compute a validation epoch
        sensitivity_covid, accuracy = eval.valEpoch(args_dict, dl_test, model)

        scheduler.step(accuracy)

        # save if it is the best model
        if accuracy >= 0.80:  # only compare sensitivity if we have a minimum accuracy of 0.8
            is_best = sensitivity_covid > best_sensit
            if is_best:
                print("BEST MODEL FOUND!")
                best_sensit = max(sensitivity_covid, best_sensit)
                utils.save_model(
                    args_dict,
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'best_sensit': best_sensit,
                        'optimizer': optimizer.state_dict(),
                        'valtrack': pat_track,
                        # 'freeVision': args_dict.freeVision,
                        'curr_val': accuracy,
                    })
        print(
            '** Validation: %f (best_sensitivity) - %f (current acc) - %d (patience)'
            % (best_sensit, accuracy, pat_track))

        # Plot
        plotter.plot('Sensitivity', 'test', 'sensitivity covid', epoch,
                     sensitivity_covid)
        plotter.plot('Accuracy', 'test', 'Accuracy', epoch, accuracy)
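train_model() above steps ReduceLROnPlateau on the validation accuracy. A standalone sketch of that pattern; mode='max' reflects an accuracy-like metric, and the model and metric values are illustrative:

import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

model = torch.nn.Linear(8, 3)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

for epoch, val_acc in enumerate([0.70, 0.72, 0.72, 0.71, 0.73]):
    scheduler.step(val_acc)   # LR is reduced once the metric stops improving for 'patience' epochs
    print(epoch, optimizer.param_groups[0]['lr'])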
Example #24
def main():
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    # torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %s' % args.gpu)
    logging.info("args = %s", args)

    genotype = eval("genotypes.%s" % args.arch)
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers,
                    args.auxiliary, genotype)
    model.drop_path_prob = args.drop_path_prob
    model = model.cuda()
    model = torch.nn.DataParallel(model)
    if args.resume != "":
        start = utils.resume(model, os.path.join(args.resume, 'weights.pt'))
    else:
        start = 0

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    train_data = dset.CIFAR10(root=args.data,
                              train=True,
                              download=True,
                              transform=train_transform)
    valid_data = dset.CIFAR10(root=args.data,
                              train=False,
                              download=True,
                              transform=valid_transform)

    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              pin_memory=True,
                                              num_workers=2)

    valid_queue = torch.utils.data.DataLoader(valid_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=2)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs))

    valid_acc_best = 0.

    for epoch in range(start):
        scheduler.step()

    for epoch in range(start, args.epochs):
        scheduler.step()
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        model.drop_path_prob = args.drop_path_prob * epoch / args.epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer)
        logging.info('train_acc %f', train_acc)

        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info('valid_acc %f', valid_acc)
        if valid_acc > valid_acc_best:
            utils.save(model, epoch, os.path.join(args.save, 'weights.pt'))
            valid_acc_best = valid_acc
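Example #24 fast-forwards the cosine scheduler after resuming by calling step() once per completed epoch. A commonly used alternative, sketched here under the assumption that 'start' comes from the checkpoint as above, is to rebuild the scheduler with last_epoch:

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
start, epochs = 10, 50                            # illustrative resume point and horizon

# A scheduler created with last_epoch != -1 expects 'initial_lr' in each param group.
for group in optimizer.param_groups:
    group.setdefault('initial_lr', group['lr'])
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=float(epochs), last_epoch=start - 1)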
Example #25
def plot_calibration(args_dict):
    args_dict.resume = True

    dl_test = eval.calculateDataLoaderTest(args_dict)

    # Define model
    if args_dict.model == "covidnet":
        model_normal = CovidNet(args_dict.n_classes)

    elif args_dict.model == "resnet":
        model_normal = ResNet(args_dict.n_classes)

    # Set up device
    if torch.cuda.is_available():
        args_dict.device = torch.device(
            "cuda:0")  # use "cuda:1", "cuda:2", ... to select a different GPU
    else:
        args_dict.device = torch.device("cpu")

    # load normal model
    model_normal.to(args_dict.device)
    optimizer = torch.optim.Adam(model_normal.parameters(), lr=args_dict.lr)
    _, model_normal, _ = utils.resume(args_dict, model_normal, optimizer)

    # load calibrated model
    model_calib = ModelWithTemperature(model_normal)
    calib_model_path = args_dict.dir_model + "calibrated_" + args_dict.model + '_best_model.pth.tar'
    checkpoint_calib = torch.load(calib_model_path,
                                  map_location=torch.device(args_dict.device))
    model_calib.load_state_dict(checkpoint_calib['state_dict'])

    print("Calculating probabilities for test set...")
    probs_normal, y_true = eval.valEpoch(args_dict,
                                         dl_test,
                                         model_normal,
                                         calibration=True)
    probs_normal = softmax(probs_normal, axis=1)

    probs_calib, y_true = eval.valEpoch(args_dict,
                                        dl_test,
                                        model_calib,
                                        calibration=True)
    probs_calib = softmax(probs_calib, axis=1)

    print("calibration graph...")
    idx2class = {0: 'normal', 1: 'pneumonia', 2: 'COVID19'}

    fig, axs = plt.subplots(1, args_dict.n_classes, figsize=(15, 5))
    for idx_class in range(args_dict.n_classes):
        y_class = y_true == idx_class

        # reliability diagram
        fop_normal, mpv_normal = calibration_curve(y_class,
                                                   probs_normal[:, idx_class])
        fop_calib, mpv_calib = calibration_curve(y_class,
                                                 probs_calib[:, idx_class])
        # plot perfectly calibrated
        axs[idx_class].plot([0, 1], [0, 1], linestyle='--')
        # plot calibrated reliability
        axs[idx_class].plot(mpv_calib,
                            fop_calib,
                            marker='.',
                            label='calibrated')
        axs[idx_class].plot(mpv_normal, fop_normal, marker='*', label='normal')
        axs[idx_class].set(xlabel='confidence')
        axs[idx_class].set(ylabel='accuracy')
        # title
        axs[idx_class].set_title(idx2class[idx_class])

    for ax in axs.flat:
        ax.label_outer()

    fig.autofmt_xdate()
    plt.subplots_adjust(wspace=0.1)
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower center', ncol=args_dict.n_classes)
    fig.savefig('calibration_' + args_dict.model + '.png')