Example no. 1
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler


def generate_train_val_dataloader(dataset,
                                  batch_size,
                                  num_workers,
                                  shuffle=True,
                                  split=0.9,
                                  use_fraction_of_data=1.):
    """
    return two Dataloaders split into training and validation
    `split` sets the train/val split fraction (0.9 is 90 % training data)
    u
    """
    ## this is a testing feature to make epochs go faster, uses only some of the available data
    if use_fraction_of_data < 1.:
        n_samples = int(use_fraction_of_data * len(dataset))
    else:
        n_samples = len(dataset)
    inds = np.arange(n_samples)
    train_inds, val_inds = train_test_split(inds,
                                            test_size=1 - split,
                                            train_size=split)

    # SubsetRandomSampler already shuffles within its subset, and DataLoader
    # rejects a sampler combined with shuffle=True, so `shuffle` is not
    # forwarded to the loaders here.
    train_loader = DataLoader(dataset,
                              sampler=SubsetRandomSampler(train_inds),
                              batch_size=batch_size,
                              num_workers=num_workers)
    val_loader = DataLoader(dataset,
                            sampler=SubsetRandomSampler(val_inds),
                            batch_size=batch_size,
                            num_workers=num_workers)
    return train_loader, val_loader
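A minimal usage sketch for the helper above, using a synthetic TensorDataset in place of a real dataset (the tensor shapes and hyperparameters below are arbitrary, not taken from the original project):

import torch
from torch.utils.data import TensorDataset

# 1000 fake samples with 10 features each, plus binary labels
dummy = TensorDataset(torch.randn(1000, 10), torch.randint(0, 2, (1000,)))
train_loader, val_loader = generate_train_val_dataloader(dummy,
                                                         batch_size=32,
                                                         num_workers=0,
                                                         split=0.8)
for x, y in train_loader:
    print(x.shape, y.shape)  # first training batch: torch.Size([32, 10]) torch.Size([32])
    break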
Example no. 2
def generate_train_val_dataloader(dataset,
                                  batch_size,
                                  num_workers,
                                  shuffle=False,
                                  split=0.8,
                                  fraction_of_data=1.,
                                  train_inds=None,
                                  val_inds=None):
    """
    return two Data`s split into training and validation
    `split` sets the train/val split fraction (0.9 is 90 % training data)
    u
    """
    if train_inds is None:
        inds = np.arange(len(dataset))
        inds = inds[:int(np.ceil(len(inds) * fraction_of_data))]
        if fraction_of_data < 1:
            print('using ' + str(len(inds)) + ' data points total')
        train_inds, val_inds = train_test_split(inds, test_size=1 - split, train_size=split)

    train_loader = DataLoader(
        dataset,
        sampler=SubsetRandomSampler(train_inds),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )
    val_loader = DataLoader(
        dataset,
        sampler=SubsetRandomSampler(val_inds),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )
    return train_loader, val_loader, train_inds, val_inds
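One reason this variant also returns the index arrays: the same split can be reused on a second dataset so both stay aligned sample-for-sample. A sketch with synthetic data (the two datasets below are stand-ins, not from the original project):

import torch
from torch.utils.data import TensorDataset

features = torch.randn(500, 10)
labels = torch.randint(0, 2, (500,))
dataset_a = TensorDataset(features, labels)
dataset_b = TensorDataset(features * 2, labels)  # stand-in for a differently transformed view

# First call computes and returns the split indices.
loader_a_train, loader_a_val, tr_inds, va_inds = generate_train_val_dataloader(
    dataset_a, batch_size=32, num_workers=0)
# Second call reuses those indices, so both datasets get an identical split.
loader_b_train, loader_b_val, _, _ = generate_train_val_dataloader(
    dataset_b, batch_size=32, num_workers=0, train_inds=tr_inds, val_inds=va_inds)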
Example no. 3
def plot_accuracy(train_acc, val_acc, num_epochs, figsize=(8, 6)):
    """Plot training and validation score curves over epochs and return the figure."""
    fig = plt.figure(figsize=figsize)
    # train_acc may hold several values per epoch (e.g. one per batch), so
    # spread them evenly along the epoch axis
    t = np.linspace(0, num_epochs, len(train_acc))
    plt.plot(t, train_acc, label="Training")
    # val_acc is expected to hold one value per epoch
    t = np.arange(num_epochs) + 1
    plt.plot(t, val_acc, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("F2 Score")
    plt.legend()
    return fig
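A quick sketch of how the plotting helper might be called; the score values are random placeholders:

import numpy as np
import matplotlib.pyplot as plt

num_epochs = 3
train_scores = np.random.rand(30)        # e.g. ten batch-level scores per epoch
val_scores = np.random.rand(num_epochs)  # one validation score per epoch
fig = plot_accuracy(train_scores, val_scores, num_epochs)
fig.savefig('f2_curves.png')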
Example no. 4
def triple_train_val_balance_dataloaders(datasets,
                                         batch_size,
                                         num_workers,
                                         shuffle=True,
                                         split=0.9,
                                         use_fraction_of_data=1.):
    """
    generate three training and three validation dataloaders
    to train triple resnet
    """
    n_samples = len(datasets[0])
    ## set up train val split
    inds = np.arange(n_samples)
    train_inds, val_inds = train_test_split(inds,
                                            test_size=1 - split,
                                            train_size=split)
    ## logical indexing to use with BalanceSampler
    log_train_inds = np.zeros(n_samples)
    log_train_inds[train_inds] = 1
    log_val_inds = np.zeros(n_samples)
    log_val_inds[val_inds] = 1
    ## reduce the size of your dataset (use for testing only)
    if use_fraction_of_data < 1:
        train_idx = int(np.ceil(use_fraction_of_data * split * n_samples))
        val_idx = int(np.ceil(use_fraction_of_data * (1 - split) * n_samples))
        log_train_inds[train_idx:] = 0
        log_val_inds[val_idx:] = 0

    train_loaders = []
    val_loaders = []

    for dset in datasets:
        train_loaders.append(
            BalanceDataLoader(dset,
                              sampler=BalanceSampler(dset, log_train_inds),
                              batch_size=batch_size,
                              shuffle=shuffle,
                              num_workers=num_workers))
        val_loaders.append(
            BalanceDataLoader(dset,
                              sampler=BalanceSampler(dset, log_val_inds),
                              batch_size=batch_size,
                              shuffle=shuffle,
                              num_workers=num_workers))

    return train_loaders, val_loaders
Example no. 5
def triple_train_val_dataloaders(datasets,
                                 batch_size,
                                 num_workers,
                                 shuffle=True,
                                 split=0.9,
                                 use_fraction_of_data=1.):
    """
    generate three training and three validation dataloaders
    to train triple resnet
    """
    ## this is a testing feature to make epochs go faster, uses only some of the available data
    if use_fraction_of_data < 1.:
        n_samples = int(use_fraction_of_data * len(datasets[0]))
    else:
        n_samples = len(datasets[0])
    inds = np.arange(n_samples)
    train_inds, val_inds = train_test_split(inds,
                                            test_size=1 - split,
                                            train_size=split)

    train_loaders = []
    val_loaders = []

    for dset in datasets:
        # SubsetRandomSampler already shuffles each subset, and DataLoader
        # rejects a sampler combined with shuffle=True, so `shuffle` is not
        # forwarded to the loaders here.
        train_loaders.append(
            DataLoader(dset,
                       sampler=SubsetRandomSampler(train_inds),
                       batch_size=batch_size,
                       num_workers=num_workers))
        val_loaders.append(
            DataLoader(dset,
                       sampler=SubsetRandomSampler(val_inds),
                       batch_size=batch_size,
                       num_workers=num_workers))

    return train_loaders, val_loaders
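Note that one index split is shared by all three loaders, so the datasets are assumed to have the same length and ordering. A runnable sketch with synthetic stand-in datasets (shapes and sizes are arbitrary):

import torch
from torch.utils.data import TensorDataset

labels = torch.randint(0, 2, (300,))
datasets = [TensorDataset(torch.randn(300, 8), labels) for _ in range(3)]
train_loaders, val_loaders = triple_train_val_dataloaders(datasets,
                                                           batch_size=16,
                                                           num_workers=0,
                                                           split=0.9)
print(len(train_loaders), len(val_loaders))  # 3 3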
Example no. 6
def run(args):
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # Load data
    train_data, test_data, train_mask, test_mask, user_list = load_data(random_split=True)
    # train_data, test_data, train_mask, test_mask, user_list = load_toy_data()

    # Params
    # n_bins = 288
    n_samples, n_bins, n_mods = train_data.shape
    n_features = n_bins * n_mods
    # n_mods = n_features // n_bins
    modalities = ['cpm', 'steps', 'screen', 'location_lat', 'location_lon'][:n_mods]
    num_train = train_data.shape[0] // args.batch_size
    num_test = test_data.shape[0] // args.batch_size

    # Convert to torch tensor
    train_data = torch.from_numpy(train_data)
    test_data = torch.from_numpy(test_data)
    train_mask = torch.from_numpy(train_mask).float()
    test_mask = torch.from_numpy(test_mask).float()

    def get_batch(source, mask, i, evaluation=False):
        data = Variable(source[i * args.batch_size:(i + 1) * args.batch_size], volatile=evaluation)
        _mask = Variable(mask[i * args.batch_size:(i + 1) * args.batch_size], volatile=evaluation)
        return data, _mask

    if args.model.lower() == 'vae':
        model = VAE(args.layers, input_dim=n_features, args=args)
    elif args.model.lower() == 'rae':
        model = CRAE(args.layers, input_dim=n_features, args=args)
    elif args.model.lower() == 'unet':
        model = SUnet(args.layers, input_dim=n_features, args=args)
    elif args.model.lower() == 'avb':
        model = AVB(args.layers, input_dim=n_features, args=args)
    else:
        model = SDAE(args.layers, input_dim=n_features, args=args)
    print(model)

    def train(epoch):
        model.train()
        train_loss = 0
        for batch_idx in range(num_train):
            data, mask = get_batch(train_data, train_mask, batch_idx, evaluation=False)

            if args.cuda:
                data = data.cuda()

            # Run model updates and collect loss
            loss = model.forward(data, mask)
            train_loss += loss

            if batch_idx % args.log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_data),
                           100. * batch_idx / num_train,
                           loss / len(data)))

        print('====> Epoch: {} Average loss: {:.6f}'.format(
            epoch, train_loss / len(train_data)))
        return train_loss / len(train_data)

    def test(epoch):
        model.eval()
        test_loss = 0
        for batch_idx in range(num_test):
            data, mask = get_batch(test_data, test_mask, batch_idx, evaluation=True)
            if args.cuda:
                data = data.cuda()

            # Evaluate batch on model
            test_loss += model.eval_loss(data, mask)

        test_loss /= len(test_data)
        print('====> Test set loss: {:.6f}'.format(test_loss))
        return test_loss

    train_loss = list()
    test_loss = list()
    for epoch in range(1, args.epochs + 1):
        train_loss.append(train(epoch))
        test_loss.append(test(epoch))

    # Plot result
    test_batch, test_mask_batch = get_batch(test_data, test_mask, 0, evaluation=True)

    if 'vae' in args.model.lower():
        recon_batch, mu, logvar, noise = model(test_batch, test_mask_batch)
    else:
        recon_batch, noise = model(test_batch, test_mask_batch)

    # Mask out known values
    test_batch = test_batch * test_mask_batch
    recon_batch = recon_batch * test_mask_batch  # * (1 - noise)

    test_batch = test_batch.data.numpy().reshape(-1, n_bins, n_mods)
    recon_batch = recon_batch.data.numpy().reshape(-1, n_bins, n_mods)

    # fig, ax = plt.subplots(nrows=2, ncols=n_mods, figsize=(10 * n_mods, 20))
    # for i, mod in enumerate(modalities):
    #     vmax = np.max((test_batch[:, :, i].max(), recon_batch[:, :, i].max()))
    #     sns.heatmap(test_batch[:, :, i], ax=ax[0, i], vmin=0, vmax=vmax)
    #     sns.heatmap(recon_batch[:, :, i], ax=ax[1, i], vmin=0, vmax=vmax)
    # plt.savefig('{}_recon_heatmap'.format(args.model))
    #
    # # Plot error curves
    # fig, ax = plt.subplots(figsize=(20, 10))
    # ax.plot(range(args.epochs - 1), train_loss[1:], label='train')
    # ax.plot(range(args.epochs - 1), test_loss[1:], label='test')
    # plt.savefig('{}_error'.format(args.model))

    # Create a visdom object
    vis = Visdom(env=args.model)

    # Heatmap
    for i, mod in enumerate(modalities):
        vmax = np.max((test_batch[:, :, i].max(), recon_batch[:, :, i].max()))
        vis.heatmap(test_batch[:, :, i],
                    opts=dict(colormap='Electric', title='true_' + mod, xmin=0, xmax=float(vmax)))
        vis.heatmap(recon_batch[:, :, i],
                    opts=dict(colormap='Electric', title='recon_' + mod, xmin=0, xmax=float(vmax)))
    vis.heatmap(((1 - noise) * test_mask_batch)[:, :, 0].data.numpy(), opts=dict(title='mask'))

    # Errors
    vis.line(np.stack((train_loss[1:], test_loss[1:]), axis=1),
             np.tile(np.arange(args.epochs - 1), (2, 1)).transpose(),
             opts=dict(legend=['train', 'test']))

    return train_loss[-1], test_loss[-1]
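For reference, run() reads only a handful of fields from args (seed, cuda, batch_size, model, layers, epochs, log_interval). A minimal argparse sketch covering them; the flag names, defaults, and the assumed format of --layers are illustrative, not taken from the original script:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', default='vae')  # 'vae', 'rae', 'unet', 'avb'; anything else falls back to SDAE
parser.add_argument('--layers', type=int, nargs='+', default=[512, 128])  # hidden sizes (assumed format)
parser.add_argument('--batch-size', type=int, default=64)
parser.add_argument('--epochs', type=int, default=20)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--cuda', action='store_true')
parser.add_argument('--log-interval', type=int, default=10)
args = parser.parse_args()
# train_loss, test_loss = run(args)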
Example no. 7
                              sampler=BalanceSampler(dset, log_val_inds),
                              batch_size=batch_size,
                              shuffle=shuffle,
                              num_workers=num_workers))

    return train_loaders, val_loaders


if __name__ == '__main__':

    from balance_batch_dataloader import *

    csv_path = 'data/train_v2.csv'
    img_path = 'data/train-jpg'
    img_ext = '.jpg'
    dtype = torch.FloatTensor
    training_dataset = ResnetOptimizeDataset(csv_path, img_path, dtype)
    # restrict sampling to the first 10,000 images via a logical index mask
    inds = np.arange(10000)
    logical_inds = np.zeros(len(training_dataset))
    logical_inds[inds] = 1
    bbs = BalanceSampler(training_dataset, logical_inds)
    print(len(bbs))
    train_loader = BalanceDataLoader(training_dataset,
                                     sampler=bbs,
                                     batch_size=32,
                                     num_workers=1)
    # sanity check: print which label columns appear in the first balanced batch
    for t, (x, y) in enumerate(train_loader):
        col_sum = y.sum(dim=0).numpy().flatten()
        print(col_sum > 0)
        break