Example #1
def get_video_dataloader(mode='train',
                         videos_path=os.environ['HOME'] +
                         '/Database/MSR-VTT/train-video/',
                         vocab_path='data/processed/msrvtt_vocab.pkl',
                         captions_path='data/processed/msrvtt_captions.csv',
                         batch_size=32,
                         num_frames=40,
                         max_len=30,
                         embedding_size=2048,
                         num_captions=20,
                         load_features=False,
                         load_captions=False,
                         preload=False,
                         model='resnet152',
                         num_workers=0):
    """
    Generate a dataloader with the specified parameters.

    Args:
        mode: Dataset split to load ('train', 'dev', or 'test')
        videos_path: Path to MSR-VTT videos dataset
        vocab_path: Path to MSR-VTT vocab file
        captions_path: Path to MSR-VTT captions file
        batch_size: Batch size for Dataloader
        num_frames: Number of frames per video to process
        max_len: Max caption length
        embedding_size: Size of image embedding
        num_captions: Number of captions per video in dataset
        load_features: Whether to load image features instead of creating them
        load_captions: Whether to load captions instead of creating them
        preload: Boolean for preloading data into RAM during construction
        model: Base model for the CNN encoder
        num_workers: Dataloader parameter

    Returns:
        data_loader: A torch dataloader for the MSR-VTT dataset

    """
    # Ensure the specified mode is valid
    try:
        assert mode in ['train', 'dev', 'test']
    except AssertionError:
        print('Invalid mode specified: {}'.format(mode))
        print(' Defaulting to dev mode')
        mode = 'dev'

    # Build dataset
    data = VideoDataset(mode, videos_path, vocab_path, captions_path,
                        batch_size, num_frames, max_len, embedding_size,
                        num_captions, load_features, load_captions, preload,
                        model)

    if mode == 'train':
        # Get all possible video indices
        indices = data.get_indices()

        # Initialize a sampler for the indices
        init_sampler = sampler.SubsetRandomSampler(indices=indices)

        # Create data loader with dataset and sampler
        data_loader = DataLoader(dataset=data,
                                 num_workers=num_workers,
                                 batch_sampler=sampler.BatchSampler(
                                     sampler=init_sampler,
                                     batch_size=batch_size,
                                     drop_last=False))
    else:
        data_loader = DataLoader(dataset=data,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 num_workers=num_workers)
    return data_loader
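
The train branch above wires a SubsetRandomSampler into a BatchSampler. Below is a minimal, self-contained sketch of that same pattern (not from the original example), using a toy TensorDataset in place of the MSR-VTT VideoDataset assumed above:

import torch
from torch.utils.data import DataLoader, TensorDataset, sampler

# Toy stand-in for VideoDataset: 100 "videos" of 8 features each.
toy_data = TensorDataset(torch.randn(100, 8))

# Draw only from these indices, in a fresh random order every epoch.
subset = sampler.SubsetRandomSampler(list(range(80)))

# BatchSampler groups the drawn indices into batches of 32.
loader = DataLoader(dataset=toy_data,
                    batch_sampler=sampler.BatchSampler(sampler=subset,
                                                       batch_size=32,
                                                       drop_last=False))

for (batch,) in loader:
    print(batch.shape)  # torch.Size([32, 8]) twice, then torch.Size([16, 8])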
Example #2
def cifar10(datadir, greyscale=False, training_transforms=[], mode='train', transform=True, subset=None, **kwargs):

    assert mode in ['train', 'test', 'val']

    train = (mode == 'train')

    root = os.path.join(datadir,'cifar10')
    gtransf = [transforms.Grayscale()] if greyscale else []

    if mode in ['train','test']:
        if train and transform:

            tlist = [*gtransf,
                     transforms.RandomCrop(32,padding=4),
                     transforms.RandomHorizontalFlip(),
                    *training_transforms,
                     transforms.ToTensor()]
        else:
            tlist = [*gtransf,
                     transforms.ToTensor()]

        transform = transforms.Compose(tlist)

        ds = CIFAR10(root, download=True, train=train, transform=transform)
        sample_size = 50000 if train else 10000
    else:
        filename = 'cifar10.1'

        label_filename = 'cifar10.1_v6_labels.npy'
        imagedata_filename = 'cifar10.1_v6_data.npy'

        label_filepath = os.path.join(root, label_filename)
        imagedata_filepath = os.path.join(root, imagedata_filename)

        try:
            labels = np.load(label_filepath)
            data = np.load(imagedata_filepath)
        except FileNotFoundError as e:
            raise type(e)('Download CIFAR10.1 .npy files from https://github.com/modestyachts/CIFAR-10.1 '
                  'and place in %s'%root)

        ds = tnt.dataset.TensorDataset([data, labels])
        augment = transforms.ToTensor()
        ltrans  = lambda x: np.array(x, dtype=np.int_)
        ds = ds.transform({0:augment, 1:ltrans})
        sample_size=2000


    # check if we're looking at a range
    if isinstance(subset, range):
        indices = np.arange(subset.start, subset.stop)
    elif isinstance(subset, tuple) and len(subset)==2:
        indices = np.arange(subset[0], subset[1])
    elif isinstance(subset, np.ndarray):
        indices = subset
    elif isinstance(subset, float):
        if (subset > 0. and subset < 1.):
            num_samples = floor(subset  * sample_size)
            assert num_samples >0
            # replace=False keeps the chosen indices unique
            indices = np.random.choice(sample_size, num_samples, replace=False)
        else:
            raise ValueError('subset fraction must be between 0 and 1')
    elif subset is not None:
        raise ValueError('Invalid subset parameter.')

    if subset is not None:

        # according to Pytorch docs shuffle cannot be true if we are using a sampler
        # so we're going to turn it off in case that it's on
        kwargs['shuffle'] = False

        dataloader = th.utils.data.DataLoader(ds,
                sampler=sampler.SubsetRandomSampler(indices), **kwargs)
        dataloader.Nsamples = indices.size

    else:
        dataloader = th.utils.data.DataLoader(ds, **kwargs)
        if mode=='train':
            dataloader.Nsamples = 50000
        elif mode=='test':
            dataloader.Nsamples = 10000
        else:
            dataloader.Nsamples = 2000

    dataloader.classes = 10
    dataloader.image_shape = (3, 32, 32)


    return dataloader
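
As the comment inside the function above notes, DataLoader rejects shuffle=True when a sampler is supplied. A small sketch of that constraint on a toy dataset (not part of the original example):

import torch
from torch.utils.data import DataLoader, TensorDataset, sampler

ds = TensorDataset(torch.arange(10).float())
sub = sampler.SubsetRandomSampler(range(5))

ok_loader = DataLoader(ds, batch_size=2, sampler=sub)  # fine: the sampler decides the order
try:
    DataLoader(ds, batch_size=2, sampler=sub, shuffle=True)
except ValueError as err:
    print('sampler and shuffle are mutually exclusive:', err)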
Example #3
train_data = torchvision.datasets.FashionMNIST(
    root=
    r'C:\Users\Administrator\Anaconda3\Lib\site-packages\torchvision\datasets',
    download=True,
    train=True,
    transform=transform)
# The r prefix marks the path as a raw string, because \u would otherwise be treated as a special escape sequence by Python
test_data = torchvision.datasets.FashionMNIST(
    root=
    r'C:\Users\Administrator\Anaconda3\Lib\site-packages\torchvision\datasets',
    download=True,
    train=False,
    transform=transform)
# Wrap the data into batched loaders
Loader_train = Dataloader(dataset=train_data,
                          batch_size=HP.Minibatch,
                          sampler=sampler.SubsetRandomSampler(range(500)))
Loader_val = Dataloader(dataset=train_data,
                        batch_size=HP.Minibatch,
                        sampler=sampler.SubsetRandomSampler(range(500, 700)))
Loader_test = Dataloader(dataset=test_data,
                         batch_size=HP.Minibatch,
                         sampler=sampler.SubsetRandomSampler(range(500)))

# Train the model
cnnmodel = CNN_model()
fit = Fit(model=cnnmodel.model,
          HP=HP,
          train_data=Loader_train,
          val_data=Loader_val)
fit.train()
fit.predict(mode='test', loader=Loader_test)
Example #4
parser = argparse.ArgumentParser(description='pytorch Dog VS Cat')
parser.add_argument('--mode', type=str, default='normal',  choices=['normal', 'transfer'], metavar='M',
                    help='use transfer learning or not (default: normal)')
parser.add_argument('--data', type=str, default='', metavar='D',
                    help='specify the data folder that contains both training set and test set')

args = parser.parse_args()
mode = args.mode
folder_path = args.data

total_size = 20000
val_size = 2000
train_size = total_size - val_size
test_size = 4000

train_sampler = sampler.SubsetRandomSampler(range(train_size))
val_sampler = sampler.SubsetRandomSampler(range(val_size))

train_path = folder_path + '/dog-training/*.tif'
test_path = folder_path + '/dog-test/*.tif'


def transfer():
    transform = transforms.Compose([transforms.RandomHorizontalFlip(),
                                    transforms.RandomCrop((224, 224)),
                                    transforms.ToTensor(),
                                    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    train_data = DogCat(train_path, transform=transform)  # 20000
    test_data = DogCat(test_path, transform=transform)  # 4000
Example #5
def train(class_tag,model, train_data_set, save, n_epochs=3,
          batch_size=64, lr=0.001, wd=0.0001, momentum=0.9, seed=None, num=1,
          train_file=None):

    class_tag = "all_dset"
    if seed is not None:
        torch.manual_seed(seed)
    global THREADHOLD
    # # split data
    with open(train_file,"rb") as fp:
        train_list = pickle.load(fp)

    samples_num =len(train_list)
    split_num = int(configs.splite_rate * samples_num)
    data_index = train_list
    np.random.shuffle(data_index)
    train_index = data_index[:split_num]
    eval_index = data_index[split_num:]
    train_samples = sampler.SubsetRandomSampler(train_index)
    eval_samples = sampler.SubsetRandomSampler(eval_index)



    # Data loaders
    train_loader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size,
                                               sampler=train_samples, pin_memory=(torch.cuda.is_available()),
                                               num_workers=5, drop_last=False)
    valid_loader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size,
                                              sampler=eval_samples, pin_memory=(torch.cuda.is_available()),
                                               num_workers=5, drop_last=False)
    # Model on cuda
    if torch.cuda.is_available():
        model = model.cuda()

    # Wrap model for multi-GPUs, if necessary
    model_wrapper = model

    # Optimizer
    optimizer = torch.optim.Adam(model_wrapper.parameters(), lr=0.001)

    # Start log
    with open(os.path.join(save, 'DeepPPI_results.csv'), 'w') as f:
        f.write('epoch,loss,acc,F_value, precision,recall,auc,aupr,mcc,threadhold\n')

        # Train model
        best_F = 0
        threadhold = 0
        count = 0
        for epoch in range(n_epochs):
            _, train_loss = train_epoch(
                model=model_wrapper,
                loader=train_loader,
                optimizer=optimizer,
                epoch=epoch,
                all_epochs=n_epochs,
            )
            _, valid_loss, acc, f_max, p_max, r_max, auc, aupr,t_max,mcc= eval_epoch(
                model=model_wrapper,
                loader=valid_loader,
                is_test=(not valid_loader)
            )
    
            print(
            'epoch:%03d,valid_loss:%0.5f\nacc:%0.6f,F_value:%0.6f, precision:%0.6f,recall:%0.6f,auc:%0.6f,aupr:%0.6f,mcc:%0.6f,threadhold:%0.6f\n' % (
                (epoch + 1), valid_loss, acc, f_max, p_max, r_max,auc, aupr,mcc,t_max))
            if f_max > best_F:
                count = 0
                best_F = f_max
                THREADHOLD = t_max
                print("new best F_value:{0}(threadhold:{1})".format(f_max, THREADHOLD))
                torch.save(model.state_dict(), os.path.join(save, 'DeepPPI_model.dat'))
            else:
                count += 1
                if count>=5:
                    return None
            # Log results
            f.write('%03d,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f\n' % (
                (epoch + 1), valid_loss, acc, f_max, p_max, r_max, auc, aupr,mcc,t_max))
Example #6
def train(clargs):
    metrics = collections.defaultdict(list)
    save_path = pathlib.Path(clargs.save_path)
    save_path.resolve()
    save_path = save_path.absolute()
    save_path.mkdir(parents=True, exist_ok=True)
    model_path = save_path / 'model'
    hyperparameters_path = save_path / 'hyperparameters'
    metrics_path = save_path / 'metrics'

    hyperparameters = dict()
    hyperparameters['clargs'] = vars(clargs)

    if clargs.pretrained:
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224,
                                              0.225])  # From Imagenet.
    else:
        # From iMaterialist.
        normalize = transforms.Normalize(mean=[0.6837, 0.6461, 0.6158],
                                         std=[0.2970, 0.3102, 0.3271])

    image_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomAffine(degrees=15,
                                scale=(1.0, 1.3),
                                resample=Image.BILINEAR,
                                fillcolor=2**24 - 1),
        # transforms.ColorJitter(brightness=0.2, contrast=0.8, saturation=0.8, hue=0.3),
        # transforms.RandomGrayscale(p=0.1),
        transforms.Resize(224),
        transforms.RandomCrop(224),
        transforms.ToTensor(),
        normalize
    ])

    training_data = datasets.ImageFolder('data/training',
                                         transform=image_transform)
    if clargs.training_subset < len(training_data):
        subset_indices = random.sample([i for i in range(len(training_data))],
                                       clargs.training_subset)
        training_data_sampler = sampler.SubsetRandomSampler(subset_indices)
    else:
        training_data_sampler = sampler.RandomSampler(training_data)
    training_data_loader = data.DataLoader(
        training_data,
        batch_size=clargs.training_batch_size,
        num_workers=clargs.num_workers,
        sampler=training_data_sampler)

    validation_data = datasets.ImageFolder('data/validation',
                                           transform=image_transform)
    validation_data_loader = data.DataLoader(
        validation_data,
        batch_size=clargs.validation_batch_size,
        num_workers=clargs.num_workers)

    network: models.VGG = models.vgg19_bn(pretrained=clargs.pretrained)
    network.classifier[6] = nn.Linear(network.classifier[6].in_features, 128)
    network.cuda()

    optimizer = optim.SGD(network.parameters(),
                          lr=clargs.learning_rate,
                          weight_decay=0.0001)
    hyperparameters['optimizer'] = optimizer.state_dict()
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               factor=0.5,
                                               patience=0,
                                               verbose=True,
                                               eps=0)

    training_loss_function = functools.partial(functional.cross_entropy,
                                               size_average=False)
    validation_loss_function = functools.partial(functional.cross_entropy,
                                                 size_average=False)

    torch.save(hyperparameters, str(hyperparameters_path))

    validation_stopwatch = stopwatch.Stopwatch()
    training_stopwatch = stopwatch.Stopwatch()
    saving_stopwatch = stopwatch.Stopwatch()
    best_validation_accuracy = -1.0
    with stopwatch.Stopwatch() as total_time_stopwatch, stopwatch.Stopwatch(
    ) as epoch_stopwatch:
        for i in range(clargs.epoch_limit):
            current_lr = optimizer.param_groups[0]['lr']
            if current_lr < 10**-8:
                break
            metrics['lr'].append(current_lr)

            LOGGER.debug('Training...')

            with training_stopwatch:
                epoch_loss_history, epoch_acccuracy_history = training.train(
                    training_data_loader,
                    network,
                    optimizer,
                    training_loss_function,
                    cuda=True,
                    progress_bar=(clargs.verbose >= 2))
                epoch_loss_history = [
                    batch_loss / clargs.training_batch_size
                    for batch_loss in epoch_loss_history
                ]
                training_stopwatch.lap()

            LOGGER.debug('Validating...')

            with validation_stopwatch:
                validation_loss, validation_accuracy = training.evaluate_loss_and_accuracy(
                    validation_data_loader,
                    network,
                    validation_loss_function,
                    cuda=True,
                    progress_bar=(clargs.verbose >= 2))
                validation_loss /= len(validation_data)
                validation_stopwatch.lap()

            metrics['training_loss'].extend(epoch_loss_history)
            metrics['validation_loss'].append(validation_loss)
            metrics['training_accuracy'].extend(epoch_acccuracy_history)
            metrics['validation_accuracy'].append(validation_accuracy)

            LOGGER.debug('Saving...')

            if validation_accuracy > best_validation_accuracy:
                best_validation_accuracy = validation_accuracy
                torch.save(network, str(model_path))
                saving_stopwatch.lap()

            torch.save(metrics, metrics_path)

            epoch_stopwatch.lap()

            print('epoch {epoch}\n'
                  '- total duration:                  {total_duration}\n'
                  '- validation loss:                 {validation_loss}\n'
                  '- validation accuracy:             {validation_accuracy}\n'
                  '- average training-batch loss:     {avg_training_loss}\n'
                  '- average training-batch accuracy: {avg_training_accuracy}'.
                  format(epoch=i,
                         total_duration=epoch_stopwatch.lap_times()[-1],
                         validation_loss=validation_loss,
                         validation_accuracy=validation_accuracy,
                         avg_training_loss=sum(epoch_loss_history) /
                         len(epoch_loss_history),
                         avg_training_accuracy=sum(epoch_acccuracy_history) /
                         len(epoch_acccuracy_history)))
            print(
                '- training duration:              {training_duration}\n'
                '- validation duration:            {validation_duration}'.
                format(
                    training_duration=training_stopwatch.lap_times()[-1],
                    validation_duration=validation_stopwatch.lap_times()[-1]))

            if not clargs.constant_learning_rate:
                scheduler.step(1.0 - validation_accuracy)

        total_time_stopwatch.lap()

    print('total_time={}'.format(total_time_stopwatch.lap_times()[-1]))
Example #7
        return len(self.annas)


# %% Create 'Dataset's and 'DataLoader's

transform = T.Compose([
    T.Resize((input_size, input_size)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

if __name__ == '__main__':
    dset_train = BoardLocalization(data_dir, anna_name, transform=transform)
    loader_train = DataLoader(dset_train,
                              batch_size=batch_size,
                              sampler=sampler.SubsetRandomSampler(
                                  range(num_train)))

    dset_val = BoardLocalization(data_dir, anna_name, transform=transform)
    loader_val = DataLoader(dset_val,
                            batch_size=batch_size,
                            sampler=sampler.SubsetRandomSampler(
                                range(num_train, num_total)))

# %% Utility functions


def smooth_L1(x, dim=0):
    """
    Inputs:
        - x: Tensor of size 4xN (by default) or size Nx4 (when dim=1)
    Returns:
Example #8
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

brats_data = BraTSDataset(data_dir)
num_examples = len(brats_data)
data_indices = np.arange(num_examples)
deterministic_test = True
# Fix stochasticity in data sampling
if deterministic_test:
    np.random.seed(0)

# TODO: Doesn't really seem to belong here. Make a new
# class for handling this or push it to the dataloader?
np.random.shuffle(data_indices)
split_idx = int(num_examples * train_split)
test_sampler = sampler.SubsetRandomSampler(data_indices[split_idx:])
testloader = DataLoader(brats_data, batch_size=1, sampler=test_sampler)

model = BraTSSegmentation(input_channels=2)

# TODO: continue training from checkpoint
checkpoint = torch.load('checkpoints/test')
model.load_state_dict(checkpoint['model_state_dict'])

model = model.to(device)

model.eval()
sum_test_dice = 0.0
with torch.no_grad():
    for test_ex in tqdm(testloader):
        test_src, test_target = test_ex
Example #9
import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler

from model.DeepFM import DeepFM
from data.dataset import CriteoDataset

# Use the first 9000 items for training and items 9000-9999 for validation
Num_train = 9000

# load data
train_data = CriteoDataset('./data', train=True)
loader_train = DataLoader(train_data,
                          batch_size=100,
                          sampler=sampler.SubsetRandomSampler(
                              range(Num_train)))
val_data = CriteoDataset('./data', train=True)
loader_val = DataLoader(val_data,
                        batch_size=100,
                        sampler=sampler.SubsetRandomSampler(
                            range(Num_train, 10000)))

feature_sizes = np.loadtxt('./data/feature_sizes.txt', delimiter=',')
feature_sizes = [int(x) for x in feature_sizes]
print(feature_sizes)

model = DeepFM(feature_sizes, use_cuda=False)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.0)
model.fit(loader_train, loader_val, optimizer, epochs=5, verbose=True)
Example #10
# Split validation set randomly
def split_indices(n, val_percent):
    # Calculate the size of validation set
    n_val = int(n*val_percent)
    # Create random permutation of 0 to n-1
    indices_random = np.random.permutation(n)
    # return train indices, validation indices
    return indices_random[n_val:], indices_random[:n_val]


train_ds, val_ds = split_indices(len(tensor_dataset), val_percent=0.2)

batch_size = 100
# Train sampler and data loader
train_sampler = sampler.SubsetRandomSampler(train_ds)
train_dl = DataLoader(dataset=tensor_dataset,
                      batch_size=batch_size,
                      sampler=train_sampler)

# Validation sampler and data loader
val_sampler = sampler.SubsetRandomSampler(val_ds)
val_dl = DataLoader(tensor_dataset,
                    batch_size,
                    sampler=val_sampler)

class MNISTModel(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.linear = nn.Linear(input_size, output_size)
Example #11
    def train(self,
              data,
              BATCH=64,
              VAL=0.1,
              LR=1e-4,
              MAX_EPOCH=10,
              PRE=False,
              NAME='checkpoint',
              THRESHOLD=(0.001, 0.001)):

        if PRE:
            train_threshold = self.pretrain_threshold
            valid_threshold = self.prevalid_threshold
            name = NAME + '_pretrain'
        else:
            train_threshold, valid_threshold = THRESHOLD
            name = NAME + '_' + str(LR)

        NUM, LEN, DIM = data.shape
        VAL = int(NUM * VAL)
        loader_train = DataLoader(data,
                                  batch_size=BATCH,
                                  sampler=sampler.SubsetRandomSampler(
                                      range(NUM - VAL)))
        loader_valid = DataLoader(data,
                                  batch_size=BATCH,
                                  sampler=sampler.SubsetRandomSampler(
                                      range(NUM - VAL, NUM)))
        optimizer = torch.optim.Adam([{
            'params': self.encoder.parameters()
        }, {
            'params': self.decoder.parameters()
        }, {
            'params': self.fc.parameters(),
            'lr': 1e-4
        }],
                                     lr=LR,
                                     betas=(0.5, 0.999))

        train_loss_history = []
        valid_loss_history = []

        for e in range(MAX_EPOCH):
            print("train epoch " + str(e) + " starts")
            count = 0
            train_loss = 0
            valid_loss = 0
            for i, batch in enumerate(loader_train):

                self.encoder.train()
                self.decoder.train()
                self.fc.train()

                N = batch.shape[0]
                batch = batch.to(device=self.device, dtype=self.dtype)

                batch_recon = self.forward(batch, N)
                loss = torch.nn.functional.mse_loss(
                    batch_recon,
                    batch.contiguous().view(-1, DIM))
                train_loss += loss.item()
                count += 1
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if i % 100 == 0:
                    print('Iteration %d, train loss = %.4f' % (i, loss.item()))
            train_loss /= count
            count = 0
            for i, batch in enumerate(loader_valid):

                self.encoder.eval()
                self.decoder.eval()
                self.fc.eval()

                with torch.no_grad():
                    N = batch.shape[0]
                    batch = batch.to(device=self.device, dtype=self.dtype)

                    batch_recon = self.forward(batch, N)
                    loss = torch.nn.functional.mse_loss(
                        batch_recon,
                        batch.contiguous().view(-1, DIM))
                    valid_loss += loss.item()
                    count += 1

                if i % 100 == 0:
                    print('Iteration %d, validation loss = %.4f' %
                          (i, loss.item()))

            valid_loss /= count

            train_loss_history.append(train_loss)
            valid_loss_history.append(valid_loss)

            print("train_loss ", train_loss)
            print("valid_loss ", valid_loss)
            if (train_loss < train_threshold and valid_loss < valid_threshold):
                break

        self.save_model(name)
        return train_loss_history, valid_loss_history
Example #12
def get_data(dataset, batch_size, _seed, validate, data_dir, shuffle=False):
    validation_split = 10_000
    kwargs = {'num_workers': 16, 'pin_memory': True}
    if dataset == "MNIST":
        transform = transforms.Compose([transforms.ToTensor()])
        # , transforms.Normalize((0.1307,), (0.3081,))]
        train_set = datasets.MNIST(root=data_dir,
                                   train=True,
                                   download=True,
                                   transform=transform)
        test_set = datasets.MNIST(root=data_dir,
                                  train=False,
                                  download=True,
                                  transform=transform)
        num_train = len(train_set)
        indices = list(range(num_train))
        if not validate:
            train_sampler = sampler.SubsetRandomSampler(indices)
            test_loader = DataLoader(test_set,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     **kwargs)
        else:
            np.random.seed(_seed)
            np.random.shuffle(indices)
            # import pdb; pdb.set_trace()
            train_idx, valid_idx = indices[:-validation_split], indices[-validation_split:]
            train_sampler = sampler.SubsetRandomSampler(train_idx)
            test_sampler = sampler.SubsetRandomSampler(valid_idx)
            test_loader = DataLoader(train_set,
                                     batch_size=batch_size,
                                     sampler=test_sampler,
                                     **kwargs)

    elif dataset in ["CIFAR10", "CIFAR10Augmented"]:
        # TODO: copy data augmentation from Madry's paper
        # transform = transforms.Compose([transforms.Grayscale(num_output_channels=1), transforms.ToTensor()])
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.49137255, 0.48235294, 0.44666667),
                                 (0.24705882, 0.24352941, 0.26156863)),
        ])
        if dataset == "CIFAR10":
            train_set = datasets.CIFAR10(root=data_dir,
                                         train=True,
                                         download=True,
                                         transform=transform)
        elif dataset == "CIFAR10Augmented":
            # augmented_transform = transforms.Compose([transforms.Grayscale(num_output_channels=1),
            #                                           transforms.RandomCrop(32, padding=4),
            #                                           transforms.RandomHorizontalFlip(),
            #                                           transforms.ToTensor(),])
            augmented_transform = transforms.Compose([
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize((0.49137255, 0.48235294, 0.44666667),
                                     (0.24705882, 0.24352941, 0.26156863)),
            ])
            train_set = datasets.CIFAR10(root=data_dir,
                                         train=True,
                                         download=True,
                                         transform=augmented_transform)

        test_set = datasets.CIFAR10(root=data_dir,
                                    train=False,
                                    download=True,
                                    transform=transform)
        num_train = len(train_set)
        indices = list(range(num_train))
        if not validate:
            train_sampler = sampler.SubsetRandomSampler(indices)
            test_loader = DataLoader(test_set,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     **kwargs)
        else:
            np.random.seed(_seed)
            np.random.shuffle(indices)
            train_idx, valid_idx = indices[:-validation_split], indices[-validation_split:]
            train_sampler = sampler.SubsetRandomSampler(train_idx)
            test_sampler = sampler.SubsetRandomSampler(valid_idx)
            test_loader = DataLoader(train_set,
                                     batch_size=batch_size,
                                     sampler=test_sampler,
                                     **kwargs)

    else:
        raise NotImplementedError

    train_loader = DataLoader(train_set,
                              batch_size=batch_size,
                              sampler=train_sampler,
                              **kwargs)
    full_loader = DataLoader(train_set,
                             batch_size=num_train,
                             sampler=train_sampler,
                             **kwargs,
                             shuffle=shuffle)

    return train_loader, test_loader, full_loader
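
The validate branch above carves a validation set out of the training data by shuffling the index list and reserving the last validation_split entries for two SubsetRandomSamplers. A minimal self-contained sketch of that split on a toy dataset (an illustration, not the original code):

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset, sampler

dataset = TensorDataset(torch.randn(1000, 4))
validation_split = 100

indices = list(range(len(dataset)))
np.random.seed(0)
np.random.shuffle(indices)
train_idx, valid_idx = indices[:-validation_split], indices[-validation_split:]

train_loader = DataLoader(dataset, batch_size=32,
                          sampler=sampler.SubsetRandomSampler(train_idx))
valid_loader = DataLoader(dataset, batch_size=32,
                          sampler=sampler.SubsetRandomSampler(valid_idx))

print(len(train_loader.sampler), len(valid_loader.sampler))  # 900 100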
Example #13
        print(self.csv_data.size())
        print(self.label_data.size())

    def __len__(self):
        return len(self.csv_data)

    def __getitem__(self, idx):
        data = (self.csv_data[idx], self.label_data[idx])
        return data


dataset = RideHailingDataset("data.csv", "label.csv")
dataset_size = len(dataset)
print(dataset[0])

indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

train_sampler = sampler.SubsetRandomSampler(train_indices)
valid_sampler = sampler.SubsetRandomSampler(val_indices)

train_loader = DataLoader(dataset,
                          batch_size=batch_size,
                          sampler=train_sampler)
validation_loader = DataLoader(dataset,
                               batch_size=batch_size,
                               sampler=valid_sampler)
Example #14
import numpy as np
import torch.utils.data as data
import torch.utils.data.sampler as sam
import torch
import torch.nn as nn
import warnings
from os.path import dirname, abspath, join

warnings.filterwarnings("ignore")
torch.manual_seed(12345)

LEARNING_RATE = 0.01
BATCH_SIZE = 1
NUM_TRAINING_SAMPLES = 10
NUM_TESTING_SAMPLES = 6
NUM_VAL_SAMPLES = 5

train_sampler = sam.SubsetRandomSampler(
    np.arange(NUM_TRAINING_SAMPLES, dtype=np.int64))
test_sampler = sam.SubsetRandomSampler(
    np.arange(NUM_TESTING_SAMPLES, dtype=np.int64))
val_sampler = sam.SubsetRandomSampler(
    np.arange(NUM_TRAINING_SAMPLES,
              NUM_VAL_SAMPLES + NUM_TRAINING_SAMPLES,
              dtype=np.int64))

current_folder = dirname(abspath(__file__))
model_folder = join(current_folder, "saved_models")


# net with 2 convolutions, and a binary output
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
Example #15
TRAIN_DATA_SIZE = 50000
TRAIN_EPOCH = 30 #10000
BATCH_SIZE = 128

#%%
if True and __name__=='__main__':
    '''
    Load MNIST data
    '''
    mnist_transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
    mnist_train = torchvision.datasets.MNIST(root='./data', train=True, transform=mnist_transforms, download=True)
    mnist_test = torchvision.datasets.MNIST(root='./data', train=False, transform=mnist_transforms, download=True)
    indices = list(range(len(mnist_train)))
    np.random.shuffle(indices)
    train_idx, valid_idx = indices[:TRAIN_DATA_SIZE], indices[TRAIN_DATA_SIZE:]
    train_sampler = sampler.SubsetRandomSampler(train_idx)
    valid_sampler = sampler.SubsetRandomSampler(valid_idx)
    train_data_loader = torch.utils.data.DataLoader(
        mnist_train, batch_size=BATCH_SIZE, sampler=train_sampler, num_workers=10)
    valid_data_loader = torch.utils.data.DataLoader(
        mnist_train, batch_size=BATCH_SIZE,  sampler=valid_sampler, num_workers=10)
    test_data_loader = torch.utils.data.DataLoader(mnist_test, batch_size=BATCH_SIZE, shuffle=True, num_workers=10)
    print('Loaded MNIST data, total',len(mnist_train)+len(mnist_test))

    normlist = []
    for x, _ in train_data_loader:
        x = x.view(len(x), -1)
        normlist.append(torch.mean(torch.norm(x, 2, 1)))
    print('Mean of Mnist norm (C2) =', torch.mean(torch.Tensor(normlist)))

#%%
Example #16
if CUDA:
    dtype = torch.cuda.FloatTensor
    double_type = torch.cuda.LongTensor
else:
    dtype = torch.FloatTensor
    double_type = torch.LongTensor

# Load the data and format them
data = dataset_loader.PeptideSequence(FILENAME)
idx = np.arange(len(data))
np.random.seed(0)
train_data = idx[:int(len(data) * TRAIN_RATIO)]
test_data = idx[int(len(data) * TRAIN_RATIO):]
np.random.shuffle(train_data)
np.random.shuffle(test_data)
train_sampler = sampler.SubsetRandomSampler(train_data)
test_sampler = sampler.SubsetRandomSampler(test_data)
train_loader = dataloader.DataLoader(data, batch_size=args.batch_size, sampler=train_sampler)
test_loader = dataloader.DataLoader(data, batch_size=args.batch_size, sampler=test_sampler)

model = model_cnn_tagger.CNN_tagger(N_INPUT, N_OUTPUT1, args.n_kernel, args.kernel_dim)

weight = torch.cuda.FloatTensor(2)
weight[0] = 0.5
weight[1] = 0.5

loss_function_imbalanced = nn.CrossEntropyLoss(weight)
optimizer = optim.RMSprop(model.parameters(), lr=args.lr)

if CUDA:
    model.cuda()
Example #17
    device = torch.device('cpu')
dtype = torch.float32

transform = T.Compose([
    T.ToTensor(),
    T.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616)),
    Pad(size=(3, 36, 36)),
    RandomCrop(size=(32, 32)),
    RandomFlip(h=True),
])

dataset = cifar10('./cifar-10-batches-py', transform=transform)

loader_train = DataLoader(dataset,
                          batch_size=BATCH_SIZE,
                          sampler=sampler.SubsetRandomSampler(
                              range(NUM_TRAIN)))
loader_train_test = DataLoader(dataset,
                               batch_size=BATCH_SIZE,
                               sampler=sampler.SubsetRandomSampler(
                                   random.sample(range(NUM_TRAIN), 1000)))
loader_val = DataLoader(dataset,
                        batch_size=BATCH_SIZE,
                        sampler=sampler.SubsetRandomSampler(
                            range(NUM_TRAIN, 50000)))
loader_test = DataLoader(dataset,
                         batch_size=BATCH_SIZE,
                         sampler=sampler.SubsetRandomSampler(
                             range(50000, 60000)))

dropout = 0
hidden1 = 128
Example #18
    def initialize(self, opts, use_cuda, rounds):
        print('=> loading TinyImageNet data...')
        normalize = transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276])    #####
        kwargs = {'num_workers': 2, 'pin_memory': True} if use_cuda else {}
        train_dataset = datasets.ImageFolder(self.root_path_train,
            transform=transforms.Compose([
                transforms.Resize(size=(32,32),interpolation=2),
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))

        # change labels
        train_labels = [train_dataset.imgs[i][1] for i in range(train_dataset.__len__())]
        train_indices = getSubset4(train_labels, 4*rounds, 4*rounds+1, 4*rounds+2, 4*rounds+3)

        # print('train_indices:',len(train_indices))
        for ind in train_indices:
            if train_labels[ind]==4*rounds:     
                train_dataset.imgs[ind]=tuple_modified(train_dataset.imgs[ind],0)
            elif train_labels[ind]==4*rounds+1: 
                train_dataset.imgs[ind]=tuple_modified(train_dataset.imgs[ind],1)
            elif train_labels[ind]==4*rounds+2: 
                train_dataset.imgs[ind]=tuple_modified(train_dataset.imgs[ind],2)
            else:
                train_dataset.imgs[ind]=tuple_modified(train_dataset.imgs[ind],3)

        trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=opts.batch_size, sampler =S.SubsetRandomSampler(train_indices), **kwargs)    

        test_dataset = datasets.ImageFolder(self.root_path_test,
            transform=transforms.Compose([
                transforms.Resize(size=(32,32),interpolation=2),
                transforms.ToTensor(),
                normalize,
            ]))
        test_labels = [test_dataset.imgs[i][1] for i in range(test_dataset.__len__())]
        test_indices = getSubset4(test_labels, 4*rounds, 4*rounds+1, 4*rounds+2, 4*rounds+3)
        
        for ind in test_indices:
            if test_labels[ind]==4*rounds:            ###
                test_dataset.imgs[ind]=tuple_modified(test_dataset.imgs[ind],0)
            elif test_labels[ind]==4*rounds+1: 
                test_dataset.imgs[ind]=tuple_modified(test_dataset.imgs[ind],1)
            elif test_labels[ind]==4*rounds+2: 
                test_dataset.imgs[ind]=tuple_modified(test_dataset.imgs[ind],2)
            else:
                test_dataset.imgs[ind]=tuple_modified(test_dataset.imgs[ind],3)

        testloader = torch.utils.data.DataLoader(test_dataset, batch_size=100, sampler =S.SubsetRandomSampler(test_indices), **kwargs)        
        return trainloader, testloader
Example #19
def policy_training(device='cuda'):
    noiseset = [35, 45, 55]
    # set random seed for all gpu related things
    seed_torch(seed=args.seed)

    # load msdnet and the data
    model = DnCNN_DS(channels=1, num_of_layers=args.num_of_layers)

    model = torch.nn.DataParallel(model).cuda()

    if os.path.exists(os.path.join(args.outf, 'net.pth')):
        print('Loading denoise model...')
        model.load_state_dict(torch.load(os.path.join(args.outf, 'net.pth')))
    else:
        print('Need the classification model!')
        return

    # need to augment the validation set to generate training set for PolicyNet
    print('Loading dataset ...\n')

    # load the original image instead
    # dataset_train = Dataset(train=True, data_folder=args.data_folder)
    # total_train = len(dataset_train)
    # val_size = int(total_train*0.2)
    # print("Training data for policynet: ", val_size)
    # # load indices file
    # indices = np.load(os.path.join(args.outf, 'indices.npy'))
    # val_idx = indices[:val_size]
    # train_idx = indices[val_size:]
    # train_loader = DataLoader(dataset=dataset_train, num_workers=args.num_workers,
    #     sampler=sampler.SubsetRandomSampler(train_idx),
    #     batch_size=args.batch_size, shuffle=False)
    # val_loader = DataLoader(dataset=dataset_train, num_workers=args.num_workers,
    #     sampler=sampler.SubsetRandomSampler(val_idx),
    #     batch_size=args.batch_size, shuffle=False)

    # load the original test data
    dataset_train = load_imgs('train')
    total_train = len(dataset_train)
    val_size = int(total_train * args.val_ratio)
    indices = list(range(total_train))
    random.Random(0).shuffle(indices)
    np.save(os.path.join(args.outf, 'policy_train_indices.npy'),
            np.array(indices))
    val_idx = indices[:val_size]
    train_idx = indices[val_size:]
    train_loader = DataLoader(dataset=dataset_train,
                              num_workers=args.num_workers,
                              sampler=sampler.SubsetRandomSampler(train_idx),
                              batch_size=args.batch_size,
                              shuffle=False)
    val_loader = DataLoader(dataset=dataset_train,
                            num_workers=args.num_workers,
                            sampler=sampler.SubsetRandomSampler(val_idx),
                            batch_size=1,
                            shuffle=False)
    print('Training data size: ', len(train_loader.dataset))
    # print('Validation data size: ', len(val_loader.dataset))

    dataset_val = Dataset(train=False)
    test_loader_12 = DataLoader(dataset=dataset_val,
                                num_workers=4,
                                batch_size=1,
                                shuffle=False)
    # use Set68 as testdataset
    dataset_test = load_imgs('Set68')
    test_loader = DataLoader(dataset=dataset_test,
                             num_workers=4,
                             batch_size=1,
                             shuffle=False)

    # need to construct the policy network and train the policy net.
    # the architecture of the policy network need to be designed.

    ######################################
    # need to think about the model of policynet
    ######################################
    model.eval()
    p_true_all = list()
    psnr_all = list()
    np.random.seed(seed=args.seed)
    test_noiseL = np.random.choice(noiseset, size=len(val_loader.dataset))
    # print(test_noiseL)
    print('Average noise level: ', np.average(test_noiseL))
    for i, batch in enumerate(val_loader):
        # for i in range(1):
        #     batch = next(iter(train_loader))
        data = batch
        data = data.cuda()
        noise = torch.zeros(data.size())
        noise = torch.FloatTensor(data.size()).normal_(
            mean=0,
            std=test_noiseL[i] / 255.,
            generator=torch.manual_seed(args.seed))
        noise = noise.cuda()

        with torch.no_grad():
            outputs = model(data + noise)
            p_true, mse_all = PolicyKL.true_posterior(args, outputs, noise)
        p_true_all.append(p_true)

    #     psnrs = list()
    #     for pred in outputs:
    #         psnr = batch_PSNR(torch.clamp(data+noise-pred, 0., 1.),
    #             data, 1.)
    #         psnrs.append(psnr)
    #     psnr_all.append(np.array(psnrs))
    # psnr_all = np.stack(psnr_all)

    p_true = torch.cat(p_true_all, dim=0)
    p_det = max_onehot(p_true, dim=-1, device=device)
    p_true = torch.mean(p_true, dim=0)
    # find positions with nonzero posterior
    p_det_index = torch.argmax(p_det, dim=1)
    print(Counter(list(p_det_index.cpu().numpy())))
    p_det = torch.mean(p_det, dim=0)
    train_post = {}
    nz_post = {}
    i = 0
    for t in range(len(outputs)):
        if p_det[t] > 0.001:
            # if p_det[t] > -1:
            train_post[i] = t
            nz_post[i] = t
            i += 1
    del train_post[i - 1]

    p_str = 'val p true:['
    p_str += ','.join(['%0.3f' % p_true[t] for t in nz_post.values()])
    print(p_str + ']')

    p_str = 'val p true det:['
    p_str += ','.join(['%0.3f' % p_det[t] for t in nz_post.values()])
    print(p_str + ']')

    print(nz_post)
    ######################################

    # initialize nets with nonzero posterior
    if args.policy_type == 'multiclass':
        score_net = MulticlassNet(args, nz_post, 1)
    elif args.policy_type == 'sequential':
        score_net = MulticlassNet(args, train_post, 1)
    else:
        print('Model not implemented!!')
        return
    score_net = torch.nn.DataParallel(score_net)
    score_net = score_net.cuda()
    # pdb.set_trace()

    if args.restart and os.path.exists(
            os.path.join(args.outf, '{}_policy_net.dump'.format(
                args.policy_type))):
        print('Loading previous policynet model...')
        dump = os.path.join(args.outf,
                            '{}_policy_net.dump'.format(args.policy_type))
        score_net.load_state_dict(torch.load(dump))

    # train
    if args.phase == 'train':

        # start training
        optimizer = optim.Adam(list(score_net.parameters()),
                               lr=1e-3,
                               weight_decay=args.weight_decay)
        milestones = [10, 20, 40, 60, 80]
        # gammas = [0.4, 0.2, 0.2, 0.2, 0.2]
        gammas = [1, 1, 1, 1, 1]
        scheduler = MultiStepMultiLR(optimizer,
                                     milestones=milestones,
                                     gammas=gammas)
        trainer = PolicyKL(args=args,
                           model=model,
                           score_net=score_net,
                           train_post=train_post,
                           nz_post=nz_post,
                           optimizer=optimizer,
                           train_loader=train_loader,
                           val_loader=val_loader,
                           test_loader=test_loader,
                           device=device,
                           scheduler=scheduler)
        trainer.train()
    # test
    dump = os.path.join(args.outf,
                        '{}_policy_net.dump'.format(args.policy_type))
    score_net.load_state_dict(torch.load(dump))

    PolicyKL.test(args=args,
                  score_net=score_net,
                  model=model,
                  data_loader=test_loader,
                  nz_post=nz_post,
                  device=device,
                  noiseset=[75])
    print(args.outf)
Example #20
def get_data_loader(
    transform: tv.transforms,
    caption_file: str,
    image_id_file: str,
    image_folder: str,
    config: Config,
    vocab_file: str,
    mode: str = "train",
    batch_size: int = 1,
    vocab_threshold=None,
    start_word: str = "<start>",
    end_word: str = "<end>",
    unk_word: str = "<unk>",
    vocab_from_file: bool = True,
    num_workers: int = 0,
):
    """Returns the data loader

    :param transform: [description]
    :type transform: tv.transforms
    :param mode: [description], defaults to "train"
    :type mode: str, optional
    :param batch_size: [description], defaults to 1
    :type batch_size: int, optional
    :param vocab_threshold: [description], defaults to None
    :type vocab_threshold: [type], optional
    :param vocab_file: [description], defaults to "output/vocab.pkl"
    :type vocab_file: str, optional
    :param start_word: [description], defaults to "<start>"
    :type start_word: str, optional
    :param end_word: [description], defaults to "<end>"
    :type end_word: str, optional
    :param unk_word: [description], defaults to "<unk>"
    :type unk_word: str, optional
    :param vocab_from_file: [description], defaults to True
    :type vocab_from_file: bool, optional
    :param num_workers: [description], defaults to 0
    :type num_workers: int, optional
    
    """

    assert mode in [
        "train",
        "validation",
        "test",
    ], f"mode: '{mode}' must be one of ['train','validation','test']"
    if vocab_from_file == False:
        assert (
            mode == "train"
        ), f"mode: '{mode}', but to generate vocab from caption file, mode must be 'train' "

    if mode == "train":
        if vocab_from_file == True:
            assert os.path.exists(
                vocab_file
            ), "vocab_file does not exist.  Change vocab_from_file to False to create vocab_file."
        assert image_id_file.find(
            "train"
        ), f"double check image_id_file: {image_id_file}. File name should have the substring 'train'"
        assert os.path.exists(
            image_id_file
        ), f"image id file: {image_id_file} doesn't not exist."
        assert os.path.exists(
            caption_file
        ), f"caption file: {caption_file} doesn't not exist."
        assert os.path.isdir(
            config.IMAGE_DATA_DIR
        ), f"{config.IMAGE_DATA_DIR} not a directory"
        assert (
            len(os.listdir(config.IMAGE_DATA_DIR)) != 0
        ), f"{config.IMAGE_DATA_DIR} is empty."

    if mode == "validation":
        assert image_id_file.find(
            "dev"
        ), f"double check image_id_file: {image_id_file}. File name should have the substring 'dev' "
        assert os.path.exists(
            image_id_file
        ), f"image id file: {image_id_file} doesn't not exist."
        assert os.path.exists(
            caption_file
        ), f"caption file: {caption_file} doesn't not exist."
        assert os.path.isdir(
            config.IMAGE_DATA_DIR
        ), f"{config.IMAGE_DATA_DIR} not a directory"
        assert (
            len(os.listdir(config.IMAGE_DATA_DIR)) != 0
        ), f"{config.IMAGE_DATA_DIR} is empty."
        assert os.path.exists(
            vocab_file
        ), f"Must first generate {vocab_file} from training data."
        assert vocab_from_file == True, "Change vocab_from_file to True."

    if mode == "test":
        assert (
            batch_size == 1
        ), "Please change batch_size to 1 if testing your model."
        assert image_id_file.find(
            "test"
        ), f"double check image_id_file: {image_id_file}. File name should have the substring 'test'"
        assert os.path.exists(
            vocab_file
        ), f"Must first generate {vocab_file} from training data."
        assert vocab_from_file == True, "Change vocab_from_file to True."

    img_folder = config.IMAGE_DATA_DIR
    annotations_file = caption_file

    # image caption dataset
    dataset = FlickrDataset(
        transform,
        mode,
        batch_size,
        vocab_threshold,
        vocab_file,
        start_word,
        end_word,
        unk_word,
        caption_file,
        image_id_file,
        vocab_from_file,
        image_folder,
    )

    if mode in ["train", "validation"]:
        # Randomly sample a caption length, and sample indices with that length.
        indices = dataset.get_train_indices()
        # Create and assign a batch sampler to retrieve a batch with the sampled indices.
        initial_sampler = sampler.SubsetRandomSampler(indices=indices)
        # data loader for COCO dataset.
        data_loader = DataLoader(
            dataset=dataset,
            num_workers=num_workers,
            batch_sampler=sampler.BatchSampler(
                sampler=initial_sampler,
                batch_size=dataset.batch_size,
                drop_last=False,
            ),
        )
    else:
        data_loader = DataLoader(
            dataset=dataset,
            batch_size=dataset.batch_size,
            shuffle=False,
            num_workers=num_workers,
        )

    return data_loader
Example #21
    dataset = utils.NumpyDataset(data_shifted, labels, transform=utils.NumpyToTensor())

    # Run the experiments
    for seed in seeds:
        # Data loaders
        logger.info('Split data with seed {}'.format(seed))
        torch.manual_seed(seed)
        np.random.seed(seed)
        train_indices = []
        val_indices = []
        for cls in np.unique(labels):
            indices = np.where(labels == cls)
            indices = np.random.permutation(indices[0])
            train_indices.append(indices[:int(len(indices) * (1 - validating_ratio))])
            val_indices.append(indices[int(len(indices) * (1 - validating_ratio)):])
        train_set_sampler = sampler.SubsetRandomSampler(np.concatenate(train_indices))
        validating_set_sampler = sampler.SubsetRandomSampler(np.concatenate(val_indices))
        train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_set_sampler, num_workers=8)
        val_loader = DataLoader(dataset, batch_size=batch_size, sampler=validating_set_sampler, num_workers=8)

        # TensorboardX writer
        writer = SummaryWriter(main_directory + '/runs/' + experiment_name + '_' + str(seed))

        # The model
        torch.manual_seed(0)
        model = WideNetMasked(len(class_names)).to(device)
        logger.info('Net parameters number : {}'.format(utils.compute_total_parameter_number(model)))

        optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0.9, weight_decay=0.001)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 100, 150], gamma=0.1)
Example #22
    T.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_data = MammogramDataset("Mini_DDSM_Upload", "train", transform=transform)
test_data = MammogramDataset("Mini_DDSM_Upload", "test")

VAL_RATIO = 0.2
NUM_VAL = int(len(train_data) * VAL_RATIO)
NUM_TRAIN = len(train_data) - NUM_VAL
NUM_TEST = len(test_data)
BATCH_SIZE = batch_size

loader_train = DataLoader(train_data,
                          batch_size=BATCH_SIZE,
                          sampler=sampler.SubsetRandomSampler(
                              range(NUM_TRAIN)),
                          drop_last=True)
loader_val = DataLoader(train_data,
                        batch_size=BATCH_SIZE,
                        sampler=sampler.SubsetRandomSampler(
                            range(NUM_TRAIN, NUM_TRAIN + NUM_VAL)))
loader_test = DataLoader(test_data, batch_size=BATCH_SIZE)

dtype = torch.float32
device = torch.device('cpu')

epoch = 0
loss_list = []
val_acc_list = []
best_accuracy = 0.0
Example #23
    def weighted_loss_train(self,
                            optimizer,
                            BATCH=64,
                            MAX_EPOCH=30,
                            USE_WEIGHT=True):
        VAL = self.val_data.shape[0]
        self.model.train()

        train_loss_history = []
        val_loss_history = []
        train_recall_history = []
        val_recall_history = []

        for e in range(MAX_EPOCH):
            epoch_train_loss = 0
            epoch_train_recall = 0

            print("training epoch " + str(e) + " starts")

            if USE_WEIGHT:
                self.eval(BATCH)
                weighted_index = self.weighted_sampler.sample_train_data(size)

            print(data.shape)

            NUM = data.shape[0]

            loader_train = DataLoader(data,
                                      batch_size=BATCH,
                                      sampler=sampler.SubsetRandomSampler(
                                          range(NUM)))
            loader_valid = DataLoader(self.val_data,
                                      batch_size=BATCH,
                                      sampler=sampler.SubsetRandomSampler(
                                          range(VAL)))

            train_iter = 0
            for i, batch in enumerate(loader_train):
                X = (batch[:, :-2]).to(device=self.device,
                                       dtype=self.data_dtype)
                y = (batch[:, -2]).to(device=self.device,
                                      dtype=self.label_dtype)
                scores = self.model(X)
                accuracy = np.argmax(scores.cpu().detach().numpy(),
                                     axis=1) - y.cpu().detach().numpy()
                mask = np.where(accuracy == 0)
                accuracy = len(mask[0]) / X.shape[0]
                epoch_train_recall += accuracy
                loss = torch.nn.functional.cross_entropy(scores, y)

                epoch_train_loss += loss.item()
                if i % 100 == 0:
                    print("epoch is ", e, ", training loss is ", loss.item())
                    print("epoch is ", e, ", training accuracy is ", accuracy)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                train_iter += 1

            train_loss_history.append(epoch_train_loss / train_iter)
            train_recall_history.append(epoch_train_recall / train_iter)

            epoch_val_loss = 0
            epoch_val_recall = 0

            val_iter = 0
            for i, batch in enumerate(loader_valid):
                self.model.eval()  # put model to evaluation mode
                X = (batch[:, :-1]).to(device=self.device,
                                       dtype=self.data_dtype)
                y = (batch[:, -1]).to(device=self.device,
                                      dtype=self.label_dtype)
                with torch.no_grad():
                    scores = self.model(X)
                    accuracy = np.argmax(scores.cpu().detach().numpy(),
                                         axis=1) - y.cpu().detach().numpy()
                    mask = np.where(accuracy == 0)
                    accuracy = len(mask[0]) / X.shape[0]
                    epoch_val_recall += accuracy
                    loss = torch.nn.functional.cross_entropy(scores, y)
                    epoch_val_loss += loss.item()
                    if i % 100 == 0:
                        print("epoch is ", e, ", validation loss is ",
                              loss.item())
                        print("epoch is ", e, ", validation accuracy is ",
                              accuracy)
                    val_iter += 1

            val_loss_history.append(epoch_val_loss / val_iter)
            val_recall_history.append(epoch_val_recall / val_iter)

        return train_loss_history, train_recall_history, val_loss_history, val_recall_history
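
# A hedged usage sketch (not part of the original class): `trainer` stands for
# an instance of this class with `model`, `train_data`, `val_data`, `device`,
# and the dtype attributes already set.
#
#   optimizer = torch.optim.Adam(trainer.model.parameters(), lr=1e-3)
#   histories = trainer.weighted_loss_train(optimizer,
#                                           BATCH=64,
#                                           MAX_EPOCH=30,
#                                           USE_WEIGHT=True)
#   train_loss, train_recall, val_loss, val_recall = histories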
Beispiel #24
0
                T.RandomHorizontalFlip(),
                T.RandomVerticalFlip(),
                T.ToTensor()
            ])

if augment:
    train_data = MammogramDataset("data", "train", transform=transform)
else:
    train_data = MammogramDataset("data", "train")
test_data = MammogramDataset("data", "test")

NUM_VAL = int(len(train_data)*VAL_RATIO)
NUM_TRAIN = len(train_data) - NUM_VAL
NUM_TEST = len(test_data)

loader_train = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)))
loader_val = DataLoader(train_data, batch_size=BATCH_SIZE,
                        sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN, NUM_TRAIN + NUM_VAL)))
loader_test = DataLoader(test_data, batch_size=BATCH_SIZE)
loader_tiny_train = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=sampler.SubsetRandomSampler(range(101)))
loader_tiny_val = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=sampler.SubsetRandomSampler(range(100, 200)))

dtype = torch.float32
if USE_GPU and torch.cuda.is_available(): #Determine whether or not to use GPU
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('using device:', device)
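
# A minimal sanity check (not part of the original snippet): the tiny loaders
# above are handy for a quick overfitting test. Batches are assumed to be
# (image, label) pairs.
images, labels = next(iter(loader_tiny_train))
images = images.to(device=device, dtype=dtype)
labels = labels.to(device=device, dtype=torch.long)
print('tiny-train batch:', images.shape, labels.shape)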


Beispiel #25
0
import torchvision.transforms as T

import numpy as np

# Preprocessing the data:
# subtracting the per-channel RGB mean and dividing by the per-channel RGB standard deviation

NUM_TRAIN = 49000

transform = T.Compose([
    T.ToTensor(),
    T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

cifar10_train = dset.CIFAR10('./datasets', train=True, download=True,transform=transform)
loader_train = DataLoader(cifar10_train, batch_size=64, sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)))
cifar10_val = dset.CIFAR10('./datasets', train=True, download=True,transform=transform)
loader_val = DataLoader(cifar10_val, batch_size=64, sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN, 50000)))
cifar10_test = dset.CIFAR10('./datasets', train=False, download=True, transform=transform)
loader_test = DataLoader(cifar10_test, batch_size=64)
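
# A hedged sketch (not part of the original snippet) of how per-channel
# mean/std statistics like the hardcoded ones above could be recomputed from
# the raw training images (a plain ToTensor transform is used so the pixels are
# not already normalized). Note that the widely used constants above may have
# been aggregated slightly differently, so the exact numbers can differ.
raw_train = dset.CIFAR10('./datasets', train=True, download=True, transform=T.ToTensor())
raw_loader = DataLoader(raw_train, batch_size=1000)
channel_sum = torch.zeros(3)
channel_sq_sum = torch.zeros(3)
num_pixels = 0
for images, _ in raw_loader:
    channel_sum += images.sum(dim=(0, 2, 3))
    channel_sq_sum += (images ** 2).sum(dim=(0, 2, 3))
    num_pixels += images.shape[0] * images.shape[2] * images.shape[3]
mean = channel_sum / num_pixels
std = (channel_sq_sum / num_pixels - mean ** 2).sqrt()
print('per-channel mean:', mean, 'per-channel std:', std)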

#Use GPU device 

USE_GPU = True

dtype = torch.float32 

if USE_GPU and torch.cuda.is_available():
	device = torch.device('cuda')
else:
	device = torch.device('cpu')
Beispiel #26
0
# The torchvision.transforms package provides tools for preprocessing data
# and for performing data augmentation; here the transform only converts
# samples to tensors (no mean/std normalization is applied, and the transform
# is not passed to the dataset below).
transform = T.Compose([T.ToTensor()])

# We set up a single CBOW dataset over the text8 corpus and wrap it in
# DataLoaders, which iterate through the Dataset and form minibatches. The
# dataset is split into train / val / test (80% / 10% / 10%) by passing
# SubsetRandomSamplers over disjoint index ranges to the DataLoaders.
cbow_train = CBOWDataset('text8.txt', 4)
loader_train = DataLoader(cbow_train,
                          batch_size=64,
                          sampler=sampler.SubsetRandomSampler(
                              range(int(len(cbow_train) * 0.8))))
loader_val = DataLoader(cbow_train,
                        batch_size=64,
                        sampler=sampler.SubsetRandomSampler(
                            range(int(len(cbow_train) * 0.8),
                                  int(len(cbow_train) * 0.9))))
loader_test = DataLoader(cbow_train,
                         batch_size=64,
                         sampler=sampler.SubsetRandomSampler(
                             range(int(len(cbow_train) * 0.9),
                                   len(cbow_train))))
USE_GPU = True

dtype = torch.float32  # we will be using float throughout this tutorial

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
Beispiel #27
0
CF_start_time = time.time()
model = CF_knearest(dataset)
print(model.predict_score(dataset, 1, 3671))
CF_end_time = time.time()
print("the end of training CF, and the time consumption: %d" %
      (CF_end_time - CF_start_time))

# val_data = PreProcessData("./data/ratings_small.csv", train=True)
# loader_val = DataLoader(train_data, batch_size=50, sampler=sampler.SubsetRandomSampler(range(TRAIN_ROUND, 100000)))

feature_size = FM_train_data.feature_size
print("feature_size is " + str(feature_size))
loader_train = DataLoader(FM_train_data,
                          batch_size=50,
                          sampler=sampler.SubsetRandomSampler(
                              range(FM_TRAIN_DATA_NUMBER)))
"""
FM
"""

FM_start_time = time.time()
FM_model = FM(feature_sizes=feature_size)
print("Now, lets train the model")
FM_model.fit(loader_train, epochs=50)
FM_end_time = time.time()
print("the end of training FM, time consume: %d" %
      (FM_end_time - FM_start_time))
"""
DeepFM
"""
deepFM_start_time = time.time()
Beispiel #28
0
def create_dataloader(loader_type, args_data, do_use_gpu):
    """
    Create a data loader according to the parameters
    """

    kwargs = {
        'num_workers': args_data.num_loader_workers,
        'pin_memory': True
    } if do_use_gpu else {}

    if args_data.dataset_type == DatasetType.NYU:
        if loader_type == LoaderMode.TRAIN:
            # Set up sample IDs to sample from
            ids_train = np.arange(args_data.id_start_train,
                                  args_data.id_end_train + 1)
            ids_train_permuted = args_data.rng.permutation(ids_train)
            ids_train_labeled = ids_train_permuted[:args_data.num_labeled_samples]
            ids_train_unlabeled = ids_train_permuted[args_data.num_labeled_samples:]
            # Ensure a minimum sampling probability for labeled samples
            ratio_labeled = len(ids_train_labeled) / float(len(ids_train))
            prob_labeled = max(args_data.min_sampling_prob_labeled,
                               ratio_labeled)
            prob_unlabeled = 1.0 - prob_labeled
            # Set up distribution/weights to sample from (considering un-/labeled samples)
            scale_weights = float(
                len(ids_train))  # value to which weights will sum up
            sample_weight_labeled = prob_labeled * scale_weights / float(
                len(ids_train_labeled))
            sample_weight_unlabeled = prob_unlabeled * scale_weights \
                                        / float(len(ids_train_unlabeled)) \
                                        if len(ids_train_unlabeled) > 0 else 0.0
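            # Added note: with these weights each labeled sample is drawn with
            # probability sample_weight_labeled / scale_weights, so labeled and
            # unlabeled samples are drawn with total probability prob_labeled
            # and prob_unlabeled, respectively.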
            sampling_weights = np.zeros((args_data.num_all_samples_train))
            sampling_weights[ids_train_labeled] = sample_weight_labeled
            sampling_weights[ids_train_unlabeled] = sample_weight_unlabeled
            num_samples_used_for_train = np.count_nonzero(sampling_weights)

            loader = torch.utils.data.DataLoader(
                NyuHandPoseMultiViewDataset(
                    args_data.nyu_data_basepath,
                    train=True,
                    cropSize=args_data.in_crop_size,
                    doJitterCom=args_data.do_jitter_com,
                    sigmaCom=args_data.sigma_com,
                    doAddWhiteNoise=args_data.do_add_white_noise,
                    sigmaNoise=args_data.sigma_noise,
                    doLoadRealSamples=args_data.do_use_real_samples,
                    unlabeledSampleIds=ids_train_unlabeled,
                    transform=transforms.ToTensor(),
                    useCache=args_data.use_pickled_cache,
                    cacheDir=args_data.nyu_data_basepath_pickled,
                    annoType=args_data.anno_type,
                    neededCamIds=args_data.needed_cam_ids_train,
                    randomSeed=args_data.seed,
                    cropSize3D=args_data.crop_size_3d_tuple,
                    args_data=args_data),
                batch_size=args_data.batch_size,
                sampler=smpl.WeightedRandomSampler(
                    sampling_weights,
                    num_samples=num_samples_used_for_train,
                    replacement=True),
                **kwargs)

            print("Using {} samples for training".format(
                num_samples_used_for_train))
            if sample_weight_labeled > 0.:
                print("  {} labeled".format(len(ids_train_labeled)))
            if sample_weight_unlabeled > 0.:
                print("  {} unlabeled".format(len(ids_train_unlabeled)))

        elif loader_type == LoaderMode.VAL:
            num_samples_val = min(
                int(
                    round(args_data.max_val_train_ratio *
                          args_data.num_labeled_samples)),
                args_data.max_num_samples_val)
            ids_val = np.arange(args_data.id_start_val,
                                args_data.id_end_val + 1)
            ids_val = args_data.rng.permutation(ids_val)
            ids_val = ids_val[:num_samples_val]
            loader = torch.utils.data.DataLoader(
                NyuHandPoseMultiViewDataset(
                    args_data.nyu_data_basepath,
                    train=False,
                    cropSize=args_data.in_crop_size,
                    doJitterCom=args_data.do_jitter_com_test,
                    sigmaCom=args_data.sigma_com,
                    doAddWhiteNoise=args_data.do_add_white_noise_test,
                    sigmaNoise=args_data.sigma_noise,
                    doLoadRealSamples=args_data.do_use_real_samples,
                    transform=transforms.ToTensor(),
                    useCache=args_data.use_pickled_cache,
                    cacheDir=args_data.nyu_data_basepath_pickled,
                    annoType=args_data.anno_type,
                    neededCamIds=args_data.needed_cam_ids_test,
                    randomSeed=args_data.seed,
                    cropSize3D=args_data.crop_size_3d_tuple,
                    args_data=args_data),
                batch_size=args_data.batch_size,
                sampler=smpl.SubsetRandomSampler(ids_val),
                **kwargs)

            print("Using {} samples for validation".format(len(ids_val)))

        elif loader_type == LoaderMode.TEST:
            ids_test = np.arange(args_data.id_start_test,
                                 args_data.id_end_test + 1)
            loader = torch.utils.data.DataLoader(
                NyuHandPoseMultiViewDataset(
                    args_data.nyu_data_basepath,
                    train=False,
                    cropSize=args_data.in_crop_size,
                    doJitterCom=args_data.do_jitter_com_test,
                    sigmaCom=args_data.sigma_com,
                    doAddWhiteNoise=args_data.do_add_white_noise_test,
                    sigmaNoise=args_data.sigma_noise,
                    doLoadRealSamples=args_data.do_use_real_samples,
                    transform=transforms.ToTensor(),
                    useCache=args_data.use_pickled_cache,
                    cacheDir=args_data.nyu_data_basepath_pickled,
                    annoType=args_data.anno_type,
                    neededCamIds=args_data.needed_cam_ids_test,
                    randomSeed=args_data.seed,
                    cropSize3D=args_data.crop_size_3d_tuple,
                    args_data=args_data),
                batch_size=args_data.batch_size,
                sampler=smpl.SubsetRandomSampler(ids_test),
                **kwargs)

            print("Using {} samples for test".format(len(ids_test)))

        else:
            raise UserWarning("LoaderMode unknown.")

    elif args_data.dataset_type == DatasetType.ICG:
        args_data = set_loader_type_specific_settings_icg(
            args_data, loader_type)
        dataset = IcgHandPoseMultiViewDataset(args_data)
        if loader_type == LoaderMode.TRAIN:
            num_samples = len(dataset)
            num_samples_labeled_all = dataset.get_num_samples_labeled()
            num_samples_unlabeled = dataset.get_num_samples_unlabeled()
            num_samples_labeled_used = min(num_samples_labeled_all,
                                           args_data.num_labeled_samples)
            num_samples_used = num_samples_labeled_used + num_samples_unlabeled
            # Set up sample IDs to sample from
            ids_train = np.arange(num_samples)
            ids_train_labeled_all = ids_train[:num_samples_labeled_all]
            ids_train_labeled_perm = args_data.rng.permutation(
                ids_train_labeled_all)
            ids_train_labeled = ids_train_labeled_perm[:num_samples_labeled_used]
            ids_train_unlabeled = ids_train[num_samples_labeled_all:]
            # Ensure a minimum sampling probability for labeled samples
            ratio_labeled = len(ids_train_labeled) / float(num_samples_used)
            prob_labeled = max(args_data.min_sampling_prob_labeled,
                               ratio_labeled)
            prob_unlabeled = 1.0 - prob_labeled
            # Set up distribution/weights to sample from (considering un-/labeled samples)
            scale_weights = float(
                num_samples_used)  # value to which weights will sum up
            sample_weight_labeled = prob_labeled * scale_weights / float(
                len(ids_train_labeled))
            sample_weight_unlabeled = prob_unlabeled * scale_weights \
                                         / float(len(ids_train_unlabeled)) \
                                         if len(ids_train_unlabeled) > 0 else 0.0
            sampling_weights = np.zeros(len(ids_train))
            sampling_weights[ids_train_labeled] = sample_weight_labeled
            sampling_weights[ids_train_unlabeled] = sample_weight_unlabeled
            num_samples_used_for_train = np.count_nonzero(sampling_weights)

            loader = torch.utils.data.DataLoader(
                dataset,
                batch_size=args_data.batch_size,
                sampler=smpl.WeightedRandomSampler(
                    sampling_weights,
                    num_samples=num_samples_used_for_train,
                    replacement=True),
                **kwargs)

            print("Using {} samples for training".format(
                num_samples_used_for_train))
            if sample_weight_labeled > 0.:
                print("  {} labeled".format(len(ids_train_labeled)))
            if sample_weight_unlabeled > 0.:
                print("  {} unlabeled".format(len(ids_train_unlabeled)))

        elif loader_type == LoaderMode.VAL:
            # Prepare val. sample IDs
            ids_val = np.arange(len(dataset))
            if args_data.do_val_only_with_labeled:
                num_samples_labeled_all = dataset.get_num_samples_labeled()
                ids_val = np.arange(num_samples_labeled_all)
            # Use subset?
            max_num_samples_val = int(
                round(args_data.max_val_train_ratio *
                      args_data.num_labeled_samples))
            num_samples_val = min(max_num_samples_val, len(ids_val))
            ids_val = args_data.rng.permutation(ids_val)
            ids_val = ids_val[:num_samples_val]

            loader = torch.utils.data.DataLoader(
                dataset,
                batch_size=args_data.batch_size,
                sampler=smpl.SubsetRandomSampler(ids_val),
                **kwargs)

            print("Using {} samples for validation".format(len(ids_val)))
            print("  {} labeled (count may be inaccurate when only a subset is "
                  "used, e.g., one limited w.r.t. the train-set size)".format(
                      dataset.get_num_samples_labeled()))
            if not args_data.do_val_only_with_labeled:
                print("  {} unlabeled (count may be inaccurate when only a subset "
                      "is used, e.g., one limited w.r.t. the train-set size)".format(
                          dataset.get_num_samples_unlabeled()))

        elif loader_type == LoaderMode.TEST:
            # Prepare test sample IDs
            ids_test = np.arange(len(dataset))
            if args_data.do_test_only_with_labeled:
                num_samples_labeled_all = dataset.get_num_samples_labeled()
                ids_test = np.arange(num_samples_labeled_all)

            loader = torch.utils.data.DataLoader(
                dataset,
                batch_size=args_data.batch_size,
                sampler=smpl.SubsetRandomSampler(ids_test),
                **kwargs)

            print("Using {} samples for test".format(len(ids_test)))
            print("  {} labeled".format(dataset.get_num_samples_labeled()))
            if not args_data.do_test_only_with_labeled:
                print("  {} unlabeled".format(
                    dataset.get_num_samples_unlabeled()))

    else:
        raise UserWarning("DatasetType unknown.")

    return loader
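
# A hedged usage sketch (not part of the original source); `args_data` is
# assumed to be the fully populated configuration object used above.
#
#   use_gpu = torch.cuda.is_available()
#   train_loader = create_dataloader(LoaderMode.TRAIN, args_data, use_gpu)
#   val_loader = create_dataloader(LoaderMode.VAL, args_data, use_gpu)
#   test_loader = create_dataloader(LoaderMode.TEST, args_data, use_gpu)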
Beispiel #29
0
    T.ToTensor(),
    T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

# We set up a Dataset object for each split (train / val / test); Datasets load
# training examples one at a time, so we wrap each Dataset in a DataLoader which
# iterates through the Dataset and forms mini batches. We divide the CIFAR-10
# training set into train and val sets by passing a Sampler object to the
# DataLoader telling how it should sample from the underlying Dataset.
cifar10_train = dset.CIFAR10(cifar10_dir,
                             train=True,
                             download=True,
                             transform=transform)
loader_train = DataLoader(cifar10_train,
                          batch_size=64,
                          sampler=sampler.SubsetRandomSampler(
                              range(NUM_TRAIN)))

cifar10_val = dset.CIFAR10(cifar10_dir,
                           train=True,
                           download=True,
                           transform=transform)
loader_val = DataLoader(cifar10_val,
                        batch_size=64,
                        sampler=sampler.SubsetRandomSampler(
                            range(NUM_TRAIN, 50000)))

cifar10_test = dset.CIFAR10(cifar10_dir,
                            train=False,
                            download=True,
                            transform=transform)
loader_test = DataLoader(cifar10_test, batch_size=64)
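
# A small sanity check (not part of the original snippet): with ToTensor() the
# loaders yield float image batches of shape (64, 3, 32, 32) and integer label
# batches of shape (64,).
images, labels = next(iter(loader_train))
print(images.shape, images.dtype)   # torch.Size([64, 3, 32, 32]) torch.float32
print(labels.shape)                 # torch.Size([64])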
Beispiel #30
0
def reset_dataloader(data_loader):
    """Reset sampler for dataloader."""
    indices = data_loader.dataset.get_indices()
    new_sampler = sampler.SubsetRandomSampler(indices=indices)
    data_loader.batch_sampler.sampler = new_sampler
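
# A hedged usage sketch (not part of the original source): with a
# BatchSampler-based loader, the index set can be re-drawn from the dataset at
# the start of every epoch before iterating.
#
#   for epoch in range(num_epochs):
#       reset_dataloader(data_loader)
#       for batch in data_loader:
#           ...  # training step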