Esempio n. 1
0
 def setup_data(self):
     """Build the training and (optionally) validation data loaders.

     Reads ``self.opts`` (``train_size``, ``batch_size``, ``do_validation``,
     ``test_batch_size``) and populates ``self.trainset``,
     ``self.trainloader``, ``self.testset`` and ``self.testloaders``.
     """
     transform = unrel.TRANSFORM
     # Initialize trainset
     self.trainset = data.Dataset(split='train',
                                  pairs='annotated',
                                  transform=transform)
     if self.opts.train_size:
         # Restrict training to the first `train_size` examples, in order.
         print('Using subset of %d from train_set' % self.opts.train_size)
         batch_sampler = sampler.SequentialSampler(
             range(self.opts.train_size))
     else:
         batch_sampler = None
     # NOTE(review): despite its name, `batch_sampler` here is a plain
     # element sampler (SequentialSampler), not a BatchSampler as in the
     # test branch below — confirm FauxDataLoader's `sampler` argument
     # accepts both.
     self.trainloader = data.FauxDataLoader(self.trainset,
                                            sampler=batch_sampler,
                                            batch_size=self.opts.batch_size)
     # Initialize testset
     if self.opts.do_validation:
         self.testset = data.Dataset(split='test',
                                     pairs='annotated',
                                     transform=transform)
         batch_sampler = sampler.BatchSampler(
             sampler.SequentialSampler(self.testset),
             self.opts.test_batch_size, False
         )  # make test set load without shuffling so that we can use Tyler's RecallEvaluator
         self.testloaders = [
             data.FauxDataLoader(self.testset, sampler=batch_sampler)
         ]
     else:
         print('No testset')
         self.testloaders = []
Esempio n. 2
0
def get_dataloaders(train_batchsize, val_batchsize):
  """Build train / train-for-validation / validation dataloaders.

  Returns a 6-tuple: (train_loader, train4val_loader, val_loader,
  num_of_images_by_class, mapping, imgs), where `mapping` translates
  train class indices to val class indices (unmatched classes and the
  sentinel -1 map to 'UNKNOWN') and `imgs` is the val (path, label) array.
  Relies on module-level `INFO`, `T` (torchvision transforms) and `dset`
  (torchvision datasets).
  """
  kwargs = {
    'num_workers': 20,
    'pin_memory': True
  }
  input_size = INFO['model-info']['input-size']
  base = '{}/{}'.format(os.environ['datadir-base'], INFO['dataset'])
  normalize = T.Normalize(mean=INFO['dataset-info']['normalization']['mean'], std=INFO['dataset-info']['normalization']['std'])
  transform = {
    'train': T.Compose([
      T.Resize(tuple(int(x * (4 / 3)) for x in input_size)),  # upscale by 4/3
      T.RandomResizedCrop(input_size),  # random crop, then resize back
      T.RandomHorizontalFlip(0.5),  # random horizontal flip
      T.RandomVerticalFlip(0.5),  # random vertical flip
      T.RandomApply([T.RandomRotation(90)], 0.5),  # 90/270-degree rotation, p=0.5
      T.RandomApply([T.RandomRotation(180)], 0.25),  # 180-degree rotation, p=0.25
      # NOTE(review): np.random.random() runs ONCE, when the pipeline is
      # built — the jitter *range* is fixed for the whole run, though
      # ColorJitter still samples within that range per image. Confirm
      # this is intended.
      T.RandomApply([T.ColorJitter(brightness=np.random.random() / 5 + 0.9)], 0.5),  # random brightness
      T.RandomApply([T.ColorJitter(contrast=np.random.random() / 5 + 0.9)], 0.5),  # random contrast
      T.RandomApply([T.ColorJitter(saturation=np.random.random() / 5 + 0.9)], 0.5),  # random saturation
      T.ToTensor(),
      normalize
    ]),
    'val': T.Compose([
      T.Resize(input_size),
      T.ToTensor(),
      normalize
    ])
  }
  train_dset = dset.ImageFolder('{}/{}'.format(base, 'Train'), transform=transform['train'])
  train4val_dset = dset.ImageFolder('{}/{}'.format(base, 'Train'), transform=transform['val'])
  val_dset = dset.ImageFolder('{}/{}'.format(base, 'Val'), transform=transform['val'])

  # Per-class image counts; (labels == i).sum() replaces the original
  # torch.where(..., ones, zeros).sum() construction with the same result.
  labels = torch.from_numpy(np.array(train_dset.imgs)[:, 1].astype(int))
  num_of_images_by_class = torch.zeros(len(train_dset.classes))
  for i in range(len(train_dset.classes)):
    num_of_images_by_class[i] = (labels == i).sum().item()

  # Map train class indices to val class indices; classes absent from the
  # validation set (and the sentinel -1) map to 'UNKNOWN'.
  mapping = {}
  for c in train_dset.classes:
    if c in val_dset.classes:
      mapping[train_dset.class_to_idx[c]] = val_dset.class_to_idx[c]
    else:
      mapping[train_dset.class_to_idx[c]] = val_dset.class_to_idx['UNKNOWN']
  mapping[-1] = val_dset.class_to_idx['UNKNOWN']

  train_len = len(train_dset)
  val_len = len(val_dset)

  train_loader = DataLoader(train_dset, batch_size=train_batchsize, sampler=sampler.RandomSampler(range(train_len)), **kwargs)
  train4val_loader = DataLoader(train4val_dset, batch_size=val_batchsize, sampler=sampler.SequentialSampler(range(train_len)), **kwargs)
  val_loader = DataLoader(val_dset, batch_size=val_batchsize, sampler=sampler.SequentialSampler(range(val_len)), **kwargs)

  imgs = np.array(val_dset.imgs)

  return train_loader, train4val_loader, val_loader, num_of_images_by_class, mapping, imgs
Esempio n. 3
0
def get_dataloaders(train_batchsize, val_batchsize):
    """Return ``(None, None, val_loader, None, None, None)``.

    Only the validation loader is built; the other slots preserve the
    6-tuple interface of the full ``get_dataloaders`` variants.
    """
    loader_kwargs = {'num_workers': 20, 'pin_memory': True}
    # Retained from the full variant; not referenced below.
    input_size = INFO['model-info']['input-size']
    base = '{}/{}'.format(os.environ['datadir-base'], INFO['dataset'])
    normalize = T.Normalize(mean=INFO['dataset-info']['normalization']['mean'],
                            std=INFO['dataset-info']['normalization']['std'])
    val_transform = T.Compose([
        T.Resize(608),
        T.RandomResizedCrop(456),
        # T.RandomCrop(456),
        T.ToTensor(),
        normalize
    ])
    transform = {'val': val_transform}

    val_dset = dset.ImageFolder('{}/{}'.format(base, 'Val'),
                                transform=transform['val'])
    val_len = len(val_dset)

    val_loader = DataLoader(val_dset,
                            batch_size=val_batchsize,
                            sampler=sampler.SequentialSampler(range(val_len)),
                            **loader_kwargs)

    return None, None, val_loader, None, None, None
 def __init__(self, opt):
     """Daemon loader over one or more PyTorch datasets.

     Instantiates the dataset class(es) returned by
     ``get_dataset_classes(opt)`` — concatenated via ParlAIConcatDataset
     when there is more than one — and wraps them in a sequential,
     non-shuffled DataLoader.
     """
     super().__init__(daemon=True)
     dataset_classes = get_dataset_classes(opt)
     if len(dataset_classes) > 1:
         datasets = []
         for class_name, collate_fn, task_name in dataset_classes:
             opt['pytorch_teacher_task'] = task_name
             opt['task'] = task_name
             datasets.append(class_name(opt))
             # NOTE(review): overwritten each iteration — only the LAST
             # dataset's collate_fn is kept; confirm this is intended.
             self.collate = collate_fn
         self.dataset = ParlAIConcatDataset(datasets)
     else:
         class_name, self.collate, task_name = dataset_classes[0]
         self.dataset = class_name(opt)
     self.bsz = opt.get('batchsize', 1)
     self.num_workers = opt.get('num_workers', 4)
     self.dataloader = DataLoader(
         self.dataset,
         batch_size=self.bsz,
         shuffle=False,
         sampler=sampler.SequentialSampler(self.dataset),
         num_workers=self.num_workers,
         collate_fn=self.collate,
         pin_memory=False,
         drop_last=False,
     )
     self.datatype = opt.get('datatype')
     # Lazy (batch_index, batch) iterator over the dataloader.
     self.data = enumerate(self.dataloader)
     self.batch_sort = opt.get('pytorch_teacher_batch_sort')
     self.batch_cache_type = opt.get('batch_sort_cache_type')
     self.batch_length_range = opt.get('batch_length_range')
     self.batch_sort_field = opt.get('batch_sort_field')
Esempio n. 5
0
def evaluate_model(model,
                   dataset,
                   classes,
                   examples=None,
                   batch_size=16,
                   dtype=torch.float32,
                   device=DEFAULT_DEVICE):
    """Run *model* over (a prefix of) *dataset* and return evaluation stats.

    Parameters:
        model: callable module producing per-class scores; put in eval mode.
        dataset: indexable dataset yielding (x, y) pairs.
        classes: sequence of class labels; its length sizes the statistics.
        examples: optional cap on examples evaluated; None (or 0) means the
            whole dataset.
        batch_size: DataLoader batch size.
        dtype: dtype the inputs are cast to.
        device: device the inputs/targets are moved to.

    Returns:
        The result of ``evaluate`` applied to the accumulated statistics.
    """
    # BUG FIX: the original `min(examples, len(dataset)) or len(dataset)`
    # raised TypeError for the default examples=None (None vs int in min()).
    # Falsy values (None, 0) now fall back to the full dataset length.
    examples = min(examples, len(dataset)) if examples else len(dataset)
    model.eval()
    loader = DataLoader(dataset,
                        batch_size=batch_size,
                        sampler=sampler.SequentialSampler(range(examples)))

    stats = zero_statistics(len(classes))

    for x, y in loader:
        x = x.to(device=device, dtype=dtype)
        y = y.to(device=device, dtype=torch.long)

        # argmax over the class axis gives the predicted label per example.
        scores = model(x).cpu().numpy()
        predictions = scores.argmax(axis=1)
        stats = combine(
            stats,
            compute_prediction_statistics(predictions,
                                          y.cpu().numpy(), classes))

    return evaluate(stats)
def get_subm_link(criterion):
    """Predict on the test set, write a submission CSV, return a FileLink.

    Relies on module-level globals: ``data_folder``, ``tfms``,
    ``dropout_model``, ``transformed_dataset``, ``predict`` and
    ``get_dt_szs``.  ``criterion`` is accepted for interface compatibility
    but is not used (the original body only contained the no-op
    ``criterion = criterion``).
    """
    test_dataset = XRayDataset(f'{data_folder}test.csv',
                               f'{data_folder}',
                               transform=tfms,
                               is_train=False)

    # Sequential sampling keeps predictions aligned with the CSV row order.
    test_samplers = {'test': sampler.SequentialSampler(test_dataset)}
    test_dataloaders = {
        'test':
        DataLoader(test_dataset,
                   batch_size=32,
                   sampler=test_samplers['test'],
                   num_workers=8,
                   pin_memory=True)
    }
    test_dt_szs = get_dt_szs(test_samplers)

    t_pdted, t_lbs = predict(dropout_model, 'test', test_dataloaders,
                             test_dt_szs)

    # Distribution of predicted class indices (quick sanity check).
    print(np.bincount(t_pdted))

    test_df = pd.read_csv(f'{data_folder}test.csv')

    # Map predicted indices back to class names for the submission column.
    test_df['detected'] = pd.Series([
        transformed_dataset.idx_to_classes[i] for i in t_pdted
    ]).astype('category')

    test_df.drop(['age', 'gender', 'view_position', 'image_name'],
                 axis=1).to_csv('sdir/fst.csv', index=False)

    return FileLink('./sdir/fst.csv')
Esempio n. 7
0
    def __init__(self, opt, shared=None):
        """Set up a teacher backed by a sequential torch DataLoader.

        When ``shared`` is None, builds a StreamDataset and a non-shuffled
        DataLoader over it; otherwise reuses the dataset, dataloader and
        lastYs already built by the shared instance.
        """
        opt['batch_sort'] = False
        super().__init__(opt, shared)
        self.use_batch_act = self.bsz > 1
        self.num_workers = opt['numworkers']
        # One can specify a collate function to use for preparing a batch
        collate_fn = opt.get('collate_fn', default_collate)
        if not shared:
            self.dataset = StreamDataset(opt)
            # Sequential sampler: examples are served in dataset order.
            self.pytorch_dataloader = DataLoader(
                self.dataset,
                batch_size=self.bsz,
                shuffle=False,
                sampler=sampler.SequentialSampler(self.dataset),
                num_workers=self.num_workers,
                collate_fn=collate_fn,
                pin_memory=False,
                drop_last=False,
            )
            self.lastYs = [None] * self.bsz
        else:
            self.dataset = shared['dataset']
            self.pytorch_dataloader = shared['pytorch_dataloader']
            self.lastYs = shared['lastYs']

        # ceil: a final partial batch still counts as one batch.
        self.num_batches = math.ceil(self.dataset.num_examples() / self.bsz)
        self.reset()
Esempio n. 8
0
def get_dataloaders(train_batchsize, val_batchsize):
    """Return ``(None, None, val_loader, None, mapping, imgs)``.

    Builds only the validation loader, the train->val class-index mapping
    and the validation image list; the remaining slots keep the 6-tuple
    interface of the full ``get_dataloaders`` variants.
    """
    kwargs = {'num_workers': 20, 'pin_memory': True}
    # Retained from the full variant; not referenced below.
    input_size = INFO['model-info']['input-size']
    base = '{}/{}'.format(os.environ['datadir-base'], INFO['dataset'])

    train_dset = dset.ImageFolder('{}/{}'.format(base, 'Train'))
    val_dset = dset.ImageFolder('{}/{}'.format(base, 'Val'))

    # (Removed the original unused `labels` tensor and `train_len` locals —
    # neither was referenced after being computed.)

    # Map train class indices to val class indices; train classes missing
    # from the validation set (and the sentinel -1) map to 'UNKNOWN'.
    mapping = {}
    for c in train_dset.classes:
        if c in val_dset.classes:
            mapping[train_dset.class_to_idx[c]] = val_dset.class_to_idx[c]
        else:
            mapping[
                train_dset.class_to_idx[c]] = val_dset.class_to_idx['UNKNOWN']
    mapping[-1] = val_dset.class_to_idx['UNKNOWN']

    val_len = len(val_dset)

    val_loader = DataLoader(val_dset,
                            batch_size=val_batchsize,
                            sampler=sampler.SequentialSampler(range(val_len)),
                            **kwargs)

    imgs = np.array(val_dset.imgs)

    return None, None, val_loader, None, mapping, imgs
Esempio n. 9
0
    def make_loader(self,
                    batch_size=16,
                    num_workers=0,
                    shuffle=False,
                    pin_memory=False,
                    resize_rate=10,
                    drop_last=False):
        """
        CommandLine:
            python ~/code/netharn/examples/yolo_voc.py YoloVOCDataset.make_loader

        Example:
            >>> # DISABLE_DOCTSET
            >>> torch.random.manual_seed(0)
            >>> self = YoloVOCDataset(split='train')
            >>> self.augmenter = None
            >>> loader = self.make_loader(batch_size=1, shuffle=True)
            >>> # training batches should have multiple shapes
            >>> shapes = set()
            >>> for batch in ub.ProgIter(iter(loader), total=len(loader)):
            >>>     inputs, labels = batch
            >>>     # test to see multiscale works
            >>>     shapes.add(inputs.shape[-1])
            >>>     if len(shapes) > 1:
            >>>         break
            >>> assert len(shapes) > 1
        """
        import torch.utils.data.sampler as torch_sampler
        assert len(self) > 0, 'must have some data'
        if shuffle:
            # Shuffled training: re-pick the input scale every
            # `resize_rate` batches (multiscale training).
            sampler = torch_sampler.RandomSampler(self)
            resample_freq = resize_rate
        else:
            # Deterministic order; fixed scale (no resampling).
            sampler = torch_sampler.SequentialSampler(self)
            resample_freq = None

        # use custom sampler that does multiscale training
        batch_sampler = multiscale_batch_sampler.MultiScaleBatchSampler(
            sampler,
            batch_size=batch_size,
            resample_freq=resample_freq,
            drop_last=drop_last,
        )
        # torch.utils.data.sampler.WeightedRandomSampler
        loader = torch_data.DataLoader(
            self,
            batch_sampler=batch_sampler,
            collate_fn=nh.data.collate.padded_collate,
            num_workers=num_workers,
            pin_memory=pin_memory)
        if loader.batch_size != batch_size:
            try:
                # Hack: ensure dataloader has batch size attr
                # DataLoader forbids setting batch_size when a batch_sampler
                # is supplied, so temporarily clear its "initialized" flag;
                # best-effort — failures are deliberately ignored.
                loader._DataLoader__initialized = False
                loader.batch_size = batch_size
                loader._DataLoader__initialized = True
            except Exception:
                pass
        return loader
Esempio n. 10
0
def run_oof_binary(args,
                   session_backup,
                   read_img,
                   read_mask,
                   img_group_id_colname=None):
    """Out-of-fold binary segmentation inference.

    For each CV fold recorded in ``session_backup['cv_split']``, loads the
    fold's network, runs it over the fold's validation split, and writes the
    sigmoid outputs (scaled to 0-255 uint8 masks) as images under
    ``<snapshots_root>/<snapshot>/oof_inference[/<group>]``.
    """
    metadata = session_backup[f'metadata'][0]
    # Pre-create one output directory per image group (or a single one).
    if img_group_id_colname is not None:
        for group_name, _ in metadata.groupby(by=img_group_id_colname):
            os.makedirs(os.path.join(args.snapshots_root, args.snapshot,
                                     'oof_inference', group_name),
                        exist_ok=True)
    else:
        os.makedirs(os.path.join(args.snapshots_root, args.snapshot,
                                 'oof_inference'),
                    exist_ok=True)

    for fold_id, _, val_set in session_backup['cv_split'][0]:
        print(colored('====> ', 'green') + f'Loading fold [{fold_id}]')
        net = load_fold(args, fold_id)

        if args.tta:
            raise NotImplementedError('TTA is not yet supported')

        val_dataset = SegmentationDataset(
            split=val_set,
            trf=session_backup['val_trf'][0],
            read_img=read_img,
            read_mask=read_mask,
            img_group_id_colname=img_group_id_colname)

        # Sequential sampling keeps predictions aligned with the split order.
        val_loader = DataLoader(val_dataset,
                                batch_size=args.bs,
                                num_workers=args.n_threads,
                                sampler=sampler.SequentialSampler(val_dataset))

        with torch.no_grad():
            for batch in tqdm(val_loader,
                              total=len(val_loader),
                              desc=f'Predicting fold {fold_id}:'):
                img = batch['img']
                if img_group_id_colname is not None:
                    group_ids = batch['group_id']
                else:
                    group_ids = None
                fnames = batch['fname']
                # Sigmoid probabilities -> 0-255 uint8 masks.
                predicts = torch.sigmoid(
                    net(img)).mul(255).to('cpu').numpy().astype(np.uint8)

                for idx, fname in enumerate(fnames):
                    pred_mask = predicts[idx].squeeze()
                    if img_group_id_colname is not None:
                        cv2.imwrite(
                            os.path.join(args.snapshots_root, args.snapshot,
                                         'oof_inference', group_ids[idx],
                                         fname), pred_mask)
                    else:
                        cv2.imwrite(
                            os.path.join(args.snapshots_root, args.snapshot,
                                         'oof_inference', fname), pred_mask)
Esempio n. 11
0
def get_simple_loader(dataset, batch_size=1):
    """Sequential, non-shuffled DataLoader over *dataset* using collate_MIL.

    Uses 4 workers when the module-level ``device`` is CUDA, else defaults.
    """
    extra = {'num_workers': 4} if device.type == "cuda" else {}
    return DataLoader(dataset,
                      batch_size=batch_size,
                      sampler=sampler.SequentialSampler(dataset),
                      collate_fn=collate_MIL,
                      **extra)
Esempio n. 12
0
    def make_loader(self, batch_size=16, num_workers=0, shuffle=False,
                    pin_memory=False):
        """
        Example:
            >>> torch.random.manual_seed(0)
            >>> dset = coco_api.CocoDataset(coco_api.demo_coco_data())
            >>> self = YoloCocoDataset(dset, train=1)
            >>> loader = self.make_loader(batch_size=1)
            >>> train_iter = iter(loader)
            >>> # training batches should have multiple shapes
            >>> shapes = set()
            >>> for batch in train_iter:
            >>>     shapes.add(batch[0].shape[-1])
            >>>     if len(shapes) > 1:
            >>>         break
            >>> #assert len(shapes) > 1

            >>> vali_loader = iter(loaders['vali'])
            >>> vali_iter = iter(loaders['vali'])
            >>> # vali batches should have one shape
            >>> shapes = set()
            >>> for batch, _ in zip(vali_iter, [1, 2, 3, 4]):
            >>>     shapes.add(batch[0].shape[-1])
            >>> assert len(shapes) == 1
        """
        assert len(self) > 0, 'must have some data'
        if shuffle:
            if True:
                # If the data is not balanced we need to balance it
                index_to_weight = self._training_sample_weights()
                num_samples = len(self)
                index_to_weight = index_to_weight[:num_samples]
                # Sampling WITH replacement according to the per-index
                # weights returned by _training_sample_weights().
                sampler = torch_sampler.WeightedRandomSampler(index_to_weight,
                                                              num_samples,
                                                              replacement=True)
                sampler.data_source = self  # hack for use with multiscale
            else:
                sampler = torch_sampler.RandomSampler(self)
            # Re-pick the multiscale input size every 10 batches.
            resample_freq = 10
        else:
            # Deterministic order; fixed scale (no resampling).
            sampler = torch_sampler.SequentialSampler(self)
            resample_freq = None

        # use custom sampler that does multiscale training
        batch_sampler = multiscale_batch_sampler.MultiScaleBatchSampler(
            sampler, batch_size=batch_size, resample_freq=resample_freq,
        )
        # torch.utils.data.sampler.WeightedRandomSampler
        loader = torch_data.DataLoader(self, batch_sampler=batch_sampler,
                                       collate_fn=nh.data.collate.padded_collate,
                                       num_workers=num_workers,
                                       pin_memory=pin_memory)
        if loader.batch_size != batch_size:
            try:
                # Best-effort: expose batch_size even though a batch_sampler
                # was used; DataLoader may refuse the assignment.
                loader.batch_size = batch_size
            except Exception:
                pass
        return loader
Esempio n. 13
0
def data_sampler(dataset, shuffle, distributed):
    """Choose the sampler for *dataset*.

    DistributedSampler when *distributed*; otherwise RandomSampler when
    *shuffle* is true, else SequentialSampler.
    """
    if distributed:
        return DistributedSampler(dataset, shuffle=shuffle)
    return (sampler.RandomSampler(dataset)
            if shuffle else sampler.SequentialSampler(dataset))
Esempio n. 14
0
def main():
    """Train a hero-embedding model (CBOH) from CSV match data.

    Usage: ``python <script> <data_csv> <hero2ix_csv>``.  Trains on the
    first 90% of rows, reports test accuracy on the rest, then saves the
    embeddings, pickled model and loss plot under ``./output/hero/``.
    """

    data_dir = sys.argv[1]
    hero2ix_dir = sys.argv[2]

    # import DataFrame and hero2ix dictionary
    heroes_df = pd.read_csv(data_dir, index_col=0)
    hero2ix_df = pd.read_csv(hero2ix_dir, index_col=0)
    heroes_df = heroes_df.dropna().reset_index(drop=True)
    hero2ix = dict(zip(hero2ix_df.hero, hero2ix_df.ID))
    # heroes = hero2ix_df['hero'].values

    # train test split (chronological 90/10, no shuffling)
    split = int(len(heroes_df)*0.9)
    heroes_train = heroes_df.iloc[:split]
    heroes_test = heroes_df.iloc[split:]

    # build dataset generator
    train_gen = DataFrameIterator(heroes_train, hero2ix)
    test_gen = DataFrameIterator(heroes_test, hero2ix)

    # Use Dataloader class in pytorch to generate batched data
    # (train shuffled via RandomSampler, test kept in order).
    batch_size = 16
    loader_train = DataLoader(train_gen, batch_size=batch_size,
                              sampler=sampler.RandomSampler(train_gen),
                              num_workers=4)
    loader_test = DataLoader(test_gen, batch_size=batch_size,
                              sampler=sampler.SequentialSampler(test_gen),
                              num_workers=4)

    # define model, totally three models in hetor2vec.py
    model = CBOH(embedding_dim=10, heropool_size=len(hero2ix))

    # define loss function
    loss_function = nn.CrossEntropyLoss()

    # run train
    losses = train(model=model, dataloader=loader_train, loss_function=loss_function,
                   init_lr=0.1, epochs=20, lr_decay_epoch=8, print_epoch=2, gpu=False)

    # check test accuracy
    print('accuracy: ', accuracy(model, dataloader=loader_test,
                                 batch_size=batch_size, gpu=False))

    # save embeddings as numpy arrays
    output_dir = './output/hero/hero_embeddings.npy'
    save_embeddings(model, filename=output_dir)

    # pickle model
    pickle_dir = './output/hero/model.p'
    pickle.dump(obj=model, file=open(pickle_dir, 'wb'))

    # plot loss vs epoch
    plot_loss(losses, './output/hero/loss_hitory.png')

    # project embeddings to 2d plane
    plot_embeddings(model, hero2ix)
Esempio n. 15
0
    def __init__(self, opt, shared=None):
        """Teacher initialisation: dataset(s), DataLoader, optional sorter.

        When ``shared`` is None, builds everything from scratch (including
        a background LoaderProcess when batch sorting is enabled);
        otherwise reuses dataset/dataloader/lastYs/data from ``shared``.
        """
        opt['batch_sort'] = False
        super().__init__(opt, shared)
        self.use_batch_act = self.bsz > 1
        self.num_workers = opt['numworkers']
        self.batch_sort = opt.get('pytorch_teacher_batch_sort')
        self.batch_cache_type = opt.get('batch_sort_cache')
        self.batch_sort_field = opt.get('batch_sort_field')
        # One can specify a collate function to use for preparing a batch
        self.opt = opt.copy()
        self.is_shared = shared is not None
        dataset_classes = self.get_dataset_class(opt)
        # "Ordered" data must be served sequentially; a non-shuffled stream
        # counts as ordered too.
        self.ordered = ('ordered' in self.datatype or
                        ('stream' in self.datatype and not opt.get('shuffle')))

        if not shared:
            if len(dataset_classes) > 1:
                datasets = []
                for class_name, collate_fn, task_name in dataset_classes:
                    opt['pytorch_teacher_task'] = task_name
                    opt['task'] = task_name
                    datasets.append(class_name(opt))
                    # NOTE(review): overwritten each iteration — only the
                    # LAST task's collate_fn is kept; confirm intended.
                    self.collate_fn = collate_fn
                self.dataset = ParlAIConcatDataset(datasets)
            else:
                class_name, self.collate_fn, task_name = dataset_classes[0]
                self.dataset = class_name(opt)
            if self.ordered or not self.training:
                data_sampler = sampler.SequentialSampler(self.dataset)
                pin_memory = False
            else:
                data_sampler = sampler.RandomSampler(self.dataset)
                pin_memory = True

            self.pytorch_dataloader = DataLoader(
                self.dataset,
                batch_size=self.bsz,
                sampler=data_sampler,
                num_workers=self.num_workers,
                collate_fn=self.collate_fn,
                pin_memory=pin_memory,
                drop_last=False,
            )

            self.lastYs = [None] * self.bsz
            if self.batch_sort:
                # Background process that pre-sorts/caches batches.
                self.loader_process = LoaderProcess(opt)
                self.loader_process.start()
            self.data = enumerate(self.pytorch_dataloader)
        else:
            self.dataset = shared['dataset']
            self.pytorch_dataloader = shared['pytorch_dataloader']
            self.lastYs = shared['lastYs']
            self.data = shared['data']

        # ceil: a final partial batch still counts as one batch.
        self.num_batches = math.ceil(self.dataset.num_episodes() / self.bsz)
        self.reset()
Esempio n. 16
0
 def __init__(self, dataset, batch_size, shuffle = True, drop_last = False):
     """Bucketed batch sampler.

     Groups *dataset* into buckets via ``bucket`` (by context length) and
     builds one BatchSampler per bucket; with ``shuffle`` the bucket order
     is randomised and elements are drawn randomly within each bucket.
     """
     # Group the dataset into buckets by context length.
     self.buckets = bucket(dataset)
     # Shuffle the bucket order, then sample randomly within each bucket;
     # otherwise keep everything sequential.
     if shuffle:
         np.random.shuffle(self.buckets)
         random_samplers = [sampler.RandomSampler(bucket) for bucket in self.buckets]
     else:
         random_samplers = [sampler.SequentialSampler(bucket) for bucket in self.buckets]
     self.sampler = [sampler.BatchSampler(s, batch_size, drop_last) for s in random_samplers]
def create_training_batch(train_data, batch_size):
    '''Split the student ids in *train_data* into sequential batches.

    Parameters
    ----------
    train_data : dict
        Mapping keyed by student id; only the keys are used.
    batch_size : int
        Number of ids per batch; the last batch may be smaller
        (drop_last=False).

    Returns
    -------
    list[list]
        Batches of student ids, in dictionary (insertion) order.
    '''
    # list(dict) yields the keys in insertion order — replaces the original
    # keys() + comprehension round-trip.
    stud_ids = list(train_data)
    batches = sampler.BatchSampler(sampler.SequentialSampler(stud_ids),
                                   batch_size=batch_size,
                                   drop_last=False)
    # Each batch is a list of positional indices into stud_ids.
    return [[stud_ids[i] for i in batch] for batch in batches]
Esempio n. 18
0
 def __init__(self,
              data_source,
              shuffle=False,
              batch_size=16,
              drop_last=False,
              resample_frequency=10):
     """Multiscale batch-sampler state.

     Picks a random or sequential element sampler over *data_source* and
     records the batching/resampling parameters; the number of scales is
     taken from ``data_source.multi_scale_inp_size``.
     """
     base_sampler = (torch_sampler.RandomSampler(data_source)
                     if shuffle
                     else torch_sampler.SequentialSampler(data_source))
     self.sampler = base_sampler
     self.shuffle = shuffle
     self.batch_size = batch_size
     self.drop_last = drop_last
     self.num_scales = len(data_source.multi_scale_inp_size)
     self.resample_frequency = resample_frequency
Esempio n. 19
0
def create_dataloader(config, data, mode):
    """Create a DataLoader for 'train', 'val', or any other (test) *mode*.

    Train and val use BatchSamplers — distributed ones when
    torch.distributed is available and initialised; any other mode builds
    a plain test loader.
    """
    dataset = create_dataset(config, data, mode)
    if mode == 'train':
        # create Sampler
        if dist.is_available() and dist.is_initialized():
            train_RandomSampler = distributed.DistributedSampler(dataset)
        else:
            train_RandomSampler = sampler.RandomSampler(dataset, replacement=False)

        train_BatchSampler = sampler.BatchSampler(train_RandomSampler,
                                              batch_size=config.train.batch_size,
                                              drop_last=config.train.dataloader.drop_last)

        # Augment
        collator = get_collate_fn(config)

        # DataLoader
        data_loader = DataLoader(dataset=dataset,
                                batch_sampler=train_BatchSampler,
                                collate_fn=collator,
                                pin_memory=config.train.dataloader.pin_memory,
                                num_workers=config.train.dataloader.work_nums)

    elif mode == 'val':
        if dist.is_available() and dist.is_initialized():
            val_SequentialSampler = distributed.DistributedSampler(dataset)
        else:
            val_SequentialSampler = sampler.SequentialSampler(dataset)

        val_BatchSampler = sampler.BatchSampler(val_SequentialSampler,
                                                batch_size=config.val.batch_size,
                                                drop_last=config.val.dataloader.drop_last)
        data_loader = DataLoader(dataset,
                                batch_sampler=val_BatchSampler,
                                pin_memory=config.val.dataloader.pin_memory,
                                num_workers=config.val.dataloader.work_nums)
    else:
        if dist.is_available() and dist.is_initialized():
            test_SequentialSampler = distributed.DistributedSampler(dataset)
        else:
            # None: DataLoader falls back to its default sequential order.
            test_SequentialSampler = None

        # NOTE(review): this branch reads pin_memory/work_nums from
        # config.val rather than config.test — confirm that is intentional.
        data_loader = DataLoader(dataset,
                                 sampler=test_SequentialSampler,
                                 batch_size=config.test.batch_size,
                                 pin_memory=config.val.dataloader.pin_memory,
                                 num_workers=config.val.dataloader.work_nums)
    return data_loader
Esempio n. 20
0
    def __init__(self, opt, shared=None):
        """Teacher initialisation around a single dataset class.

        When ``shared`` is None, builds the dataset and DataLoader
        (random sampling for non-stream training data, sequential
        otherwise) and optionally a background LoaderProcess; otherwise
        reuses dataset/dataloader/lastYs/data from ``shared``.
        """
        opt['batch_sort'] = False
        super().__init__(opt, shared)
        self.use_batch_act = self.bsz > 1
        self.num_workers = opt['numworkers']
        self.batch_cache_type = opt.get('batch_sort_cache')
        # One can specify a collate function to use for preparing a batch
        self.opt = copy.deepcopy(opt)
        self.is_shared = shared is not None
        dataset_classes, self.collate_fn = self.get_dataset_class(opt)
        opt['dataset_class'] = dataset_classes
        opt['collate_fn'] = self.collate_fn

        if not shared:
            self.dataset = dataset_classes(opt)
            # Shuffle only for non-streaming training data; streams must be
            # read in order.
            if self.datatype == 'train' and not isinstance(
                    self.dataset, StreamDataset):
                data_sampler = sampler.RandomSampler(self.dataset)
            else:
                data_sampler = sampler.SequentialSampler(self.dataset)
            # pin_memory only when not streaming.
            pin_memory = not isinstance(self.dataset, StreamDataset)
            self.pytorch_dataloader = DataLoader(
                self.dataset,
                batch_size=self.bsz,
                shuffle=False,
                sampler=data_sampler,
                num_workers=self.num_workers,
                collate_fn=self.collate_fn,
                pin_memory=pin_memory,
                drop_last=False,
            )
            self.lastYs = [None] * self.bsz
            if self.batch_cache_type != 'none':
                # Background process that pre-sorts/caches batches.
                self.loader_process = LoaderProcess(opt)
                self.loader_process.start()
            self.data = enumerate(self.pytorch_dataloader)
        else:
            self.dataset = shared['dataset']
            self.pytorch_dataloader = shared['pytorch_dataloader']
            self.lastYs = shared['lastYs']
            self.data = shared['data']

        # ceil: a final partial batch still counts as one batch.
        self.num_batches = math.ceil(self.dataset.num_episodes() / self.bsz)
        self.reset()
Esempio n. 21
0
 def __init__(self, opt):
     """Daemon loader wrapping ``opt['dataset_class']`` in a sequential,
     non-shuffled DataLoader."""
     super().__init__(daemon=True)
     self.dataset = opt['dataset_class'](opt)
     self.bsz = opt.get('batchsize', 1)
     self.num_workers = opt.get('num_workers', 4)
     # Optional custom batch collation; falls back to torch's default_collate.
     collate_fn = opt.get('collate_fn', default_collate)
     self.dataloader = DataLoader(
         self.dataset,
         batch_size=self.bsz,
         shuffle=False,
         sampler=sampler.SequentialSampler(self.dataset),
         num_workers=self.num_workers,
         collate_fn=collate_fn,
         pin_memory=False,
         drop_last=False,
         )
     self.datatype = opt.get('datatype')
     # Lazy (batch_index, batch) iterator over the dataloader.
     self.data = enumerate(self.dataloader)
     self.batch_cache_type = opt.get('batch_sort_cache')
     self.batch_length_range = opt.get('batch_length_range')
Esempio n. 22
0
    def make_train_valid_loaders(self,
                                 distributed=False
                                 ) -> Tuple[DataLoader, DataLoader]:
        """Build the training and validation DataLoaders.

        Training uses a WeightedRandomSampler with uniform weights (i.e.
        random sampling with replacement) whose epoch length is
        ``batch_size * steps_per_epoch``; validation is sequential.
        """
        train_dataset, valid_dataset = self.make_train_valid_datasets()

        train_weights = torch.DoubleTensor(
            [1.0] * len(train_dataset))  # uniform sampling
        train_sampler = sampler.WeightedRandomSampler(
            weights=train_weights,
            num_samples=self._data_params['batch_size'] *
            self._data_params['steps_per_epoch'],
        )
        train_loader = self._make_loader(train_dataset,
                                         train_sampler,
                                         mode='train',
                                         distributed=distributed)
        valid_loader = self._make_loader(
            valid_dataset,
            sampler.SequentialSampler(valid_dataset),
            mode='valid',
            distributed=distributed,
        )
        return train_loader, valid_loader
Esempio n. 23
0
def evaluate_pred(config):
    """Evaluate a video-prediction model on the test set.

    Depending on ``config.model`` this runs FutureGAN's generator (optionally
    recursively, ``config.deep_pred`` times), a CopyLast baseline that repeats
    the last observed frame, or loads pre-computed predictions from disk.
    For every test video the metrics named in ``config.metrics`` are computed
    per predicted frame and per video; sample frames and gifs can be dumped
    periodically. Mean statistics are saved as ``.pt`` files under the derived
    test directory.

    Fixes vs. previous revision: ``is not 0`` identity comparisons replaced
    with ``!= 0`` (identity on int literals is implementation-defined and a
    SyntaxWarning since Python 3.8); ``os.makedirs(..., exist_ok=True)``
    replaces racy exists/makedirs pairs; metric accumulators allocated with
    ``torch.zeros`` instead of ``zeros_like`` over an uninitialized tensor;
    the per-video mean is computed once per video instead of once per frame.
    """
    # define directories
    model_name = config.model

    test_data_root = config.data_root
    if config.deep_pred > 1:
        test_dir = config.test_dir + '/' + config.experiment_name + '/deep-pred{}/'.format(
            config.deep_pred) + model_name
    else:
        test_dir = config.test_dir + '/' + config.experiment_name + '/pred/' + model_name
    os.makedirs(test_dir, exist_ok=True)
    sample_dir = test_dir + '/samples'
    os.makedirs(sample_dir, exist_ok=True)

    nframes_in = config.nframes_in
    # with recursive prediction the effective horizon is deep_pred chunks
    nframes_pred = config.nframes_pred * config.deep_pred
    nframes = nframes_in + nframes_pred
    img_size = int(config.resl)
    nworkers = 4

    # load model
    if config.model == 'FutureGAN':
        ckpt = torch.load(config.model_path)
        # checkpoint stores the full generator module plus its weights
        G = ckpt['G_structure']
        G.load_state_dict(ckpt['state_dict'])
        G.eval()
        G = G.module.model
        if use_cuda:
            G = G.cuda()
        print(' ... loading FutureGAN`s FutureGenerator from checkpoint: {}'.
              format(config.model_path))

    # load test dataset
    transform = transforms.Compose([
        transforms.Resize(size=(img_size, img_size),
                          interpolation=Image.NEAREST),
        transforms.ToTensor(),
    ])
    if config.model == 'FutureGAN' or config.model == 'CopyLast':
        dataset_gt = VideoFolder(video_root=test_data_root,
                                 video_ext=config.ext,
                                 nframes=nframes,
                                 loader=video_loader,
                                 transform=transform)
        dataloader_gt = DataLoader(
            dataset=dataset_gt,
            batch_size=config.batch_size,
            sampler=sampler.SequentialSampler(dataset_gt),
            num_workers=nworkers)
    else:
        # external baseline: ground truth and predictions live in two subdirs
        dataset_gt = VideoFolder(video_root=test_data_root + '/in_gt',
                                 video_ext=config.ext,
                                 nframes=nframes,
                                 loader=video_loader,
                                 transform=transform)
        dataset_pred = VideoFolder(video_root=test_data_root + '/in_pred',
                                   video_ext=config.ext,
                                   nframes=nframes,
                                   loader=video_loader,
                                   transform=transform)
        dataloader_pred = DataLoader(
            dataset=dataset_pred,
            batch_size=config.batch_size,
            sampler=sampler.SequentialSampler(dataset_pred),
            num_workers=nworkers)
        dataloader_gt = DataLoader(
            dataset=dataset_gt,
            batch_size=config.batch_size,
            sampler=sampler.SequentialSampler(dataset_gt),
            num_workers=nworkers)
        data_iter_pred = iter(dataloader_pred)
    test_len = len(dataset_gt)
    data_iter_gt = iter(dataloader_gt)

    # save model structure to file
    if config.model == 'FutureGAN':
        # count model parameters
        nparams_g = count_model_params(G)
        with open(
                test_dir +
                '/model_structure_{}x{}.txt'.format(img_size, img_size),
                'w') as f:
            print('--------------------------------------------------', file=f)
            print('Sequences in test dataset: ', len(dataset_gt), file=f)
            print('Number of model parameters: ', file=f)
            print(nparams_g, file=f)
            print('--------------------------------------------------', file=f)
            print('Model structure: ', file=f)
            print(G, file=f)
            print('--------------------------------------------------', file=f)
            print(
                ' ... FutureGAN`s FutureGenerator has been loaded successfully from checkpoint ... '
            )
            print(' ... saving model struture to {}'.format(f))

    # save test configuration
    with open(test_dir + '/eval_config.txt', 'w') as f:
        print('------------- test configuration -------------', file=f)
        for l, m in vars(config).items():
            print(('{}: {}').format(l, m), file=f)
        print(' ... loading test configuration ... ')
        print(' ... saving test configuration {}'.format(f))

    # define tensors
    if config.model == 'FutureGAN':
        print(' ... testing FutureGAN ...')
        if config.deep_pred > 1:
            print(
                ' ... recursively predicting {}x{} future frames from {} input frames ...'
                .format(config.deep_pred, config.nframes_pred, nframes_in))
        else:
            print(' ... predicting {} future frames from {} input frames ...'.
                  format(nframes_pred, nframes_in))
    # pre-allocated buffers, layout (batch, channels, frames, H, W)
    z = Variable(
        torch.FloatTensor(config.batch_size, config.nc, nframes_in, img_size,
                          img_size))
    z_in = Variable(
        torch.FloatTensor(config.batch_size, config.nc, nframes_in, img_size,
                          img_size))
    x_pred = Variable(
        torch.FloatTensor(config.batch_size, config.nc, nframes_pred, img_size,
                          img_size))
    x = Variable(
        torch.FloatTensor(config.batch_size, config.nc, nframes, img_size,
                          img_size))
    x_eval = Variable(
        torch.FloatTensor(config.batch_size, config.nc, nframes_pred, img_size,
                          img_size))

    # define tensors for evaluation
    if config.metrics is not None:
        print(' ... evaluating {} ...'.format(model_name))
        if 'ms_ssim' in config.metrics and img_size < 32:
            raise ValueError(
                'For calculating `ms_ssim`, your dataset must consist of images at least of size 32x32!'
            )

        metrics_values = {}
        for metric_name in config.metrics:
            # per-frame and per-video accumulators, filled in the test loop
            metrics_values['{}_frames'.format(metric_name)] = torch.zeros(
                test_len, nframes_pred)
            metrics_values['{}_avg'.format(metric_name)] = torch.zeros(
                test_len, 1)
            print(' ... calculating {} ...'.format(metric_name))

    # test loop
    if config.metrics is not None:
        # running index of the video currently being scored, per metric
        metrics_i_video = {}
        for metric_name in config.metrics:
            metrics_i_video['{}_i_video'.format(metric_name)] = 0

    i_save_video = 1
    i_save_gif = 1

    for step in tqdm(range(len(data_iter_gt))):

        # input frames: split each clip into observed (z) and future (x_eval)
        x.data = next(data_iter_gt)
        x_eval.data = x.data[:, :, nframes_in:, :, :]
        z.data = x.data[:, :, :nframes_in, :, :]

        if use_cuda:
            x = x.cuda()
            x_eval = x_eval.cuda()
            z = z.cuda()
            x_pred = x_pred.cuda()

        # predict video frames
        # !!! TODO !!! for deep_pred > 1: correctly implemented only if nframes_in == nframes_pred
        if config.model == 'FutureGAN':
            z_in.data = z.data
            for i_deep_pred in range(0, config.deep_pred):
                # write this chunk of predictions, then feed it back as input
                x_pred[:z_in.size(0), :, i_deep_pred *
                       config.nframes_pred:(i_deep_pred *
                                            config.nframes_pred) +
                       config.nframes_pred, :, :] = G(z_in).detach()
                z_in.data = x_pred.data[:, :,
                                        i_deep_pred * config.nframes_pred:
                                        (i_deep_pred * config.nframes_pred) +
                                        config.nframes_pred, :, :]

        elif config.model == 'CopyLast':
            # baseline: every predicted frame repeats the last observed frame
            for i_baseline_frame in range(x_pred.size(2)):
                x_pred.data[:x.size(0), :,
                            i_baseline_frame, :, :] = x.data[:, :, nframes_in -
                                                             1, :, :]

        else:
            # pre-computed predictions loaded from disk
            x_pred.data = next(data_iter_pred)[:x.size(0), :,
                                               nframes_in:, :, :]

        # calculate eval statistics
        if config.metrics is not None:
            for metric_name in config.metrics:
                calculate_metric = getattr(eval_metrics,
                                           'calculate_{}'.format(metric_name))

                for i_batch in range(x.size(0)):
                    i_video = metrics_i_video['{}_i_video'.format(metric_name)]
                    for i_frame in range(nframes_pred):
                        metrics_values['{}_frames'.format(metric_name)][
                            i_video, i_frame] = calculate_metric(
                                x_pred[i_batch, :, i_frame, :, :],
                                x_eval[i_batch, :, i_frame, :, :])
                    # per-video mean over all predicted frames (computed once,
                    # after the row is fully populated)
                    metrics_values['{}_avg'.format(metric_name)][
                        i_video] = torch.mean(
                            metrics_values['{}_frames'.format(metric_name)]
                            [i_video])
                    metrics_i_video['{}_i_video'.format(metric_name)] = (
                        i_video + 1)

        # save frames
        if config.save_frames_every != 0 and config.model == 'FutureGAN':
            if step % config.save_frames_every == 0 or step == 0:
                for i_save_batch in range(x.size(0)):
                    os.makedirs(sample_dir +
                                '/in_gt/video{:04d}'.format(i_save_video),
                                exist_ok=True)
                    os.makedirs(sample_dir +
                                '/in_pred/video{:04d}'.format(i_save_video),
                                exist_ok=True)
                    # observed frames are written to both in_gt and in_pred
                    for i_save_z in range(z.size(2)):
                        save_image_grid(
                            z.data[i_save_batch, :,
                                   i_save_z, :, :].unsqueeze(0), sample_dir +
                            '/in_gt/video{:04d}/video{:04d}_frame{:04d}_R{}x{}.png'
                            .format(i_save_video, i_save_video, i_save_z + 1,
                                    img_size, img_size), img_size, 1)
                        save_image_grid(
                            z.data[i_save_batch, :,
                                   i_save_z, :, :].unsqueeze(0), sample_dir +
                            '/in_pred/video{:04d}/video{:04d}_frame{:04d}_R{}x{}.png'
                            .format(i_save_video, i_save_video, i_save_z + 1,
                                    img_size, img_size), img_size, 1)
                    # future frames: ground truth to in_gt, prediction to in_pred
                    for i_save_x_pred in range(x_pred.size(2)):
                        save_image_grid(
                            x_eval.data[i_save_batch, :,
                                        i_save_x_pred, :, :].unsqueeze(0),
                            sample_dir +
                            '/in_gt/video{:04d}/video{:04d}_frame{:04d}_R{}x{}.png'
                            .format(i_save_video, i_save_video, i_save_x_pred +
                                    1 + nframes_in, img_size, img_size),
                            img_size, 1)
                        save_image_grid(
                            x_pred.data[i_save_batch, :,
                                        i_save_x_pred, :, :].unsqueeze(0),
                            sample_dir +
                            '/in_pred/video{:04d}/video{:04d}_frame{:04d}_R{}x{}.png'
                            .format(i_save_video, i_save_video, i_save_x_pred +
                                    1 + nframes_in, img_size, img_size),
                            img_size, 1)
                    i_save_video = i_save_video + 1

        # save gifs
        if config.save_gif_every != 0:
            if step % config.save_gif_every == 0 or step == 0:
                for i_save_batch in range(x.size(0)):
                    os.makedirs(sample_dir +
                                '/in_gt/video{:04d}'.format(i_save_gif),
                                exist_ok=True)
                    os.makedirs(sample_dir +
                                '/in_pred/video{:04d}'.format(i_save_gif),
                                exist_ok=True)
                    # ground-truth gif: observed frames then true futures
                    frames = []
                    for i_save_z in range(z.size(2)):
                        frames.append(
                            get_image_grid(
                                z.data[i_save_batch, :,
                                       i_save_z, :, :].unsqueeze(0), img_size,
                                1, config.in_border, config.npx_border))
                    for i_save_x_pred in range(x_pred.size(2)):
                        frames.append(
                            get_image_grid(
                                x_eval.data[i_save_batch, :,
                                            i_save_x_pred, :, :].unsqueeze(0),
                                img_size, 1, config.out_border,
                                config.npx_border))
                    imageio.mimsave(
                        sample_dir +
                        '/in_gt/video{:04d}/video{:04d}_R{}x{}.gif'.format(
                            i_save_gif, i_save_gif, img_size, img_size),
                        frames)
                    # prediction gif: observed frames then predicted futures
                    frames = []
                    for i_save_z in range(z.size(2)):
                        frames.append(
                            get_image_grid(
                                z.data[i_save_batch, :,
                                       i_save_z, :, :].unsqueeze(0), img_size,
                                1, config.in_border, config.npx_border))
                    for i_save_x_pred in range(x_pred.size(2)):
                        frames.append(
                            get_image_grid(
                                x_pred.data[i_save_batch, :,
                                            i_save_x_pred, :, :].unsqueeze(0),
                                img_size, 1, config.out_border,
                                config.npx_border))
                    imageio.mimsave(
                        sample_dir +
                        '/in_pred/video{:04d}/video{:04d}_R{}x{}.gif'.format(
                            i_save_gif, i_save_gif, img_size, img_size),
                        frames)
                    i_save_gif = i_save_gif + 1

    if config.save_frames_every != 0 and config.model == 'FutureGAN':
        print(' ... saving video frames to dir: {}'.format(sample_dir))
        if config.save_gif_every != 0:
            print(' ... saving gifs to dir: {}'.format(sample_dir))

    # calculate and save mean eval statistics
    if config.metrics is not None:
        metrics_mean_values = {}
        for metric_name in config.metrics:
            metrics_mean_values['{}_frames'.format(metric_name)] = torch.mean(
                metrics_values['{}_frames'.format(metric_name)], 0)
            metrics_mean_values['{}_avg'.format(metric_name)] = torch.mean(
                metrics_values['{}_avg'.format(metric_name)], 0)
            torch.save(
                metrics_mean_values['{}_frames'.format(metric_name)],
                os.path.join(test_dir, '{}_frames.pt'.format(metric_name)))
            torch.save(metrics_mean_values['{}_avg'.format(metric_name)],
                       os.path.join(test_dir, '{}_avg.pt'.format(metric_name)))

        print(' ... saving evaluation statistics to dir: {}'.format(test_dir))
    def __init__(self, opt, shared=None):
        """Teacher backed by a torch ``DataLoader``.

        When ``shared`` is given (a clone of an existing teacher), the
        dataset, dataloader and iteration state are reused from it; otherwise
        they are built here from ``opt``.
        """
        # batch sorting is handled here via the cache, not by the parent class
        opt['batch_sort'] = False
        super().__init__(opt, shared)
        self.use_batch_act = self.bsz > 1
        self.num_workers = opt['numworkers']
        # only sort batches (by length) when training
        self.batch_sort = opt.get('pytorch_teacher_batch_sort') and \
            'train' in self.datatype
        self.batch_cache_type = opt.get('batch_sort_cache_type')
        self.batch_sort_field = opt.get('batch_sort_field')
        # One can specify a collate function to use for preparing a batch
        self.opt = opt.copy()
        self.is_shared = shared is not None
        dataset_classes = self.get_dataset_class(opt)
        # 'ordered' (or unshuffled 'stream') datatypes must be read in order
        self.ordered = ('ordered' in self.datatype or
                        ('stream' in self.datatype and not opt.get('shuffle')))
        if self.ordered:
            # force index for ordered, so that we see every example
            warn_once('\nNote: You are using PytorchDataTeacher with ordered '
                      'examples. Please specify `--shuffle` if you would like '
                      'to have examples loaded in randomized order.\n')
            self.batch_cache_type = 'index'

        if not shared:
            BatchSortCache.create()
            if len(dataset_classes) > 1:
                # multi-task: build one dataset per task and concatenate them
                # NOTE(review): self.collate_fn keeps only the LAST task's
                # collate function — verify this is intended for mixed tasks
                datasets = []
                for class_name, collate_fn, task_name in dataset_classes:
                    dataset_opt = opt.copy()
                    dataset_opt['pytorch_teacher_task'] = task_name
                    dataset_opt['task'] = task_name
                    datasets.append(class_name(dataset_opt))
                    self.collate_fn = collate_fn
                # d[2] is the task name in each (class, collate, task) tuple
                self.id = ','.join([d[2] for d in dataset_classes])
                self.dataset = ParlAIConcatDataset(datasets)
            else:
                class_name, self.collate_fn, task_name = dataset_classes[0]
                self.id = task_name
                self.dataset = class_name(opt)
            # ordered/eval runs read sequentially; training shuffles
            if self.ordered or not self.training:
                data_sampler = sampler.SequentialSampler(self.dataset)
            else:
                data_sampler = sampler.RandomSampler(self.dataset)

            self.pytorch_dataloader = DataLoader(
                self.dataset,
                batch_size=self.bsz,
                sampler=data_sampler,
                num_workers=self.num_workers,
                collate_fn=self.collate_fn,
                pin_memory=False,
                drop_last=False,
            )

            self.lastYs = [None] * self.bsz
            if self.batch_sort:
                # background thread/process that pre-builds sorted batches
                self.loader_process = LoaderProcess(opt)
                self.loader_process.start()
            self.data = enumerate(self.pytorch_dataloader)
        else:
            # clone: reuse the already-built dataset, loader and iterator
            self.dataset = shared['dataset']
            self.pytorch_dataloader = shared['pytorch_dataloader']
            self.lastYs = shared['lastYs']
            self.data = shared['data']
            self.id = shared['id']

        self.num_batches = math.ceil(self.dataset.num_episodes() / self.bsz)
        self.reset()
Esempio n. 25
0
    def train_sup(self, epoch_lim, data, valid_data, early_stopping_lim,
                  batch_size, num_workers, track_embeddings, validation_rate, loss_weight_base=1,
                  value_weight=0, value_ratio=0):
        """
        Training loop
        :param epoch_lim: total number of training epochs
        :param data: training data
        :param valid_data: validation data
        :param early_stopping_lim: Number of epochs to run without validation improvement before stopping
        if None, never stop early
        :param batch_size: training batch_size
        :param num_workers: number of CPU workers to use for data loading
        :param track_embeddings: Save out embedding information at end of run
        :param validation_rate: Check validation performance every validation_rate training epochs
        :param loss_weight_base: A constant between 0 and 1 used to interpolate between Single (=0) and Multi (=1) Step forecasting.
        :param value_weight: A constant multiplier for the real-value loss, set to 0 in the paper
        :param value_ratio: The proportion of loss used for the MSE loss term (as opposed for the cross-entropy loss), set to 0 in the paper
        :return loss array, model:
        """
        # "never stop early" == allow as many stale epochs as total epochs
        if early_stopping_lim is None:
            early_stopping_lim = epoch_lim
        # NOTE(review): num_workers is accepted but not passed to either
        # DataLoader below — confirm whether that is intentional
        train_sampler = sampler.RandomSampler(np.arange(len(data)))
        data_train = DataLoader(data,
                                batch_size=batch_size,
                                sampler=train_sampler,
                                drop_last=True)

        # validation is read in order so results are reproducible
        valid_sampler = sampler.SequentialSampler(np.arange(len(valid_data)))
        data_valid = DataLoader(valid_data,
                                batch_size=batch_size,
                                sampler=valid_sampler)
        # global step counter, used as the x-axis for TensorBoard scalars
        step = 0

        # best validation loss seen so far
        bsf_loss = np.inf
        epochs_without_improvement = 0
        improvements = []
        for epoch in range(epoch_lim):
            if epochs_without_improvement > early_stopping_lim:
                print('Exceeded early stopping limit, stopping')
                break
            # periodic validation + early-stopping bookkeeping
            if epoch % validation_rate == 0:
                valid_loss = self.validation(data_valid=data_valid,
                                             step=step,
                                             data=data,
                                             loss_weight_base=loss_weight_base,
                                             value_weight=value_weight, value_ratio=value_ratio)
                (bsf_loss,
                 epochs_without_improvement,
                 improvements) = self.manage_early_stopping(bsf_loss=bsf_loss,
                                                            early_stopping_lim=early_stopping_lim,
                                                            epochs_without_improvement=epochs_without_improvement,
                                                            valid_loss=valid_loss, validation_rate=validation_rate,
                                                            improvements=improvements)
            running_train_loss = 0
            for inp, out, out_real, lens in tqdm(data_train):
                loss, y_p = forecast_model.get_loss(inp=inp,
                                                    out=out,
                                                    lens=lens,
                                                    cuda=True,
                                                    gn=self.model,
                                                    glucose_dat=data,
                                                    criterion=self.criterion,
                                                    base=loss_weight_base,
                                                    out_real=out_real,
                                                    value_weight=value_weight,
                                                    value_ratio=value_ratio)
                step += 1
                # NOTE(review): `.numpy()[0]` assumes a 1-element 1-D loss
                # tensor; on modern PyTorch a scalar loss is 0-dim and this
                # raises — `loss.item()` would be the safe form. Confirm the
                # torch version this targets before changing.
                running_train_loss += loss.data.cpu().numpy()[0]
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            # average per-batch loss for the epoch
            running_train_loss = running_train_loss/len(data_train)
            self.writer.add_scalar(tag='train_loss',
                                   scalar_value=running_train_loss,
                                   global_step=step)
        torch.save(self.model.state_dict(), '{}/final_sup.pt'.format(self.model_dir))
        if track_embeddings:
            self.embed(data_valid, step, embed_batch=100)
        return improvements
Esempio n. 26
0
def main():
    """Train hero2vec embeddings end to end.

    Usage: python <script> <draft_csv> <hero2ix_json>

    Loads the draft DataFrame and the hero->index mapping, splits the rows
    80/10/10 into train/dev/test, trains a CBOHBilayer model, prints top-k
    test accuracy, and writes the embeddings, the pickled model and a loss
    plot under ./output/hero/.

    Fix vs. previous revision: the pickle file is opened with a context
    manager instead of an unclosed ``open(...)`` passed straight to
    ``pickle.dump`` (which leaked the file handle); dead commented-out
    ``np.save`` calls removed.
    """
    data_dir = sys.argv[1]
    hero2ix_dir = sys.argv[2]

    # import DataFrame and hero2ix dictionary
    heroes_df_dota = pd.read_csv(data_dir, index_col=0)
    heroes_df_dota = heroes_df_dota.dropna().reset_index(drop=True)

    with open(hero2ix_dir, 'r') as fp:
        hero2ix = json.load(fp)

    print(len(heroes_df_dota))
    # 80/10/10 train/dev/test split by row position
    split_1 = int(len(heroes_df_dota) * 0.8)
    split_2 = int(len(heroes_df_dota) * 0.9)
    heroes_train_dota = heroes_df_dota.iloc[:split_1]
    heroes_dev_dota = heroes_df_dota.iloc[split_1:split_2]
    heroes_test_dota = heroes_df_dota.iloc[split_2:]

    # build dataset generator
    train_gen = DataFrameIterator(heroes_train_dota, hero2ix)
    dev_gen = DataFrameIterator(heroes_dev_dota, hero2ix)
    test_gen = DataFrameIterator(heroes_test_dota, hero2ix)

    # Use Dataloader class in pytorch to generate batched data;
    # train/dev are shuffled, test is read in order.
    batch_size = 16
    loader_train = DataLoader(train_gen,
                              batch_size=batch_size,
                              sampler=sampler.RandomSampler(train_gen),
                              num_workers=4)
    loader_dev = DataLoader(dev_gen,
                            batch_size=batch_size,
                            sampler=sampler.RandomSampler(dev_gen),
                            num_workers=4)

    loader_test = DataLoader(test_gen,
                             batch_size=batch_size,
                             sampler=sampler.SequentialSampler(test_gen),
                             num_workers=4)

    # define model, totally three models in hetor2vec.py
    model = CBOHBilayer(embedding_dim=20,
                        heropool_size=len(hero2ix),
                        hidden_dim=20)

    # define loss function
    loss_function = nn.CrossEntropyLoss()

    # run train
    losses = train(model=model,
                   dataloader=loader_train,
                   devloader=loader_dev,
                   loss_function=loss_function,
                   init_lr=0.1,
                   epochs=20,
                   lr_decay_epoch=8,
                   print_epoch=2,
                   gpu=True)

    # check test accuracy
    print(
        'Top3, Top5 and Top 10 accuracy: ',
        accuracy_in_train(model,
                          dataloader=loader_test,
                          batch_size=batch_size,
                          gpu=False))

    # save embeddings as numpy arrays
    output_dir = './output/hero/hero_embeddings.npy'
    save_embeddings(model, filename=output_dir)

    # pickle model — context manager guarantees the handle is closed/flushed
    pickle_dir = './output/hero/model.p'
    with open(pickle_dir, 'wb') as model_file:
        pickle.dump(obj=model, file=model_file)

    # plot loss vs epoch (the 'loss_hitory' filename typo is preserved so
    # existing consumers of this path keep working)
    plot_loss(losses, './output/hero/loss_hitory.png')

    # project embeddings to 2d plane
    plot_embeddings(model, hero2ix)
Esempio n. 27
0
		drop_last   = kwargs.get('drop_last', False)
		if isinstance(sampler, torchsampler.BatchSampler):
			return sampler
		if sampler == None:
			sampler = torchsampler.RandomSampler(self.dataset)
		elif not isinstance(sampler, torchsampler.Sampler):
			sampler = torchsampler.RandomSampler(sampler)
		return torchsampler.BatchSampler(sampler, batch_size, drop_last)

# Test this module
if __name__ == '__main__':
	# Smoke tests: iterate FauxDataLoader over a Dataset with two sampler
	# styles and check each batch yields a list of torch.Tensor images.
	# NOTE(review): N_TESTS is 4 but only two tests are visible in this
	# chunk — the remaining tests presumably follow; confirm in the full file.
	N_TESTS = 4
	passed = 0
	dataset = Dataset(transform=unrel.TRANSFORM)
	# Test on a subset sampler
	batch_sampler = torchsampler.SequentialSampler(range(14))
	dataloader = FauxDataLoader(dataset, sampler=batch_sampler)
	for batch_i, batch in enumerate(dataloader):
		assert isinstance(batch['image'], list)
		for image in batch['image']:
			assert isinstance(image, torch.Tensor)
		# progress line: items consumed so far vs. loader length
		print('dataset count %3d / %3d' % ((1+batch_i) * dataloader.sampler.batch_size, len(dataloader)))
	passed += 1; print('OK %d/%d' % (passed, N_TESTS))
	# Test on a batched subset sampler
	batch_sampler = torchsampler.BatchSampler(torchsampler.SequentialSampler(range(14)), 3, False)
	dataloader = FauxDataLoader(dataset, sampler=batch_sampler)
	for batch_i, batch in enumerate(dataloader):
		assert isinstance(batch['image'], list)
		for image in batch['image']:
			assert isinstance(image, torch.Tensor)
		print('dataset count %3d / %3d' % ((1+batch_i) * dataloader.sampler.batch_size, len(dataloader)))
def train_template_network(loss='default'):
    """Obtain CIFAR10-trained template network.

    Trains ``args.arch`` on CIFAR10 using the hyper-parameters of the
    original ResNet paper, evaluates on the test split, and returns the
    trained model.

    Args:
        loss: Choose from 'default'/'sgm'/'l2'

    Returns:
        The trained model.

    Raises:
        NameError: if ``loss`` is not one of the recognized choices.
    """

    # Hyper-parameters taken from the original ResNet paper.
    split_index = 45000  # first 45k train images for training, rest for validation
    batch_size = 128
    lr = 1e-1
    momentum = 0.9
    weight_decay = 1e-4
    epoch = 180
    decay_milestones = [90, 120]
    decay_factor = 0.1

    # Weight of the auxiliary loss term (only used for 'sgm'/'l2').
    aux_loss_wt = 0.02

    norm_mean = [0.485, 0.456, 0.406]
    norm_std = [0.229, 0.224, 0.225]
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std)
    ])
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std)
    ])

    # Index 0 -> CIFAR10 train split, index 1 -> CIFAR10 test split.
    image_datasets = {
        0: datasets.CIFAR10(root=args.cifar10_dir, train=True,
                            download=True, transform=train_transform),
        1: datasets.CIFAR10(root=args.cifar10_dir, train=False,
                            download=True, transform=test_transform),
    }

    # 'train'/'val' carve the train split at split_index; 'test' walks the
    # test split in order. Each entry: (dataset index, sampler).
    split_plan = {
        'train': (0, sampler.SubsetRandomSampler(range(split_index))),
        'val': (0, sampler.SubsetRandomSampler(
            range(split_index, len(image_datasets[0])))),
        'test': (1, sampler.SequentialSampler(image_datasets[1])),
    }
    pin = 'cpu' not in args.device
    dataloaders = {
        phase: DataLoader(image_datasets[source],
                          batch_size=batch_size,
                          sampler=phase_sampler,
                          num_workers=args.num_workers,
                          pin_memory=pin)
        for phase, (source, phase_sampler) in split_plan.items()
    }
    dataset_sizes = {
        'train': split_index,
        'val': len(image_datasets[0]) - split_index,
        'test': len(image_datasets[1])
    }

    model = mutil.get_model(args.arch).to(device)
    if loss == 'default':
        criterion = torch.nn.CrossEntropyLoss().to(device)
    elif loss in ('sgm', 'l2'):
        criterion = GenericLoss(loss, aux_loss_wt, model.linear.out_features)
    else:
        raise NameError('{} is not recognized.'.format(loss))

    optimizer = torch.optim.SGD(mutil.get_model_trainable_parameters(model),
                                lr=lr,
                                momentum=momentum,
                                weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=decay_milestones, gamma=decay_factor)
    model, _ = mutil.train_model(model,
                                 criterion,
                                 optimizer,
                                 dataloaders,
                                 dataset_sizes,
                                 scheduler=scheduler,
                                 num_epochs=epoch,
                                 device=device)
    mutil.eval_model(model,
                     dataloaders['test'],
                     dataset_sizes['test'],
                     device=device)

    return model
    def extract_activation(self, data, labels=None, customfunction=None):
        """Run the model over `data` and return flattened layer activations.

        Args:
            data: np.ndarray of images, or a path (str) to an .npy file.
                4-D NHWC input (channel last) is transposed to NCHW.
            labels: optional np.ndarray (or .npy path) of labels; when set
                together with ``self.conditional``, activations are grouped
                by these labels instead of the model's own predictions.
            customfunction: optional callable applied to each layer's raw
                activation array before flattening.

        Returns:
            ``(acts, preds)`` — ``acts`` is a single 2-D array of
            concatenated per-layer features with ``preds`` the predicted
            class per sample, or, when ``self.conditional`` is true, a dict
            mapping label -> feature rows (``preds`` is then None).

        Raises:
            ValueError: if no activations were captured for the configured
                layers.
        """
        datapath = None
        # Serve from cache when an identical extraction was done before.
        cachekey, cacheddata = get_cached_data(data, self.modelpath, self.layers,  \
            conditional=self.conditional, classpath=self.classpath, customfunction=customfunction)
        if cachekey is not None:
            if isinstance(cacheddata, tuple):
                return cacheddata[0], cacheddata[1]
            else:
                return cacheddata, None

        # Forward hooks (registered elsewhere on self) append per-batch
        # outputs here, keyed by layer name.
        self.activations = collections.defaultdict(list)

        if isinstance(data, str):
            datapath = data
            data = np.load(data)

        if isinstance(labels, str):
            labels = np.load(labels)

        # Transpose NHWC -> NCHW when the input is 4-D channel-last.
        # TODO: more elegant to assert the shape against the input expected
        # by the model.
        if data.ndim == 4 and data.shape[3] == 3:
            data = data.transpose(0, 3, 1, 2)

        torchdata = torch.stack([torch.Tensor(i) for i in data])

        dataset = tdata.TensorDataset(torchdata)
        dataloader = tdata.DataLoader(dataset, batch_size=100, shuffle=False, num_workers=2, \
            sampler=sampler.SequentialSampler(dataset))

        # Collect per-batch predictions and concatenate once at the end;
        # repeated np.concatenate in the loop is O(n^2).
        pred_chunks = []
        with torch.no_grad():  # inference only — skip autograd bookkeeping
            for inputs in dataloader:
                y_pred = self.modelinstance(inputs[0])
                pred_chunks.append(np.argmax(y_pred.detach().numpy(), axis=1))
        y_prob = np.concatenate(pred_chunks, axis=0) if pred_chunks else None

        activations = {
            name: torch.cat(outputs, 0)
            for name, outputs in self.activations.items()
        }
        if not activations:
            raise ValueError('Could not extract from specified layer')

        merged_acts = None
        for _, act in activations.items():
            act = act.detach().numpy()
            # Apply the caller-supplied transformation, if any.
            if customfunction is not None and callable(customfunction):
                act = customfunction(act)

            # Flatten each sample's activation to one feature row; this
            # handles 2-D, 3-D and 4-D activations uniformly (the old
            # reshape-to-(n, shape[1]) branch failed for 3-D tensors).
            act = act.reshape(act.shape[0], -1)

            if merged_acts is None:
                merged_acts = act
            else:
                merged_acts = np.concatenate((merged_acts, act), axis=-1)

        if self.conditional:
            # Group feature rows by label: ground-truth labels when given,
            # otherwise the model's predictions.
            if labels is not None:
                y_prob = labels.reshape((-1, ))

            conditional_f_acts = {}
            for label in set(y_prob):
                # Boolean mask selects the rows belonging to this label.
                conditional_f_acts[label] = merged_acts[y_prob == label]

            write_to_cache(conditional_f_acts, datapath, self.modelpath, self.layers, \
                classpath=self.classpath, conditional=self.conditional)
            return conditional_f_acts, None

        else:
            write_to_cache((merged_acts, y_prob), datapath, self.modelpath, self.layers, \
                classpath=self.classpath, conditional=self.conditional)
            return merged_acts, y_prob
# ---- Esempio n. 30 (score: 0) ----
                            weight_method=params['weight_method'],
                            create_cache=params['create_cache'],
                            num_channels=1))

    dataset = ConcatDataset(dataset) if len(dataset) > 1 else dataset[0]
    target_type = params['target_type'] if params['target_type'] != 'spatial_bootstrap' else 'psa'
    val_dataset = WSJ0(folder=params['validation_folder'],
                       length='full',
                       n_fft=params['n_fft'],
                       hop_length=params['hop_length'],
                       output_type=target_type,
                       create_cache=True, #params['create_cache'],
                       num_channels=1)

# Choose how examples are drawn from the dataset.
if args.sample_strategy == 'sequential':
    sample_strategy = sampler.SequentialSampler(dataset)
elif args.sample_strategy == 'random':
    sample_strategy = sampler.RandomSampler(dataset)
else:
    # Previously an unrecognized strategy left `sample_strategy` unbound and
    # the DataLoader call below died with a confusing NameError; fail fast
    # with an explicit message instead.
    raise ValueError(
        'Unknown sample_strategy: {!r} (expected "sequential" or "random")'
        .format(args.sample_strategy))

dataloader = DataLoader(dataset,
                        batch_size=params['batch_size'],
                        num_workers=params['num_workers'],
                        sampler=sample_strategy)

# Probe one example: the one-hot source indicator's last axis sizes the
# number of attractors/sources the model must produce.
dummy_input, _, _, _, _, dummy_one_hot = dataset[0]

params['num_attractors'] = dummy_one_hot.shape[-1]
params['num_sources'] = params['num_attractors']
params['sample_rate'] = dataset.sr
dataset.reorder_sources = args.reorder_sources
val_dataset.reorder_sources = args.reorder_sources