Example #1
def get_data(transform, mode='train'):
    print('Loading data for "%s" ...' % mode)
    global dataset
    if args.dataset == 'ucf101':
        dataset = UCF101Dataset(mode=mode,
                                transform=transform,
                                seq_len=args.seq_len,
                                num_seq=args.num_seq,
                                downsample=args.ds,
                                which_split=args.split,
                                return_label=True)
    elif args.dataset == 'hmdb51':
        dataset = HMDB51Dataset(mode=mode,
                                transform=transform,
                                seq_len=args.seq_len,
                                num_seq=args.num_seq,
                                downsample=args.ds,
                                which_split=args.split,
                                return_label=True)
    elif args.dataset.split('_')[0] == 'CATER':
        dataset = CATERDataset(
            mode=mode,
            task=args.dataset.split('_', 1)[1],
            transform=transform,
            seq_len=args.seq_len,
            num_seq=args.num_seq,
            downsample=args.ds,
            #which_split=args.split,
            return_label=True)
    else:
        raise ValueError('dataset not supported')
    my_sampler = data.RandomSampler(dataset)
    if mode == 'train':
        data_loader = data.DataLoader(dataset,
                                      batch_size=args.batch_size,
                                      sampler=my_sampler,
                                      shuffle=False,
                                      num_workers=args.workers,
                                      pin_memory=True,
                                      drop_last=True)
    elif mode == 'val':
        data_loader = data.DataLoader(dataset,
                                      batch_size=args.batch_size,
                                      sampler=my_sampler,
                                      shuffle=False,
                                      num_workers=args.workers,
                                      pin_memory=True,
                                      drop_last=True)
    elif mode == 'test':
        data_loader = data.DataLoader(dataset,
                                      batch_size=1,
                                      sampler=my_sampler,
                                      shuffle=False,
                                      num_workers=args.workers,
                                      pin_memory=True)
    print('"%s" dataset size: %d' % (mode, len(dataset)))
    return data_loader, dataset
Example #2
def make_data_sampler(dataset,
                      is_train=True,
                      shuffle=True,
                      is_distributed=False):
    if is_train:
        sampler = dutils.RandomSampler(dataset)
    else:
        sampler = dutils.SequentialSampler(dataset)
    return sampler
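Note: the is_distributed flag above is accepted but never used. A minimal sketch of a variant that also covers the distributed case (mirroring Example #4 below); make_data_sampler_v2 is a hypothetical name and dutils is assumed to be torch.utils.data, as in the snippet:

import torch.utils.data as dutils
from torch.utils.data.distributed import DistributedSampler


def make_data_sampler_v2(dataset, is_train=True, shuffle=True, is_distributed=False):
    # Each process should see a distinct shard when training is distributed.
    if is_distributed:
        return DistributedSampler(dataset, shuffle=(is_train and shuffle))
    if is_train and shuffle:
        return dutils.RandomSampler(dataset)
    return dutils.SequentialSampler(dataset)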
Example #3
def get_random_data_loader(opt):
  '''
  Sample random batch. Used for evaluation during training.
  '''
  dset = get_dataset(opt)
  # Random sampler
  sampler = data.RandomSampler(dset)
  dloader = data.DataLoader(dset, batch_size=opt.batch_size, sampler=sampler)
  return dloader
Example #4
def data_sampler(dataset, shuffle, distributed):
    if distributed:
        return data.distributed.DistributedSampler(dataset, shuffle=shuffle)

    if shuffle:
        return data.RandomSampler(dataset)

    else:
        return data.SequentialSampler(dataset)
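A hedged usage sketch for the helper above; dataset, is_distributed and num_epochs are placeholders, not names from the example. The set_epoch call is what makes DistributedSampler reshuffle differently on every pass:

loader = data.DataLoader(dataset,
                         batch_size=32,
                         sampler=data_sampler(dataset, shuffle=True, distributed=is_distributed),
                         num_workers=4)

for epoch in range(num_epochs):
    if isinstance(loader.sampler, data.distributed.DistributedSampler):
        loader.sampler.set_epoch(epoch)  # reseed the shuffle for this epoch
    for batch in loader:
        pass  # training step goes here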
Example #5
    def __init__(self, dataset, batch_size, device='cpu'):
        super().__init__()
        self._dataset = dataset
        self.batch_size = batch_size
        self.device = device

        self._sampler = data.BatchSampler(
            data.RandomSampler(self._dataset, replacement=False),
            self.batch_size,
            drop_last=False)
Example #6
def prepare_loaders(args, datasets):
    collate_fn = collate_fns[args.task_type]
    loaders = {'source': {}, 'target': {}}
    loaders['source']['train'] = torchdata.DataLoader(
        datasets['source']['train'],
        batch_size=args.batch_size,
        num_workers=args.nthreads,
        sampler=torchdata.RandomSampler(datasets['source']['train'],
                                        replacement=True),
        collate_fn=collate_fn,
        drop_last=True)
    loaders['target']['labeled'] = torchdata.DataLoader(
        datasets['target']['labeled'],
        batch_size=args.batch_size,
        num_workers=args.nthreads,
        sampler=torchdata.RandomSampler(datasets['target']['labeled'],
                                        replacement=True),
        collate_fn=collate_fn,
        drop_last=True)
    loaders['target']['unlabeled'] = torchdata.DataLoader(
        datasets['target']['unlabeled'],
        batch_size=args.batch_size,
        num_workers=args.nthreads,
        sampler=torchdata.RandomSampler(datasets['target']['unlabeled'],
                                        replacement=True),
        collate_fn=collate_fn,
        drop_last=True)

    loaders['source']['validation'] = torchdata.DataLoader(
        datasets['source']['validation'],
        batch_size=1,
        num_workers=args.nthreads,
        collate_fn=collate_fn,
    )

    loaders['target']['validation'] = torchdata.DataLoader(
        datasets['target']['validation'],
        batch_size=1,
        num_workers=args.nthreads,
        collate_fn=collate_fn,
    )
    return loaders
Example #7
def init_train_loader(args, path_to_source_train, path_to_sampled_train):
    fmri_train = fastMRIData(path_to_source_train, path_to_sampled_train)
    if args.random_subset:
        sampler = torch_data.RandomSampler(fmri_train, replacement=True, num_samples=args.random_subset)
        train_loader = torch_data.DataLoader(fmri_train, sampler=sampler, batch_size=args.train_batch_size,
                                             shuffle=False, num_workers=args.loader_workers)
    else:
        train_loader = torch_data.DataLoader(fmri_train, batch_size=args.train_batch_size,
                                             shuffle=True, num_workers=args.loader_workers)

    return train_loader
Example #8
 def run_with_epoch(self):
     progress_bar = progressbar.ProgressBar(max_value=self.num_epoch)
     sampler = data.RandomSampler(torch.arange(self.num_data), replacement=True, num_samples=self.num_epoch)
     self.data_generator = data.DataLoader(mdp_dataset(self), batch_size=self.batch_size, sampler=sampler, num_workers=self.num_workers, drop_last=False)
     for batch_A_t, batch_b_t, batch_C_t, batch_t_m in self.data_generator:
         batch_size = batch_t_m.shape[0]
         for j in range(batch_size):
             A_t, b_t, C_t, t_m = svrg.get_stoc_data(self, batch_A_t, batch_b_t, batch_C_t, batch_t_m, j)
             self.theta.sub_(torch.mul(mspbe.mspbe_grad_theta(self.theta, self.omega, A_t, rho=self.rho), self.sigma_theta))
             self.omega.sub_(torch.mul(mspbe.mspbe_grad_omega(self.theta, self.omega, A_t, b_t, C_t, self.rho_omega), self.sigma_omega))
             self.end_of_epoch()
         progress_bar.update(self.cur_epoch)
Example #9
def data_sampler(dataset, shuffle, distributed, weights=None):
    if distributed:
        return data.distributed.DistributedSampler(dataset, shuffle=shuffle)

    if weights is not None:
        return data.WeightedRandomSampler(weights,
                                          len(weights),
                                          replacement=True)

    if shuffle:
        return data.RandomSampler(dataset)
    else:
        return data.SequentialSampler(dataset)
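WeightedRandomSampler expects one weight per sample and here draws len(weights) samples with replacement. A minimal sketch of building inverse-frequency weights for class balancing, assuming labels is a 1-D tensor of integer class ids (not part of the example above):

import torch

class_counts = torch.bincount(labels)                    # samples per class
per_sample_weights = 1.0 / class_counts[labels].float()  # rarer class -> larger weight
sampler = data_sampler(dataset, shuffle=True, distributed=False,
                       weights=per_sample_weights)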
Example #10
    def _split(self, valid_rate, shuffle_seed):
        self.indices = list(range(self.dataset_size))
        random.seed(shuffle_seed)
        random.shuffle(self.indices)
        split = int(np.floor((1 - valid_rate) * self.dataset_size))

        self.train_indices, self.valid_indices = self.indices[:split], self.indices[split:]
        self.train_dataset = data.Subset(self, self.train_indices)
        self.valid_dataset = data.Subset(self, self.valid_indices)

        self.train_sampler = data.RandomSampler(self.train_dataset)
        self.valid_sampler = data.SequentialSampler(self.valid_dataset)
        self.test_sampler = data.SequentialSampler(self)
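Since the split above already yields index lists, the training sampler could equally be a SubsetRandomSampler over the parent dataset, skipping the extra Subset wrapper; a minimal sketch (the batch size is illustrative):

train_sampler = data.SubsetRandomSampler(self.train_indices)
train_loader = data.DataLoader(self, batch_size=32, sampler=train_sampler)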
Example #11
    def _run(self):
        svrg.load_mdp_data(self)
        svrg.init_alg(self)
        full_dataset = mdp_dataset(self)
        scsg_batch_size = int(self.num_data * self.scsg_batch_size_ratio)
        geom_dist_p = 1/(scsg_batch_size+1)
        #rho = 1e-2*mspbe.calc_L_rho(self)

        if not self.terminate_if_less_than_epsilon:
            progress_bar = progressbar.ProgressBar(max_value=self.num_epoch + 50)
        while self.check_termination_cond():
            theta_tilde = self.theta.clone()
            omega_tilde = self.omega.clone()
            theta_tilde_grad, omega_tilde_grad = self.get_grad_theta_omega_from_batch_abc(self.theta, self.omega, full_dataset, torch.randperm(self.num_data)[:scsg_batch_size], scsg_batch_size, self.rho)

            torch.cuda.empty_cache()
            self.num_grad_eval += scsg_batch_size
            if self.record_per_dataset_pass: self.check_complete_data_pass()

            if self.use_geometric_dist:
                inner_loop_epoch = np.random.geometric(geom_dist_p)
            else:
                inner_loop_epoch = int(self.num_data * self.scsg_batch_size_ratio)
            sampler = data.RandomSampler(torch.arange(self.num_data), replacement=True, num_samples=inner_loop_epoch)
            data_generator = data.DataLoader(full_dataset, batch_size=self.batch_size, sampler=sampler, num_workers=self.num_workers, drop_last=False)

            for batch_A_t, batch_b_t, batch_C_t, batch_t_m in data_generator:
                batch_size = batch_t_m.shape[0]
                for j in range(batch_size):
                    A_t, b_t, C_t, t_m = svrg.get_stoc_data(self, batch_A_t, batch_b_t, batch_C_t, batch_t_m, j)
                    theta_grad = mspbe.mspbe_grad_theta(self.theta, self.omega, A_t, rho=self.rho) + theta_tilde_grad - mspbe.mspbe_grad_theta(theta_tilde, omega_tilde, A_t, rho=self.rho)
                    omega_grad = mspbe.mspbe_grad_omega(self.theta, self.omega, A_t, b_t, C_t, self.rho_omega) + omega_tilde_grad - mspbe.mspbe_grad_omega(theta_tilde,omega_tilde,A_t,b_t,C_t, self.rho_omega)
                    self.theta.sub_(torch.mul(theta_grad, self.sigma_theta))
                    self.omega.sub_(torch.mul(omega_grad, self.sigma_omega))
            self.num_grad_eval += inner_loop_epoch
            if self.record_per_dataset_pass: self.check_complete_data_pass()
            if self.record_before_one_pass: self.record_value_before_one_pass()

            # Temporary
            mspbe_at_epoch = float(mspbe.calc_mspbe_torch(self, self.rho).cpu().numpy())
            print('scsg ratio = '+ str(self.scsg_batch_size_ratio) + ' sigma_theta =' + str(self.sigma_theta) + ' sigma_omega = ' + str(self.sigma_omega) + ' scsg mspbe = %.5f' % (mspbe_at_epoch))

            self.end_of_epoch()
            if not self.terminate_if_less_than_epsilon:
                progress_bar.update(self.num_pass if self.record_per_dataset_pass else self.cur_epoch)

        svrg.end_of_exp(self)
        #Temporary
        if self.record_before_one_pass:
            return {'record_points_before_one_pass':self.record_points_before_one_pass, 'use_geom_dist':self.use_geometric_dist, 'theta':self.theta, 'omega':self.omega, 'result': self.result, 'sigma_theta': self.sigma_theta, 'sigma_omega': self.sigma_omega,'name': self.name, 'scsg_batch_size_ratio':self.scsg_batch_size_ratio, 'record_per_dataset_pass':self.record_per_dataset_pass, 'record_per_epoch':self.record_per_epoch, 'comp_cost':self.num_pass, 'rho': self.rho, 'rho_ac': self.rho_ac}
        else:
            return {'use_geom_dist': self.use_geometric_dist, 'theta': self.theta, 'omega': self.omega, 'result': self.result, 'sigma_theta': self.sigma_theta, 'sigma_omega': self.sigma_omega,
                    'name': self.name, 'scsg_batch_size_ratio': self.scsg_batch_size_ratio, 'record_per_dataset_pass': self.record_per_dataset_pass, 'record_per_epoch': self.record_per_epoch, 'comp_cost': self.num_pass, 'rho': self.rho, 'rho_ac': self.rho_ac}
Example #12
def data_loader(root, phase, batch_size, tokenizer, config):
    dataset = load_and_cache_examples(root, tokenizer, config=config, mode=phase)

    if phase == 'train':
        sampler = data.RandomSampler(dataset)
    else:
        sampler = data.SequentialSampler(dataset)

    dataloader = data.DataLoader(dataset=dataset, sampler=sampler, batch_size=batch_size)
    return dataloader


# from transformers import AutoTokenizer
# dataloader = data_loader('/home/ubuntu/aikorea/sbs/data', 'train', 32, AutoTokenizer.from_pretrained(config.bert_model_name))
# print(len(dataloader))
Example #13
def get_data(transform, mode='test'):
    print('Loading data for "%s" ...' % mode)
    dataset = deepfake_3d(out_dir=args.out_dir, mode=mode, transform=transform)

    sampler = data.RandomSampler(dataset)

    if mode == 'test':
        data_loader = data.DataLoader(dataset,
                                      batch_size=1,
                                      sampler=sampler,
                                      shuffle=False,
                                      num_workers=32,
                                      pin_memory=True,
                                      collate_fn=my_collate)
    print('"%s" dataset size: %d' % (mode, len(dataset)))
    return data_loader
Example #14
  def train_network(self, X_inputs, Y_labels):
    optimizer = optim.Adam(self.neural_network.parameters())
    X_inputs = torch.from_numpy(X_inputs).double()
    Y_labels = torch.from_numpy(Y_labels).double().view(len(Y_labels), 1)

    self.neural_network.train(True)
    for iteration in range(self.nb_iters):
      for batch in tdata.BatchSampler(
              tdata.RandomSampler(range(len(X_inputs)), replacement=False),
              batch_size=self.batch_size, drop_last=False):
        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
          outputs = self.neural_network(X_inputs[batch])
          loss = nn.MSELoss(reduction="mean")(outputs, Y_labels[batch])
          loss.backward()
          optimizer.step()
Example #15
File: train.py Project: ririw/mac
    def main(self, clevr_dir, preproc_dir, results_loc, log_loc=None):
        logging.basicConfig(level=logging.INFO)
        utils.cuda_message()
        np.set_printoptions(linewidth=139)

        clevr_fs = open_fs(clevr_dir, create=False)
        preproc_fs = open_fs(preproc_dir, create=True)

        dataset = datasets.TaskDataset(clevr_fs, preproc_fs, "train")
        total_words = len(dataset.word_ix) + 1
        logging.info("Total words: %s", total_words)
        sampler = data.BatchSampler(data.RandomSampler(dataset), 32, False)

        net = mac.MACNet(mac.MACRec(12, 512),
                         total_words).to(config.torch_device())
        opt = torch.optim.Adam(net.parameters())

        if log_loc:
            now = datetime.datetime.now()
            log_dir = f"{log_loc}/new-{now}"
            writer = tensorboardX.SummaryWriter(log_dir)
        else:
            writer = None

        step = 0
        rolling_accuracy = 0
        for epoch in range(10):
            bar = tqdm(sampler)
            for batch_ix in bar:
                opt.zero_grad()
                images, qns, qn_lens, answers = dataset[batch_ix]
                predictions = net(images, qns, qn_lens)

                loss = functional.cross_entropy(predictions, answers)
                loss.backward()
                opt.step()
                hard_preds = np.argmax(predictions.detach().cpu().numpy(), 1)
                accuracy = (
                    hard_preds == answers.detach().cpu().numpy()).mean()
                if writer is not None:
                    writer.add_scalar("loss", loss.item(), step)
                    writer.add_scalar("accuracy", accuracy, step)

                rolling_accuracy = rolling_accuracy * 0.95 + accuracy * 0.05
                bar.set_description("Accuracy: {}".format(rolling_accuracy))

                step += 1
Example #16
def get_data(transform, mode='train'):
    print('Loading data for "%s" ...' % mode)
    if args.dataset == 'k400':
        use_big_K400 = args.img_dim > 140
        dataset = Kinetics400_full_3d(mode=mode,
                                      transform=transform,
                                      seq_len=args.seq_len,
                                      num_seq=args.num_seq,
                                      downsample=5,
                                      big=use_big_K400)
    elif args.dataset == 'ucf101':
        dataset = UCF101_3d(mode=mode,
                            transform=transform,
                            seq_len=args.seq_len,
                            num_seq=args.num_seq,
                            downsample=args.ds)
    elif args.dataset == 'nturgbd':
        dataset = NTURGBD_3D(mode=mode,
                             transform=transform,
                             seq_len=args.seq_len,
                             num_seq=args.num_seq,
                             downsample=args.ds,
                             train_csv=args.train_csv,
                             val_csv=args.test_csv)
    else:
        raise ValueError('dataset not supported')

    sampler = data.RandomSampler(dataset)

    if mode == 'train':
        data_loader = data.DataLoader(dataset,
                                      batch_size=args.batch_size,
                                      sampler=sampler,
                                      shuffle=False,
                                      num_workers=32,
                                      pin_memory=True,
                                      drop_last=True)
    elif mode == 'val':
        data_loader = data.DataLoader(dataset,
                                      batch_size=args.batch_size,
                                      sampler=sampler,
                                      shuffle=False,
                                      num_workers=32,
                                      pin_memory=True,
                                      drop_last=True)
    print('"%s" dataset size: %d' % (mode, len(dataset)))
    return data_loader
Example #17
 def data_iterator(self):
     while True:
         if self.training:
             random.shuffle(self.files)
         for f_id in range(self.num_files):
             data_file = self.files[f_id]
             train_data = BertPretrainingPreprocessedDataset(
                 input_file=data_file, max_pred_length=self.max_pred_length)
             train_sampler = pt_data.RandomSampler(train_data)
             train_dataloader = pt_data.DataLoader(
                 dataset=train_data,
                 batch_size=self.batch_size,
                 collate_fn=self._collate_fn,
                 shuffle=train_sampler is None,
                 sampler=train_sampler)
             for x in train_dataloader:
                 yield x
Example #18
    def data_loader_from_dataset(dset, batch_size=64, num_workers=2,
                      batches_per_epoch=None, random_sample=True,
                      shuffle=False, **kwargs):
        if random_sample:
            if batches_per_epoch is None:
                batches_per_epoch = len(dset) // batch_size

            dataloader = data.DataLoader(dset, batch_size=batch_size,
                                          sampler=data.RandomSampler(dset,
                                                                      replacement=True,
                                                                      num_samples=batches_per_epoch * batch_size),
                                          shuffle=shuffle, num_workers=num_workers,
                                          **kwargs)
        else:
            dataloader = data.DataLoader(dset, batch_size=batch_size,
                                          shuffle=shuffle, num_workers=num_workers,
                                          **kwargs)
        return dataloader
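Because the sampler above draws batches_per_epoch * batch_size indices with replacement, the epoch length is decoupled from the dataset size. A hedged usage sketch; my_dataset is a placeholder and the helper is assumed to be callable as a plain function:

loader = data_loader_from_dataset(my_dataset, batch_size=64, batches_per_epoch=100)
# 100 * 64 sampled indices at batch_size 64 -> exactly 100 batches per epoch
assert len(loader) == 100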
Example #19
    def train_dataset(self, 
            dataset=None, 
            batch_size=32,
            iters_per_validation=1000,
            early_stopping=True,
            early_stopping_patience=10,
            validation_fraction=0.1,
            num_workers=2):
        
        validation_size = int(len(dataset[0]) * validation_fraction) + 1
        training_set = Dataset([d[:-validation_size] for d in dataset])
        training_loader = D.DataLoader(
                training_set,
                batch_size=batch_size,
                sampler=D.RandomSampler(training_set, replacement=True), 
                pin_memory=True,
                num_workers=2)
        validation_set = Dataset([d[-validation_size:] for d in dataset])
        validation_loader = D.DataLoader(
                validation_set, 
                batch_size=1024, 
                num_workers=2)
        
        fail = 0
        #states, values, variance, policy, weights
        loss_avg = 0
        idx = 0
        while True:
            for batch in training_loader:
                idx += 1
                l = self.train(batch)
                loss_avg += l[0]
                if ( idx + 1 ) % iters_per_validation == 0:

                    l_val = 0
                    """
                    for b in validation_loader:
                        l = self.compute_loss(b)
                        l_val += l[0] * len(b[0])
                    l_val /= validation_size
                    """
                    print(loss_avg/iters_per_validation, l_val)
Example #20
    def _make_batch_loader(self, batch_size=None, shuffle=None, num_samples=200000):
        nb_threads = self.nb_threads
        batch_size = self.batch_size if batch_size is None else batch_size
        shuffle = self.shuffle if shuffle is None else shuffle

        if shuffle:
            sampler = data.RandomSampler(self, replacement=True, num_samples=min(num_samples, len(self)))
            shuffle = None
        else:
            sampler = None

        batch_loader = data.DataLoader(
            dataset=self,
            batch_size=batch_size,
            shuffle=shuffle,
            pin_memory=self.pin_memory,
            num_workers=nb_threads,
            collate_fn=self.collate_fn,
            sampler=sampler)
        return batch_loader
Example #21
def get_tinyImgNet_train_loader(batch_size,
                                shuffle=True,
                                transform_type='none',
                                bootstrap=-1):
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    if transform_type == 'all':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(size=64,
                                         scale=(0.2, 1),
                                         ratio=(0.8, 1.2)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(), normalize
        ])
    elif transform_type == 'flip':
        train_transform = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(), normalize
        ])
    elif transform_type == 'none':
        train_transform = transforms.Compose(
            [transforms.ToTensor(), normalize])
    else:
        raise ValueError(
            "'transform_type' should be 'none', 'flip', or 'all'. Got {}.".
            format(transform_type))

    dset = TINDataset(is_train=True, transform=train_transform)

    if bootstrap > 0:
        # NOTE: when using a sampler, 'shuffle' has to be False.
        sampler = data.RandomSampler(dset,
                                     replacement=True,
                                     num_samples=int(
                                         min(1, bootstrap) * len(dset)))
        return data.DataLoader(dset,
                               batch_size=batch_size,
                               shuffle=False,
                               sampler=sampler)
    else:
        return data.DataLoader(dset, batch_size=batch_size, shuffle=shuffle)
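With replacement=True and num_samples close to len(dset), each epoch is effectively a bootstrap resample of the training set, the usual way to feed a bagging ensemble. A minimal sketch that builds several independently resampled loaders; the arguments are illustrative:

bootstrap_loaders = [
    get_tinyImgNet_train_loader(batch_size=128, bootstrap=1.0)
    for _ in range(5)  # one loader per ensemble member
]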
Example #22
def get_dataloader(dataset, mode, args):
    print("Creating data loaders")
    train_sampler = data.RandomSampler(dataset) 
    val_sampler = None 

    if mode == 'train':
        data_loader = data.DataLoader(
            dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
            num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)
    
    elif mode == 'val':
        data_loader = data.DataLoader(
            dataset, batch_size=args.batch_size, shuffle=(val_sampler is None),
            num_workers=args.workers, pin_memory=True, sampler=val_sampler, drop_last=True)
    
    elif mode == 'test':
        data_loader = data.DataLoader(
            dataset, batch_size=1, shuffle=True,
            num_workers=args.workers, pin_memory=True)
    print('"%s" dataset size: %d' % (mode, len(dataset)))
    return data_loader
Example #23
    def __init__(self,
                 dataset: data.Dataset,
                 mask: bool,
                 batch_size: int,
                 initial_temperature: float,
                 drop_last: bool = False,
                 device='cpu'):
        super().__init__()
        self._dataset = dataset
        self.mask = mask
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.device = device

        self._temperature = initial_temperature

        # * TODO: A better than random sampler that take into account sample length
        self._sampler = data.BatchSampler(
            data.RandomSampler(self._dataset, replacement=False),
            batch_size=self.batch_size,
            drop_last=self.drop_last)
Example #24
 def _setup_dataloader_from_config(self, cfg: DictConfig):
     if cfg.get("load_from_cached_dataset", False):
         logging.info('Loading from cached dataset %s' %
                      (cfg.src_file_name))
         if cfg.src_file_name != cfg.tgt_file_name:
             raise ValueError(
                 "src must be equal to target for cached dataset")
         dataset = pickle.load(open(cfg.src_file_name, 'rb'))
         dataset.reverse_lang_direction = cfg.get("reverse_lang_direction",
                                                  False)
     else:
         dataset = TranslationDataset(
             dataset_src=str(Path(cfg.src_file_name).expanduser()),
             dataset_tgt=str(Path(cfg.tgt_file_name).expanduser()),
             tokens_in_batch=cfg.tokens_in_batch,
             clean=cfg.get("clean", False),
             max_seq_length=cfg.get("max_seq_length", 512),
             min_seq_length=cfg.get("min_seq_length", 1),
             max_seq_length_diff=cfg.get("max_seq_length_diff", 512),
             max_seq_length_ratio=cfg.get("max_seq_length_ratio", 512),
             cache_ids=cfg.get("cache_ids", False),
             cache_data_per_node=cfg.get("cache_data_per_node", False),
             use_cache=cfg.get("use_cache", False),
             reverse_lang_direction=cfg.get("reverse_lang_direction",
                                            False),
         )
         dataset.batchify(self.encoder_tokenizer, self.decoder_tokenizer)
     if cfg.shuffle:
         sampler = pt_data.RandomSampler(dataset)
     else:
         sampler = pt_data.SequentialSampler(dataset)
     return torch.utils.data.DataLoader(
         dataset=dataset,
         batch_size=1,
         sampler=sampler,
         num_workers=cfg.get("num_workers", 2),
         pin_memory=cfg.get("pin_memory", False),
         drop_last=cfg.get("drop_last", False),
     )
Example #25
 def get_data_loader(self, examples, args):
     features_0 = bert.convert_examples_to_features(
         [x[0] for x in examples], self.get_labels(),
         args.max_seq_length, self.tokenizer)
     features_1 = bert.convert_examples_to_features(
         [x[1] for x in examples], self.get_labels(),
         args.max_seq_length, self.tokenizer)
     features = list(zip(features_0, features_1))
     input_ids_0 = torch.tensor([f[0].input_ids for f in features],
                                dtype=torch.long)
     input_mask_0 = torch.tensor([f[0].input_mask for f in features],
                                 dtype=torch.long)
     segment_ids_0 = torch.tensor([f[0].segment_ids for f in features],
                                  dtype=torch.long)
     input_ids_1 = torch.tensor([f[1].input_ids for f in features],
                                dtype=torch.long)
     input_mask_1 = torch.tensor([f[1].input_mask for f in features],
                                 dtype=torch.long)
     segment_ids_1 = torch.tensor([f[1].segment_ids for f in features],
                                  dtype=torch.long)
     label_ids = torch.tensor([f[0].label_id for f in features],
                              dtype=torch.long)
     ids = [x[0].guid for x in examples]
     tensors = td.TensorDataset(
         input_ids_0, input_mask_0, segment_ids_0,
         input_ids_1, input_mask_1, segment_ids_1,
         label_ids)
     train_data = ARCTDataset(ids, tensors)
     if args.local_rank == -1:
         train_sampler = td.RandomSampler(train_data)
     else:
         train_sampler = td.DistributedSampler(train_data)
     data_loader = td.DataLoader(
         dataset=train_data,
         sampler=train_sampler,
         batch_size=args.train_batch_size,
         collate_fn=collate)
     return data_loader
Example #26
def verka_300w_w2_boot(enc):

    sum_loss = 0
    n = len(LazyLoader.w300().test_dataset)
    loader = torch_data.DataLoader(LazyLoader.w300().test_dataset,
                                   batch_size=16,
                                   drop_last=False,
                                   sampler=torch_data.RandomSampler(
                                       LazyLoader.w300().test_dataset,
                                       replacement=True,
                                       num_samples=n),
                                   num_workers=20)

    for i, batch in enumerate(loader):
        data = batch['data'].cuda()
        landmarks = batch["meta"]["keypts_normalized"].cuda()
        pred = enc(data)["mes"].coord
        eye_dist = landmarks[:, 45] - landmarks[:, 36]
        eye_dist = eye_dist.pow(2).sum(dim=1).sqrt()
        sum_loss += (OTWasDist().forward(pred, landmarks) /
                     eye_dist).sum().item()
    # print("test brule_loss: ", sum_loss / n)
    return sum_loss / n
Example #27
def create_data_loaders(train_dataset: data.Dataset, val_dataset: data.Dataset,
                        num_workers: int, batch_size: int):
    logging.info(
        f'creating dataloaders with {num_workers} workers and a batch-size of {batch_size}'
    )
    fn_dataloader = functools.partial(
        data.DataLoader,
        batch_size=batch_size,
        num_workers=num_workers,
        collate_fn=collate_fn,
        pin_memory=True,
    )

    train_loader = fn_dataloader(train_dataset, shuffle=True)

    train_metrics_sampler = data.RandomSampler(train_dataset,
                                               replacement=True,
                                               num_samples=len(val_dataset))
    train_metrics_loader = fn_dataloader(train_dataset,
                                         sampler=train_metrics_sampler)

    val_metrics_loader = fn_dataloader(val_dataset)

    return train_loader, train_metrics_loader, val_metrics_loader
Example #28
def main(args):
    utils.init_distributed_mode(args)

    device = torch.device(args.gpus)

    in_chns = 3
    if args.vision_type == 'monochromat':
        in_chns = 1
    elif 'dichromat' in args.vision_type:
        in_chns = 2
    data_reading_kwargs = {
        'target_size': args.target_size,
        'colour_vision': args.vision_type,
        'colour_space': args.colour_space
    }
    dataset, num_classes = utils.get_dataset(args.dataset, args.data_dir,
                                             'train', **data_reading_kwargs)

    json_file_name = os.path.join(args.out_dir, 'args.json')
    with open(json_file_name, 'w') as fp:
        json.dump(dict(args._get_kwargs()), fp, sort_keys=True, indent=4)

    dataset_test, _ = utils.get_dataset(args.dataset, args.data_dir, 'val',
                                        **data_reading_kwargs)

    if args.distributed:
        train_sampler = torch_dist.DistributedSampler(dataset)
        test_sampler = torch_dist.DistributedSampler(dataset_test)
    else:
        train_sampler = torch_data.RandomSampler(dataset)
        test_sampler = torch_data.SequentialSampler(dataset_test)

    data_loader = torch_data.DataLoader(dataset,
                                        batch_size=args.batch_size,
                                        sampler=train_sampler,
                                        num_workers=args.workers,
                                        collate_fn=utils.collate_fn,
                                        drop_last=True)

    data_loader_test = torch_data.DataLoader(dataset_test,
                                             batch_size=1,
                                             sampler=test_sampler,
                                             num_workers=args.workers,
                                             collate_fn=utils.collate_fn)

    if args.network_name == 'unet':
        model = segmentation_models.unet.model.Unet(
            encoder_weights=args.backbone, classes=num_classes)
        if args.pretrained:
            print('Loading %s' % args.pretrained)
            checkpoint = torch.load(args.pretrained, map_location='cpu')
            remove_keys = []
            for key_ind, key in enumerate(checkpoint['state_dict'].keys()):
                if 'segmentation_head' in key:
                    remove_keys.append(key)
            for key in remove_keys:
                del checkpoint['state_dict'][key]
            model.load_state_dict(checkpoint['state_dict'], strict=False)
    elif args.custom_arch:
        print('Custom model!')
        backbone_name, customs = model_utils.create_custom_resnet(
            args.backbone, None)
        if customs is not None:
            args.backbone = {'arch': backbone_name, 'customs': customs}

        model = custom_models.__dict__[args.network_name](
            args.backbone, num_classes=num_classes, aux_loss=args.aux_loss)

        if args.pretrained:
            print('Loading %s' % args.pretrained)
            checkpoint = torch.load(args.pretrained, map_location='cpu')
            num_all_keys = len(checkpoint['state_dict'].keys())
            remove_keys = []
            for key_ind, key in enumerate(checkpoint['state_dict'].keys()):
                if key_ind > (num_all_keys - 3):
                    remove_keys.append(key)
            for key in remove_keys:
                del checkpoint['state_dict'][key]
            pretrained_weights = OrderedDict(
                (k.replace('segmentation_model.', ''), v)
                for k, v in checkpoint['state_dict'].items())
            model.load_state_dict(pretrained_weights, strict=False)
    else:
        model = seg_models.__dict__[args.network_name](
            num_classes=num_classes,
            aux_loss=args.aux_loss,
            pretrained=args.pretrained)
    model.to(device)
    if args.distributed:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    best_iou = 0
    model_progress = []
    model_progress_path = os.path.join(args.out_dir, 'model_progress.csv')
    # load the model if training is to be resumed
    if args.resume is not None:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        best_iou = checkpoint['best_iou']
        # if model progress exists, load it
        if os.path.exists(model_progress_path):
            model_progress = np.loadtxt(model_progress_path, delimiter=',')
            model_progress = model_progress.tolist()

    master_model = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpus])
        master_model = model.module

    if args.network_name == 'unet':
        params_to_optimize = model.parameters()
    else:
        params_to_optimize = [
            {
                'params': [
                    p for p in master_model.backbone.parameters()
                    if p.requires_grad
                ]
            },
            {
                'params': [
                    p for p in master_model.classifier.parameters()
                    if p.requires_grad
                ]
            },
        ]
        if args.aux_loss:
            params = [
                p for p in master_model.aux_classifier.parameters()
                if p.requires_grad
            ]
            params_to_optimize.append({'params': params, 'lr': args.lr * 10})
    optimizer = torch.optim.SGD(params_to_optimize,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    lr_lambda = lambda x: (1 - x / (len(data_loader) * args.epochs))**0.9
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    criterion = select_criterion(args.dataset)

    start_time = time.time()
    for epoch in range(args.initial_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_log = train_one_epoch(model, criterion, optimizer, data_loader,
                                    lr_scheduler, device, epoch,
                                    args.print_freq)
        val_confmat = utils.evaluate(model,
                                     data_loader_test,
                                     device=device,
                                     num_classes=num_classes)
        val_log = val_confmat.get_log_dict()
        is_best = val_log['iou'] > best_iou
        best_iou = max(best_iou, val_log['iou'])
        model_data = {
            'epoch': epoch + 1,
            'arch': args.network_name,
            'customs': {
                'aux_loss': args.aux_loss,
                'pooling_type': args.pooling_type,
                'in_chns': in_chns,
                'num_classes': num_classes,
                'backbone': args.backbone
            },
            'state_dict': master_model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'target_size': args.target_size,
            'args': args,
            'best_iou': best_iou,
        }
        utils.save_on_master(model_data,
                             os.path.join(args.out_dir, 'checkpoint.pth'))
        if is_best:
            utils.save_on_master(model_data,
                                 os.path.join(args.out_dir, 'model_best.pth'))

        epoch_prog, header = add_to_progress(train_log, [], '')
        epoch_prog, header = add_to_progress(val_log,
                                             epoch_prog,
                                             header,
                                             prefix='v_')
        model_progress.append(epoch_prog)
        np.savetxt(model_progress_path,
                   np.array(model_progress),
                   delimiter=';',
                   header=header,
                   fmt='%s')

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Example #29
    def _setup_dataloader_from_config(self, cfg: DictConfig, predict_last_k=0):

        if cfg.get("use_tarred_dataset", False):
            if cfg.get("metadata_file") is None:
                raise FileNotFoundError(
                    "Trying to use tarred data set but could not find metadata path in config."
                )
            else:
                metadata_file = cfg.get('metadata_file')
                with open(metadata_file) as metadata_reader:
                    metadata = json.load(metadata_reader)
                if cfg.get('tar_files') is None:
                    tar_files = metadata.get('tar_files')
                    if tar_files is not None:
                        logging.info(
                            f'Loading from tarred dataset {tar_files}')
                    else:
                        raise FileNotFoundError(
                            "Could not find tarred dataset in config or metadata."
                        )
                else:
                    tar_files = cfg.get('tar_files')
                    if metadata.get('tar_files') is not None:
                        raise ValueError(
                            'Tar files specified in config and in metadata file. Tar files should only be specified once.'
                        )
            dataset = TarredSentenceDataset(
                text_tar_filepaths=tar_files,
                metadata_path=metadata_file,
                tokenizer=self.tokenizer,
                shuffle_n=cfg.get("tar_shuffle_n", 100),
                shard_strategy=cfg.get("shard_strategy", "scatter"),
                global_rank=self.global_rank,
                world_size=self.world_size,
            )
            return torch.utils.data.DataLoader(
                dataset=dataset,
                batch_size=1,
                num_workers=cfg.get("num_workers", 2),
                pin_memory=cfg.get("pin_memory", False),
                drop_last=cfg.get("drop_last", False),
            )
        else:
            dataset = SentenceDataset(
                tokenizer=self.tokenizer,
                dataset=cfg.file_name,
                tokens_in_batch=cfg.tokens_in_batch,
                clean=cfg.get("clean", False),
                max_seq_length=cfg.get("max_seq_length", 512),
                min_seq_length=cfg.get("min_seq_length", 1),
                cache_ids=cfg.get("cache_ids", False),
            )
        if cfg.shuffle:
            sampler = pt_data.RandomSampler(dataset)
        else:
            sampler = pt_data.SequentialSampler(dataset)
        return torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=1,
            sampler=sampler,
            num_workers=cfg.get("num_workers", 2),
            pin_memory=cfg.get("pin_memory", False),
            drop_last=cfg.get("drop_last", False),
        )
Example #30
    print('System start to load data...')
    t0 = time()
    train_data, val_data = data_utils.load_all()
    t1 = time()
    print('Data has been loaded successfully, cost:%.4fs' % (t1 - t0))

    ########################### TRAINING STAGE ##################################
    check_dir('%s/train_log' % conf.out_path)
    log = Logging('%s/train_%s_nrms.log' % (conf.out_path, conf.data_name))
    train_model_path = '%s/train_%s_nrms.mod' % (conf.out_path, conf.data_name)

    # prepare data for the training stage
    train_dataset = data_utils.TrainData(train_data)
    val_dataset = data_utils.TestData(val_data)

    train_batch_sampler = data.BatchSampler(data.RandomSampler(
        range(train_dataset.length)), batch_size=conf.batch_size, drop_last=False)
    val_batch_sampler = data.BatchSampler(data.SequentialSampler(
        range(val_dataset.length)), batch_size=conf.batch_size, drop_last=True)

    # Start Training !!!
    max_auc = 0
    for epoch in range(1, conf.train_epochs+1):
        t0 = time()
        model.train()
        
        train_loss = []
        count = 0
        for batch_idx_list in train_batch_sampler:
            
            his_input_title, pred_input_title, labels = \
                train_dataset._get_batch(batch_idx_list)