def prepare_senti_data(hps, vocab):
    print('preparing senti data...')
    dataset = DisDataset(hps.senti_train_data_path, hps.senti_train_label_path, vocab, debug=False)
    weights = make_weights_for_balanced_classes(dataset.ls, 2, PosOverNeg=1)
    sampler = WeightedRandomSampler(weights, len(weights))
    train_data_loader = DataLoader(dataset,
                                   batch_size=hps.senti_batch_size,
                                   shuffle=False,
                                   collate_fn=collate_fn,
                                   drop_last=False,
                                   sampler=sampler)

    dataset = DisDataset(hps.senti_dev_data_path, hps.senti_dev_label_path, vocab, debug=False)
    weights = make_weights_for_balanced_classes(dataset.ls, 2, PosOverNeg=1)
    sampler = WeightedRandomSampler(weights, len(weights))
    dev_data_loader = DataLoader(dataset,
                                 batch_size=hps.senti_batch_size,
                                 shuffle=False,
                                 collate_fn=collate_fn,
                                 drop_last=False,
                                 sampler=sampler)
    return train_data_loader, dev_data_loader
def load_dataset(args,
                 INPUT_SIZE=[112, 112],
                 RGB_MEAN=[0.5, 0.5, 0.5],
                 RGB_STD=[0.5, 0.5, 0.5],
                 val_datasets=['lfw', 'cfp_ff', 'cfp_fp', 'agedb_30', 'calfw', 'cplfw', 'vgg2_fp']):
    train_transform = transforms.Compose([
        transforms.Resize([int(128 * INPUT_SIZE[0] / 112), int(128 * INPUT_SIZE[0] / 112)]),
        transforms.RandomCrop([INPUT_SIZE[0], INPUT_SIZE[1]]),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD),
    ])
    train_data = dset.ImageFolder(os.path.join(args.data_path, 'CASIA-maxpy-align'), train_transform)
    weights = torch.DoubleTensor(
        make_weights_for_balanced_classes(train_data.imgs, len(train_data.classes)))
    if args.distributed:
        from catalyst.data.sampler import DistributedSamplerWrapper
        train_sampler = DistributedSamplerWrapper(WeightedRandomSampler(weights, len(weights)))
    else:
        train_sampler = WeightedRandomSampler(weights, len(weights))
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize([int(128 * INPUT_SIZE[0] / 112), int(128 * INPUT_SIZE[0] / 112)]),
        transforms.CenterCrop([INPUT_SIZE[0], INPUT_SIZE[1]]),
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD)
    ])
    val_loaders = []
    for name in val_datasets:
        carray = bcolz.carray(rootdir=os.path.join(args.data_path, name), mode='r')
        # reorder channels and rescale the pixel values
        val_data_tensor = torch.tensor(carray[:, [2, 1, 0], :, :]) * 0.5 + 0.5
        val_data = TensorsDataset(val_data_tensor, val_transform)
        val_loader = torch.utils.data.DataLoader(val_data,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 num_workers=args.workers,
                                                 pin_memory=True,
                                                 sampler=None)
        issame = np.load('{}/{}_list.npy'.format(args.data_path, name))
        val_loaders.append((name, val_loader, issame))
    return train_loader, val_loaders
def prepare_discriminator_data(hps, vocab):
    print('preparing dis data...')
    train_dataset = DisDataset(hps.dis_train_data_path, hps.dis_train_label_path, vocab, debug=False)
    train_weights = make_weights_for_balanced_classes(train_dataset.ls, 2, PosOverNeg=1)
    train_sampler = WeightedRandomSampler(train_weights, len(train_weights))
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=hps.dis_batch_size,
                                   shuffle=False,
                                   collate_fn=collate_fn,
                                   drop_last=False,
                                   sampler=train_sampler)

    dev_dataset = DisDataset(hps.dis_dev_data_path, hps.dis_dev_label_path, vocab, debug=False)
    dev_weights = make_weights_for_balanced_classes(dev_dataset.ls, 2, PosOverNeg=1)
    dev_sampler = WeightedRandomSampler(dev_weights, len(dev_weights))
    dev_data_loader = DataLoader(dev_dataset,
                                 batch_size=hps.dis_batch_size,
                                 shuffle=False,
                                 collate_fn=collate_fn,
                                 drop_last=False,
                                 sampler=dev_sampler)
    return train_data_loader, dev_data_loader
def get_batch(self):
    # get sequence idx
    sampler = WeightedRandomSampler(self.memory['sum_priority'], self.batch_size)
    seq_idx = list(sampler)

    # get episode idx
    epi_idx = []
    for seq in seq_idx:
        sampler = WeightedRandomSampler(self.memory['priority'][seq], 1)
        epi_idx.append(list(sampler)[0])

    # get batch
    batch = {}
    for key in self.key_list:
        batch[key] = list()
    for key in self.key_list:
        for seq, epi in zip(seq_idx, epi_idx):
            start_v = self.overlap_size * epi
            end_v = self.overlap_size * epi + self.seq_size
            if end_v >= len(self.memory['reward'][seq]):  # out of range
                end_v = len(self.memory['reward'][seq])
                start_v = end_v - self.seq_size
            if key == 'state' or key == 'done':
                end_v += self.multi_step
            if key == 'state':
                state_list = []
                for i in range(-3, 0):
                    ss = start_v + i
                    if ss < 0:
                        ss = 0
                    state_list.append(self.memory[key][seq][ss])
                state_list = np.array(state_list)
                state_list = np.concatenate(
                    (state_list, self.memory[key][seq][start_v:end_v]), axis=0)
                batch[key].append(state_list)
            elif key == 'recc' or key == 'target_recc':
                batch[key].append(self.memory[key][seq][start_v])
            else:
                batch[key].append(self.memory[key][seq][start_v:end_v])
        batch[key] = np.array(batch[key])

    # importance sampling weights (the priorities must be an array for the division)
    p = np.array([
        self.memory['priority'][seq][epi]
        for seq, epi in zip(seq_idx, epi_idx)
    ]) / self.sum_p
    weights = (self.N * p) ** (-self.importance_exp)
    weights /= np.max(weights)
    return batch, seq_idx, epi_idx, weights
def load_data(train_data, val_data, test_data, opts):
    """Creates training and test data loaders.
    """
    train_dataset = KeystrokeDataset(train_data, opts)
    val_dataset = KeystrokeDataset(val_data, opts)
    test_dataset = KeystrokeDataset(test_data, opts)

    train_sampler = WeightedRandomSampler(
        [1 / train_dataset.class_counts[sample[1]] for sample in train_dataset.samples],
        len(train_dataset.samples))
    val_sampler = WeightedRandomSampler(
        [1 / val_dataset.class_counts[sample[1]] for sample in val_dataset.samples],
        len(val_dataset.samples))
    test_sampler = WeightedRandomSampler(
        [1 / test_dataset.class_counts[sample[1]] for sample in test_dataset.samples],
        len(test_dataset.samples))

    if opts.balance_classes:
        print("Using WeightedRandomSampler")
        train_dloader = DataLoader(dataset=train_dataset,
                                   batch_size=min(1, len(train_dataset)),
                                   sampler=train_sampler,
                                   num_workers=opts.num_workers)
        val_dloader = DataLoader(dataset=val_dataset,
                                 batch_size=min(1, len(val_dataset)),
                                 sampler=val_sampler,
                                 num_workers=opts.num_workers)
        test_dloader = DataLoader(dataset=test_dataset,
                                  batch_size=min(1, len(test_dataset)),
                                  sampler=test_sampler,
                                  num_workers=opts.num_workers)
    else:
        print("Not weighting classes")
        train_dloader = DataLoader(dataset=train_dataset,
                                   batch_size=min(1, len(train_dataset)),
                                   shuffle=True,
                                   num_workers=opts.num_workers)
        val_dloader = DataLoader(dataset=val_dataset,
                                 batch_size=min(1, len(val_dataset)),
                                 shuffle=True,
                                 num_workers=opts.num_workers)
        test_dloader = DataLoader(dataset=test_dataset,
                                  batch_size=min(1, len(test_dataset)),
                                  shuffle=True,
                                  num_workers=opts.num_workers)
    return (train_dataset.channels, len(train_dataset.class_counts),
            train_dloader, val_dloader, test_dloader)
def __init__(self, data_source):
    label_freq = {}
    for idx in range(len(data_source)):
        label = data_source.items[idx]['language']
        if label in label_freq:
            label_freq[label] += 1
        else:
            label_freq[label] = 1
    total = float(sum(label_freq.values()))
    weights = [
        total / label_freq[data_source.items[idx]['language']]
        for idx in range(len(data_source))
    ]
    self._sampler = WeightedRandomSampler(weights, len(weights))
def get_batch(self):
    # get sequence idx
    sampler = WeightedRandomSampler(self.memory['sum_priority'], self.batch_size)
    seq_idx = list(sampler)

    # get episode idx
    epi_idx = []
    for seq in seq_idx:
        sampler = WeightedRandomSampler(self.memory['priority'][seq], 1)
        epi_idx.append(list(sampler)[0])

    # get batch
    batch = {}
    for key in self.key_list + ['next_state']:
        batch[key] = list()
    for seq, epi in zip(seq_idx, epi_idx):
        next_idx = epi + self.multi_step + 1
        for key in ['state']:
            state_list = []
            for i in range(-3, 1):  # -3 ~ 0
                ss = epi + i
                if ss < 0:
                    ss = 0
                state_list.append(self.memory[key][seq][ss])
            state_list = np.concatenate(state_list, axis=0)
            batch[key].append(state_list)
        batch['action'].append(self.memory['action'][seq][epi])
        batch['reward'].append(self.memory['reward'][seq][epi])
        batch['next_state'].append(
            np.concatenate(self.memory['state'][seq][next_idx - 4:next_idx], axis=0))
        batch['done'].append(self.memory['done'][seq][next_idx - 1])
    for key in self.key_list + ['next_state']:
        batch[key] = np.array(batch[key])

    # importance sampling weights (the priorities must be an array for the division)
    p = np.array([
        self.memory['priority'][seq][epi]
        for seq, epi in zip(seq_idx, epi_idx)
    ]) / self.sum_p
    weights = (self.N * p) ** (-self.importance_exp)
    weights /= np.max(weights)
    return batch, seq_idx, epi_idx, weights
def _setup_sampler(sampler_type, num_iters, batch_size):
    if sampler_type is None:
        return None

    if sampler_type == "weighted":
        from torch.utils.data.sampler import WeightedRandomSampler

        w = torch.ones(num_iters * batch_size, dtype=torch.float)
        for i in range(num_iters):
            w[batch_size * i:batch_size * (i + 1)] += i * 1.0
        return WeightedRandomSampler(w, num_samples=num_iters * batch_size, replacement=True)

    if sampler_type == "distributed":
        from torch.utils.data.distributed import DistributedSampler
        import torch.distributed as dist

        num_replicas = 1
        rank = 0
        if dist.is_available() and dist.is_initialized():
            num_replicas = dist.get_world_size()
            rank = dist.get_rank()
        dataset = torch.zeros(num_iters * batch_size)
        return DistributedSampler(dataset, num_replicas=num_replicas, rank=rank)
def get_dataloaders(self, batch_size=16, n_workers=4):
    # note: no data augmentation applied since all cards will be oriented the same way
    transform = CardTransforms(size=128)
    trainset = CardDataset(self.train_img_paths,
                           self.train_labels,
                           compute_weights=self.uniform_sampling,
                           cardtransforms=transform)
    sampler = None
    if self.uniform_sampling:
        sampler = WeightedRandomSampler(trainset.sample_weights,
                                        len(trainset.sample_weights),
                                        replacement=True)
        print("created uniform sampler")
    trainloader = DataLoader(trainset,
                             batch_size=batch_size,
                             shuffle=not self.uniform_sampling,
                             sampler=sampler,
                             num_workers=n_workers,
                             pin_memory=True)

    valloader = None
    if self.val_split > 0.0:
        valset = CardDataset(self.val_img_paths,
                             self.val_labels,
                             compute_weights=False,
                             cardtransforms=transform)
        valloader = DataLoader(valset,
                               batch_size=batch_size,
                               shuffle=False,
                               sampler=None,
                               num_workers=n_workers,
                               pin_memory=True)
    return trainloader, valloader
def on_train_begin(self, **kwargs):
    self.old_dl = self.data.train_dl
    weights, n_samples = SampleWeights(self.data.train_dl, self.data.c, self.weights).by_class()
    sampler = WeightedRandomSampler(weights, n_samples)
    self.data.train_dl = self.data.train_dl.new(shuffle=False, sampler=sampler)
def retrain():
    dataset = datasets.PixelLinkIC15Dataset(config.train_images_dir, config.train_labels_dir)
    sampler = WeightedRandomSampler([1 / len(dataset)] * len(dataset),
                                    config.batch_size,
                                    replacement=True)
    dataloader = DataLoader(dataset, batch_size=config.batch_size, sampler=sampler)
    my_net = net.Net()

    if config.gpu:
        device = torch.device("cuda:0")
        my_net = my_net.cuda()
        if config.multi_gpu:
            my_net = nn.DataParallel(my_net)
    else:
        device = torch.device("cpu")

    my_net.load_state_dict(
        torch.load(config.saving_model_dir + '%d.mdl' % config.retrain_model_index))

    optimizer = optim.SGD(my_net.parameters(),
                          lr=config.retrain_learning_rate2,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay)
    optimizer2 = optim.SGD(my_net.parameters(),
                           lr=config.retrain_learning_rate,
                           momentum=config.momentum,
                           weight_decay=config.weight_decay)
    train(config.retrain_epoch, config.retrain_model_index, dataloader, my_net,
          optimizer, optimizer2, device)
def LoadData(data, word2id, batch_size, use_weighted_sample=False):
    data_info = dict()
    data_keys = data[0].keys()
    for k in data_keys:
        data_info[k] = []
    for pair in data:
        for k in data_keys:
            data_info[k].append(pair[k])

    dataset = Dataset(data_info, word2id, sequicity=0)
    if use_weighted_sample:
        weights = [
            1 if data["gating_label"] == [0] * len(ALL_SLOTS) else 9
            for data in dataset
        ]
        from torch.utils.data.sampler import WeightedRandomSampler
        sampler = WeightedRandomSampler(weights, num_samples=len(dataset), replacement=True)
    else:
        sampler = None
    # shuffle and sampler are mutually exclusive in DataLoader, so shuffle only
    # when no sampler is supplied
    data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                              batch_size=batch_size,
                                              shuffle=(sampler is None),
                                              sampler=sampler,
                                              collate_fn=collate_fn)
    return data_loader
def get_loader(dataset, train, reweight_groups, **kwargs):
    if not train:  # validation or testing
        assert reweight_groups is None
        shuffle = False
        sampler = None
    elif not reweight_groups:  # training but not reweighting
        shuffle = True
        sampler = None
    else:  # training and reweighting
        # When the --robust flag is not set, reweighting changes the loss function
        # from the normal ERM (average loss over each training example)
        # to a reweighted ERM (weighted average where each (y, c) group has equal weight).
        # When the --robust flag is set, reweighting does not change the loss function,
        # since the minibatch is only used for mean gradient estimation for each group separately.
        group_weights = len(dataset) / dataset._group_counts
        weights = group_weights[dataset._group_array]
        # Replacement needs to be set to True, otherwise we'll run out of minority samples
        sampler = WeightedRandomSampler(weights, len(dataset), replacement=True)
        shuffle = False

    loader = DataLoader(dataset, shuffle=shuffle, sampler=sampler, **kwargs)
    return loader
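# Minimal, self-contained sketch (not from the original code; the group sizes below are
# made up for illustration) of the group-reweighting scheme above: each (y, c) group gets
# weight len(dataset) / group_count, so WeightedRandomSampler draws every group with
# roughly equal probability.
import torch
from torch.utils.data import WeightedRandomSampler

_group_counts = torch.tensor([900.0, 100.0])        # e.g. majority vs. minority group
_group_array = torch.tensor([0] * 900 + [1] * 100)  # group id of each example
_group_weights = 1000 / _group_counts                # tensor([1.1111, 10.0000])
_example_weights = _group_weights[_group_array]      # one weight per example
_balanced_sampler = WeightedRandomSampler(_example_weights, 1000, replacement=True)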
def test_dist_proxy_sampler():
    weights = torch.ones(100)
    weights[:50] += 1
    num_samples = 200
    sampler = WeightedRandomSampler(weights, num_samples)
    num_replicas = 8
    dist_samplers = [
        DistributedProxySampler(sampler, num_replicas=num_replicas, rank=i)
        for i in range(num_replicas)
    ]

    for seed in range(100):
        torch.manual_seed(seed)
        true_indices = list(sampler)

        indices_per_rank = []
        for s in dist_samplers:
            s.set_epoch(seed)
            indices_per_rank += list(s)

        set_indices_per_rank = set(indices_per_rank)
        set_true_indices = set(true_indices)
        assert (
            set_indices_per_rank == set_true_indices
        ), f"{set_true_indices - set_indices_per_rank} | {set_indices_per_rank - set_true_indices}"

    with pytest.raises(TypeError, match=r"Argument sampler should be instance of torch Sampler"):
        DistributedProxySampler(None)

    with pytest.raises(TypeError, match=r"Argument sampler should have length"):
        DistributedProxySampler(Sampler([1]))
def set_ml_dataloader(dataset, phase, cfg, shuffle=False):
    if phase in ['test', 'infer']:
        dataloader = WrapperDataLoader(dataset,
                                       batch_size=cfg.batch_size,
                                       num_workers=cfg.n_jobs,
                                       pin_memory=True,
                                       sampler=None,
                                       shuffle=False,
                                       drop_last=False)
    else:
        if sum(cfg.sample_balance) != 0.0:
            if cfg.task_type.value == 'classify':
                weights = make_weights_for_balanced_classes(dataset.get_labels(),
                                                            cfg.sample_balance)
            else:
                weights = [torch.Tensor([1.0])] * len(dataset.get_labels())
            sampler = WeightedRandomSampler(weights, int(len(dataset) * cfg.epoch_rate))
        else:
            sampler = None

        dataloader = WrapperDataLoader(dataset,
                                       batch_size=len(dataset),
                                       num_workers=cfg.n_jobs,
                                       pin_memory=True,
                                       sampler=sampler,
                                       shuffle=shuffle)
    return dataloader
def test_fsd_train(self):
    val_prop = 0.2
    # grouping_variables = ['label', 'manually_verified']  # for stratified split and sampling
    data_path = f'{self.data_base_dir}/Freesound/FSDKaggle2018.meta/train_post_competition.csv'
    dataset = FSDTrainDataset(data_path)
    samples = 1000
    batch_size = 2

    train, val = train_test_split(dataset.classes,
                                  test_size=val_prop,
                                  stratify=dataset.classes['factor'])
    dataset_train = Subset(dataset, train['idx'])
    dataset_train_weights = [
        1 / (len(dataset.counts.keys()) * dataset.counts[dataset.classes.loc[a]['factor']])
        if a in train['idx'] else 0 for a in range(len(dataset))
    ]
    train_sampler = WeightedRandomSampler(weights=dataset_train_weights,
                                          replacement=True,
                                          num_samples=samples)
    train_loader = DataLoader(
        dataset=dataset_train,
        batch_size=batch_size,
        sampler=train_sampler,  # this is necessary for some reason
        pin_memory=True,
        collate_fn=dataset.collate_batch
    )

    for i, (inputs, labels) in enumerate(train_loader):
        print(inputs)
        print(labels)
        break
def main(cfg):
    torch.multiprocessing.set_sharing_strategy('file_system')
    seed_torch(seed=cfg.seed)
    output_dir = os.path.join(cfg.output_dir, cfg.desc)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    train_dataset = build_dataset(cfg, phase='train')
    test_dataset = build_dataset(cfg, phase='test')
    if cfg.DATA.weighted_sample:
        train_dl = DataLoader(train_dataset,
                              batch_size=32,
                              sampler=WeightedRandomSampler(train_dataset.get_label_weight(),
                                                            num_samples=5000),
                              num_workers=0,
                              drop_last=True)
    else:
        train_dl = DataLoader(train_dataset,
                              batch_size=32,
                              shuffle=True,
                              num_workers=16,
                              drop_last=True)
    test_dl = DataLoader(test_dataset, batch_size=32, num_workers=8, drop_last=True)

    solver = Solver(cfg)
    solver.train(train_dl, test_dl)
def test_track2(self):
    val_prop = 0.2
    grouping_variables = ['Covid_status', 'Gender']  # for stratified split and sampling
    data_path = f'{self.data_base_dir}/DiCOVA_Train_Val_Data_Release/metadata.csv'
    samples = 1000
    batch_size = 2

    data_path = f'{self.data_base_dir}/DiCOVA_Track_2_Release/metadata.csv'
    subdatasets = ['breathing-deep', 'counting-normal', 'vowel-e']
    for subdataset in subdatasets:
        dataset = DiCOVATrack2(data_path, grouping_variables, subdataset)
        train, val = train_test_split(dataset.classes,
                                      test_size=val_prop,
                                      stratify=dataset.classes['factor'])
        dataset_train = Subset(dataset, train['idx'])
        dataset_train_weights = [
            1 / (len(dataset.counts.keys()) * dataset.counts[dataset.classes.loc[a]['factor']])
            if a in train['idx'] else 0 for a in range(len(dataset))
        ]
        train_sampler = WeightedRandomSampler(weights=dataset_train_weights,
                                              replacement=True,
                                              num_samples=samples)
        train_loader = DataLoader(
            dataset=dataset_train,
            batch_size=batch_size,
            sampler=train_sampler,  # this is necessary for some reason
            pin_memory=True,
            collate_fn=dataset.collate_batch
        )

        for i, (inputs, labels) in enumerate(train_loader):
            print(inputs)
            print(labels)
            break
def _get_data_loader(self, config, names=None):
    transformations = self.__transform_func()
    if names is None:
        train_names, val_names = self._split_data(self.__train_dir, config.train_percent)
    else:
        train_names, val_names = names

    loader = self.__get_loader(config)
    train_folder = ImageFolder(self.__label_file,
                               self.__train_dir,
                               train_names,
                               transform=transformations['train'],
                               loader=loader)
    val_folder = ImageFolder(self.__label_file,
                             self.__train_dir,
                             val_names,
                             transform=transformations['val'],
                             loader=loader)
    if not len(train_folder) or not len(val_folder):
        raise ValueError('One of the image folders contains zero data, train: %s, val: %s' %
                         (len(train_folder), len(val_folder)))

    sampler = None
    if config.weigh_sample:
        sampler = WeightedRandomSampler(train_folder.weights, len(train_folder), replacement=True)

    # shuffle must be disabled whenever a sampler is supplied
    train_loader = torch.utils.data.DataLoader(train_folder,
                                               batch_size=config.batch_size,
                                               shuffle=(sampler is None),
                                               sampler=sampler,
                                               num_workers=config.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_folder,
                                             batch_size=config.batch_size,
                                             shuffle=True,
                                             num_workers=config.workers,
                                             pin_memory=True)
    return train_loader, val_loader
def set_loader(self, trn_ds, val_ds, batch):
    if self.cfg.weighed_sampler:
        targets = torch.from_numpy(trn_ds.data.label.values)
        class_sample_count = torch.tensor(
            [(targets == t).sum() for t in torch.unique(targets, sorted=True)])
        weight = 1. / class_sample_count.float()
        samples_weight = torch.tensor([weight[t] for t in targets])
        sampler = WeightedRandomSampler(samples_weight.double(), len(samples_weight))
        shuffle = False
    else:
        sampler = None
        shuffle = True

    logging.info("Dataloader Setting...")
    self.trn_dl = DataLoader(
        trn_ds,
        batch_size=batch,
        shuffle=shuffle,
        num_workers=4,
        sampler=sampler,
        pin_memory=True
    )
    self.val_dl = DataLoader(
        val_ds,
        batch_size=batch,
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )
    logging.info("Done.\n")
def get_counts(density_matrix, num_shots, target, print_results=False):
    dm_diag = torch.diagonal(density_matrix, 0)
    special_qubit = torch.diagonal(density_matrix, 0)[len(dm_diag) // 2 - 1:len(dm_diag) // 2 + 1]
    number_of_qubits = density_matrix.shape[1].bit_length() - 1
    sampler = WeightedRandomSampler(special_qubit, num_shots)
    format = '{0:0' + str(density_matrix.shape[1].bit_length() - 1) + 'b}'

    counter = Counter()
    for idx in sampler:
        counter[idx] += 1
    counter_s = sorted(counter.items())
    # m_counts = counter_s[0][1] - counter_s[1][1]
    m_counts = counter[0] - counter[1]

    # optionally print the per-basis-state counts before returning
    if print_results:
        print('{')
        for element in sorted(counter.items()):
            print("\t\"" + format.format(element[0]) + '\" : ' + str(element[1]))
        print('}')

    return m_counts / num_shots
def test_dist_proxy_sampler():
    import torch
    from torch.utils.data import WeightedRandomSampler

    weights = torch.ones(100)
    weights[:50] += 1
    num_samples = 200
    sampler = WeightedRandomSampler(weights, num_samples)
    num_replicas = 8
    dist_samplers = [
        DistributedProxySampler(sampler, num_replicas=num_replicas, rank=i)
        for i in range(num_replicas)
    ]

    for seed in range(100):
        torch.manual_seed(seed)
        true_indices = list(sampler)

        indices_per_rank = []
        for s in dist_samplers:
            s.set_epoch(seed)
            indices_per_rank += list(s)

        set_indices_per_rank = set(indices_per_rank)
        set_true_indices = set(true_indices)
        assert set_indices_per_rank == set_true_indices, "{} | {}".format(
            set_true_indices - set_indices_per_rank,
            set_indices_per_rank - set_true_indices)
def setup_db_dl(train_batch_size=4, test_batch_size=4, get_data=get_data):
    full_dataset = DBDataset(TrainImage,
                             MathSymbol,
                             get_data,
                             get_label,
                             get_class_name,
                             filter=valid_func)

    test_train_split = 0.9
    train_size = int(test_train_split * len(full_dataset))
    test_size = len(full_dataset) - train_size
    train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

    weights = torch.zeros(len(train_dataset))
    for i, data in enumerate(train_dataset):
        weights[i] = 1. / (math.log(full_dataset.class_counts[data[1]]) + 1.0)
    sampler = WeightedRandomSampler(weights, len(weights))

    dataloaders = {
        "train": DataLoader(train_dataset,
                            batch_size=train_batch_size,
                            num_workers=1,
                            sampler=sampler),
        "test": DataLoader(test_dataset,
                           batch_size=test_batch_size,
                           shuffle=True,
                           num_workers=1)
    }
    return dataloaders, full_dataset
def __init__(self, dataset, batch_size, shuffle, num_workers, val_split=0.0, weights=None):
    self.shuffle = shuffle
    self.dataset = dataset
    self.nbr_examples = len(dataset)
    if weights is not None:
        self.train_sampler = WeightedRandomSampler(weights,
                                                   num_samples=self.nbr_examples,
                                                   replacement=True)
        self.val_sampler = None
        self.shuffle = False
    elif val_split:
        self.train_sampler, self.val_sampler = self._split_sampler(val_split)
    else:
        self.train_sampler, self.val_sampler = None, None

    self.init_kwargs = {
        'dataset': self.dataset,
        'batch_size': batch_size,
        'shuffle': self.shuffle,
        'num_workers': num_workers,
        'pin_memory': True,
        'drop_last': True
    }
    super(BaseDataLoader, self).__init__(sampler=self.train_sampler, **self.init_kwargs)
def detection_loader(dataset, train=True, batch_size=None):
    """
    Creates a training set dataloader for a dataset. Uses a sampler to load the same number of
    datapoints from both classes. The dataloader returns shuffled data, as a sampler is used to
    balance the classes (on average, half the samples seen will be quotes and the other half
    won't be).

    :param dataset: QuoteDetectionDataset
        The dataset from which to load data.
    :param train: boolean
        Whether to load a training or testing dataloader.
    :param batch_size: int
        The batch size. The default is None, where the batch size is the size of the data.
    :return: torch.utils.data.DataLoader
        DataLoader for quote detection.
    """
    if batch_size is None:
        batch_size = len(dataset)
    if train:
        weights, num_samples = sampler_weights(dataset)
        sampler = WeightedRandomSampler(weights=weights, num_samples=num_samples, replacement=True)
        return DataLoader(dataset=dataset, batch_size=batch_size, sampler=sampler)
    else:
        return DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)
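# Hypothetical sketch of what a sampler_weights-style helper could compute for the binary
# case described in the docstring above. The real sampler_weights used by detection_loader
# is not shown here, and the helper name below is illustrative only.
import numpy as np

def balanced_binary_weights(labels):
    """Per-example weights so that, in expectation, both classes are drawn equally often."""
    labels = np.asarray(labels)
    pos = max(int((labels == 1).sum()), 1)
    neg = max(int((labels == 0).sum()), 1)
    weights = np.where(labels == 1, 1.0 / pos, 1.0 / neg)
    return weights, len(labels)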
def train_dataloader(self):
    if self.sampler:
        targets = []
        for target in self.targets_sampler:
            targets.append(target)
        targets = torch.tensor(targets).type(torch.long)

        # Compute samples weight (each sample should get its own weight)
        class_sample_count = torch.tensor(
            [(targets == t).sum() for t in torch.unique(targets, sorted=True)])
        weight = 1. / class_sample_count.float()
        samples_weight = torch.tensor([weight[t] for t in targets])

        # Create sampler, dataset, loader
        sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
        shuffle = False
    else:
        sampler = None
        # shuffle only when no sampler is used; never shuffle when deliberately overfitting
        shuffle = False if self.overfit else True

    return DataLoader(self.train_dataset,
                      sampler=sampler,
                      batch_size=self.batch_size,
                      shuffle=shuffle,
                      num_workers=cpu_count(),
                      collate_fn=self.my_collate)
def main():
    # loading data
    dataset = datasets.PixelLinkIC15Dataset(opt.train_images_dir, opt.train_labels_dir)
    sampler = WeightedRandomSampler([1 / len(dataset)] * len(dataset),
                                    opt.batch_size,
                                    replacement=True)
    dataloader = DataLoader(dataset, batch_size=opt.batch_size, sampler=sampler)
    my_net = net.Net()  # construct neural network

    # choose gpu or cpu
    if opt.gpu:
        device = torch.device("cuda:0")
        my_net = my_net.cuda()
        if opt.multi_gpu:
            my_net = nn.DataParallel(my_net)
    else:
        device = torch.device("cpu")

    # train, optimize
    my_net.apply(weight_init)
    optimizer = optim.SGD(my_net.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)
    optimizer2 = optim.SGD(my_net.parameters(),
                           lr=opt.learning_rate2,
                           momentum=opt.momentum,
                           weight_decay=opt.weight_decay)
    iteration = 0
    train(opt.epoch, iteration, dataloader, my_net, optimizer, optimizer2, device)
def dataloader(cfg, mode):
    if mode == 'train':
        transforms = T.Compose([
            T.ToPILImage(),
            T.RandomHorizontalFlip(0.5),
            T.RandomVerticalFlip(0.5),
            T.RandomRotation(90),
            T.ToTensor(),
            T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])
        dataset = Digestpath_fixed_list(cfg, mode, transforms)
        sampler = WeightedRandomSampler(dataset.weights,
                                        cfg['batch_size'] * cfg['max_batch'],
                                        True)
        loader = torch.utils.data.DataLoader(dataset,
                                             batch_size=cfg['batch_size'],
                                             sampler=sampler,
                                             num_workers=cfg['num_workers'])
    else:
        transforms = T.Compose([
            T.ToPILImage(),
            T.ToTensor(),
            T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])
        dataset = Digestpath_fixed_list(cfg, mode, transforms)
        if cfg['whole_image']:
            loader = dataset
        else:
            loader = torch.utils.data.DataLoader(dataset,
                                                 batch_size=cfg['batch_size'],
                                                 shuffle=True,
                                                 num_workers=cfg['num_workers'])

    if mode == 'train':
        return loader, dataloader(cfg, 'test')
    else:
        return loader
def build_dataloader_fuse(dataset,
                          imgs_per_gpu,
                          workers_per_gpu,
                          num_gpus=1,
                          drop_last=True,
                          shuffle=True,
                          dist=False,
                          **kwargs):
    batch_size = num_gpus * imgs_per_gpu
    num_workers = num_gpus * workers_per_gpu
    sampler = WeightedRandomSampler(dataset.density, len(dataset))
    print(f"Building dataloader with batch_size {batch_size}")

    if shuffle:
        data_loader = DataLoader(dataset,
                                 batch_size=batch_size,
                                 sampler=sampler,
                                 num_workers=num_workers,
                                 collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu),
                                 pin_memory=False,
                                 drop_last=drop_last,
                                 **kwargs)
    else:
        data_loader = DataLoader(dataset,
                                 batch_size=batch_size,
                                 shuffle=shuffle,
                                 num_workers=num_workers,
                                 collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu),
                                 pin_memory=False,
                                 drop_last=drop_last,
                                 **kwargs)
    return data_loader
def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1):
    weights = None
    data_items = dataset.samples

    if getattr(config, "use_language_weighted_sampler", False):
        alpha = getattr(config, "language_weighted_sampler_alpha", 1.0)
        print(" > Using Language weighted sampler with alpha:", alpha)
        weights = get_language_balancer_weights(data_items) * alpha

    if getattr(config, "use_speaker_weighted_sampler", False):
        alpha = getattr(config, "speaker_weighted_sampler_alpha", 1.0)
        print(" > Using Speaker weighted sampler with alpha:", alpha)
        if weights is not None:
            weights += get_speaker_balancer_weights(data_items) * alpha
        else:
            weights = get_speaker_balancer_weights(data_items) * alpha

    if weights is not None:
        sampler = WeightedRandomSampler(weights, len(weights))
    else:
        sampler = None

    # sampler for DDP
    if sampler is None:
        sampler = DistributedSampler(dataset) if num_gpus > 1 else None
    else:
        # If a sampler is already defined, use this sampler and the DDP sampler together
        sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler

    return sampler
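# Standalone usage sketch (assumed names, not taken from the trainer above) showing how a
# sampler returned by a helper like get_sampler is consumed: when a sampler is supplied to
# DataLoader, shuffle must stay False, because PyTorch treats the two as mutually exclusive.
import torch
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

_toy_data = TensorDataset(torch.randn(8, 3), torch.tensor([0, 0, 0, 0, 0, 0, 1, 1]))
_toy_weights = torch.tensor([1.0] * 6 + [3.0] * 2)  # oversample the rare class
_toy_sampler = WeightedRandomSampler(_toy_weights, num_samples=8, replacement=True)
_toy_loader = DataLoader(_toy_data, batch_size=4, sampler=_toy_sampler, shuffle=False)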