def generate_sampler(dataset, sampler_option='random'):
    """
    Returns sampler according to the wanted options

    :param dataset: (MRIDataset) the dataset to sample from
    :param sampler_option: (str) choice of sampler
    :return: (Sampler)
    """
    df = dataset.df
    # To be changed for non-binary classification
    count = np.zeros(2)

    for idx in df.index:
        label = df.loc[idx, "diagnosis"]
        key = dataset.diagnosis_code[label]
        count[key] += 1

    weight_per_class = 1 / np.array(count)
    weights = []

    for idx, label in enumerate(df["diagnosis"].values):
        key = dataset.diagnosis_code[label]
        weights += [weight_per_class[key]] * dataset.elem_per_image

    if sampler_option == 'random':
        return sampler.RandomSampler(weights)
    elif sampler_option == 'weighted':
        return sampler.WeightedRandomSampler(weights, len(weights))
    else:
        raise NotImplementedError(
            f"The option {sampler_option} for sampler is not implemented")
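# --- Usage sketch (added; not from the original source). A minimal fake
# dataset stands in for the MRIDataset that generate_sampler expects, just to
# show how the returned sampler plugs into a DataLoader in place of
# shuffle=True. All names below are illustrative.
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, sampler


class FakeMRIDataset(Dataset):
    """Exposes the attributes generate_sampler relies on."""

    def __init__(self):
        self.df = pd.DataFrame({"diagnosis": ["CN", "AD", "CN", "CN"]})
        self.diagnosis_code = {"CN": 0, "AD": 1}
        self.elem_per_image = 1  # one element per subject

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        label = self.diagnosis_code[self.df.loc[idx, "diagnosis"]]
        return torch.zeros(1), label


train_dataset = FakeMRIDataset()
train_sampler = generate_sampler(train_dataset, sampler_option='weighted')
# A sampler is mutually exclusive with shuffle=True in DataLoader.
train_loader = DataLoader(train_dataset, batch_size=2, sampler=train_sampler)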
def reset(self):
    """
    Two cases:
    1. not hasattr(self, 'split_loader'): Resume from previous training.
       Create the dataset given the saved split_ix and iterator
    2. wrapped: a new epoch, the split_ix and iterator have been updated in
       the get_minibatch_inds already.
    """
    # batch_size is 0, the merge is done in DataLoader class
    # print('cpu count: %d' % (multiprocessing.cpu_count()))
    if self.split == 'train':
        split_sample_weights = self.dataloader.sample_weights[np.array(
            self.dataloader.split_ix[self.split])]
        mysample_ = sampler.WeightedRandomSampler(
            split_sample_weights, len(self.dataloader.split_ix[self.split]))
        mysample = np.array(self.dataloader.split_ix[self.split])[np.array(
            list(mysample_))]
        self.split_ix = mysample.tolist()
    else:
        self.split_ix = self.dataloader.split_ix[self.split]
    # np.save('data/tmp/%s_ix.npy' % (self.split), np.array(self.split_ix))
    self.split_loader = iter(
        data.DataLoader(
            dataset=self.dataloader,
            batch_size=self.dataloader.batch_size,
            sampler=self.split_ix[self.dataloader.iterators[self.split]:],
            shuffle=False,
            pin_memory=False,
            num_workers=8,
            collate_fn=lambda x: x))
def generate_sampler(dataset, sampler_option="random", n_bins=5): df = dataset.df count = np.zeros(n_bins) values = df[dataset.label].values.astype(float) thresholds = [ min(values) + i * (max(values) - min(values)) / n_bins for i in range(n_bins) ] for idx in df.index: label = df.loc[idx, dataset.label] key = max(np.where((label >= thresholds))[0]) count[key] += 1 weight_per_class = 1 / np.array(count) weights = [] for idx, label in enumerate(df[dataset.label].values): key = max(np.where((label >= thresholds))[0]) weights += [weight_per_class[key]] * dataset.elem_per_image if sampler_option == "random": return sampler.RandomSampler(weights) elif sampler_option == "weighted": return sampler.WeightedRandomSampler(weights, len(weights)) else: raise NotImplementedError( f"The option {sampler_option} for sampler on regression task is not implemented" )
def get_dataloader_train(args, root, image_list):
    kwargs = {'num_workers': args.num_workers, 'pin_memory': args.pin_memory}
    dataset = ds.ClassificationDataset(
        root,
        image_list,
        transform=transforms.Compose([
            transforms.RandomRotation(3),
            transforms.RandomResizedCrop(224, scale=(0.74, 0.78),
                                         ratio=(1.0, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ]),
    )

    # Weight each sample by the inverse of its class frequency.
    prob = np.zeros(args.num_classes)
    for i in range(len(dataset)):
        cur_class = dataset.labels[i]
        prob[cur_class] += 1
    prob = 1.0 / prob

    reciprocal_weights = np.zeros(len(dataset))
    epoch_length = args.epoch_length
    for i in range(len(dataset)):
        label = dataset.labels[i]
        reciprocal_weights[i] = prob[label]

    weights = torch.from_numpy(reciprocal_weights)
    weighted_sampler = sampler.WeightedRandomSampler(weights, epoch_length)

    loader = DataLoader(dataset,
                        batch_size=args.batch_size,
                        sampler=weighted_sampler,
                        **kwargs)
    return loader
def generate_sampler(dataset, sampler_option="random", n_bins=5): df = dataset.df n_labels = df[dataset.label].nunique() count = np.zeros(n_labels) for idx in df.index: label = df.loc[idx, dataset.label] key = dataset.label_fn(label) count[key] += 1 weight_per_class = 1 / np.array(count) weights = [] for idx, label in enumerate(df[dataset.label].values): key = dataset.label_fn(label) weights += [weight_per_class[key]] * dataset.elem_per_image if sampler_option == "random": return sampler.RandomSampler(weights) elif sampler_option == "weighted": return sampler.WeightedRandomSampler(weights, len(weights)) else: raise NotImplementedError( f"The option {sampler_option} for sampler on classification task is not implemented" )
def get_train_loaders(path, device, batch_size, workers, class_count):
    def to_device(x, y):
        return x.to(device), y.to(device, dtype=torch.int64)

    train_dataset = extend_dataset(
        PickledDataset(path + '/train_gray.p',
                       transform=get_train_transforms()))
    valid_dataset = PickledDataset(path + '/valid_gray.p',
                                   transform=get_test_transforms())

    # Use weighted sampler
    class_sample_count = np.bincount(train_dataset.labels)
    weights = 1 / np.array(
        [class_sample_count[y] for y in train_dataset.labels])
    samp = sampler.WeightedRandomSampler(weights, 43 * class_count)

    train_loader = WrappedDataLoader(
        DataLoader(train_dataset,
                   batch_size=batch_size,
                   sampler=samp,
                   num_workers=workers), to_device)
    valid_loader = WrappedDataLoader(
        DataLoader(valid_dataset,
                   batch_size=batch_size,
                   shuffle=False,
                   num_workers=workers), to_device)
    return train_loader, valid_loader
def make_loader(self, batch_size=16, num_workers=0, shuffle=False,
                pin_memory=False):
    """
    Example:
        >>> torch.random.manual_seed(0)
        >>> dset = coco_api.CocoDataset(coco_api.demo_coco_data())
        >>> self = YoloCocoDataset(dset, train=1)
        >>> loader = self.make_loader(batch_size=1)
        >>> train_iter = iter(loader)
        >>> # training batches should have multiple shapes
        >>> shapes = set()
        >>> for batch in train_iter:
        >>>     shapes.add(batch[0].shape[-1])
        >>>     if len(shapes) > 1:
        >>>         break
        >>> #assert len(shapes) > 1
        >>> vali_loader = iter(loaders['vali'])
        >>> vali_iter = iter(loaders['vali'])
        >>> # vali batches should have one shape
        >>> shapes = set()
        >>> for batch, _ in zip(vali_iter, [1, 2, 3, 4]):
        >>>     shapes.add(batch[0].shape[-1])
        >>> assert len(shapes) == 1
    """
    assert len(self) > 0, 'must have some data'
    if shuffle:
        if True:
            # If the data is not balanced we need to balance it
            index_to_weight = self._training_sample_weights()
            num_samples = len(self)
            index_to_weight = index_to_weight[:num_samples]
            sampler = torch_sampler.WeightedRandomSampler(index_to_weight,
                                                          num_samples,
                                                          replacement=True)
            sampler.data_source = self  # hack for use with multiscale
        else:
            sampler = torch_sampler.RandomSampler(self)
        resample_freq = 10
    else:
        sampler = torch_sampler.SequentialSampler(self)
        resample_freq = None

    # use custom sampler that does multiscale training
    batch_sampler = multiscale_batch_sampler.MultiScaleBatchSampler(
        sampler,
        batch_size=batch_size,
        resample_freq=resample_freq,
    )
    # torch.utils.data.sampler.WeightedRandomSampler
    loader = torch_data.DataLoader(self,
                                   batch_sampler=batch_sampler,
                                   collate_fn=nh.data.collate.padded_collate,
                                   num_workers=num_workers,
                                   pin_memory=pin_memory)
    if loader.batch_size != batch_size:
        try:
            loader.batch_size = batch_size
        except Exception:
            pass
    return loader
def sampler_c(dataset):
    from torch.utils.data import sampler

    train_targets = dataset.classes_for_all_imgs()
    # Configure per-class sampling ratios
    weights1 = torch.tensor([100, 0.01], dtype=torch.float)
    weights2 = torch.tensor([0.01, 100], dtype=torch.float)
    print(weights2)
    # Derive a sampling weight for each individual sample
    samples_weights1 = weights1[train_targets]
    samples_weights2 = weights2[train_targets]
    # Samplers; replacement=True means sampling with replacement
    sampler1 = sampler.WeightedRandomSampler(weights=samples_weights1,
                                             num_samples=3 * len(dataset),
                                             replacement=True)
    sampler2 = sampler.WeightedRandomSampler(weights=samples_weights2,
                                             num_samples=len(dataset),
                                             replacement=False)
    return sampler1, sampler2
def balanced_sampler(train_labels):
    '''Create a weighted sampler for training with unbalanced class counts.'''
    # For an unbalanced dataset create a weighted sampler
    class_counts, train_samples_weights = make_weights_for_balanced_classes(
        train_labels)
    # Make a sampler to undersample classes with the highest counts
    return sampler.WeightedRandomSampler(train_samples_weights,
                                         len(train_samples_weights),
                                         replacement=True)
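# `make_weights_for_balanced_classes` is called above but not defined in these
# excerpts. Below is a plausible minimal sketch matching this call site (a 1-D
# array of integer class ids in, a (counts, per-sample weights) pair out); it
# is an assumption, not the original implementation. Note the same-named
# helper used in load_RM_data further down has a different signature.
import numpy as np

def make_weights_for_balanced_classes(train_labels):
    '''Return (class counts, per-sample weights ~ 1 / class frequency).'''
    train_labels = np.asarray(train_labels)
    class_counts = np.bincount(train_labels)
    samples_weights = 1.0 / class_counts[train_labels]
    return class_counts, samples_weights.tolist()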
def sampler_c(dataset):
    from torch.utils.data import sampler

    train_targets = dataset.classes_for_all_imgs()
    # Configure a 4:1 per-class sampling ratio
    class_sample_counts = [4, 1]
    weights = torch.tensor(class_sample_counts, dtype=torch.float)
    # Derive a sampling weight for each individual sample
    samples_weights = weights[train_targets]
    # Sampler; replacement=True means sampling with replacement. Named so it
    # does not shadow the imported `sampler` module.
    weighted_sampler = sampler.WeightedRandomSampler(weights=samples_weights,
                                                     num_samples=len(dataset),
                                                     replacement=True)
    return weighted_sampler
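# Added illustration (example values are mine): `weights[train_targets]`
# relies on tensor fancy indexing to expand per-class weights into one weight
# per sample.
import torch

class_weights = torch.tensor([4.0, 1.0])        # classes 0 and 1, ratio 4:1
train_targets = torch.tensor([0, 1, 1, 0, 1])   # one class id per sample
print(class_weights[train_targets])             # tensor([4., 1., 1., 4., 1.])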
def uniform_sampler(dataset):
    n_imgs = len(dataset.imgs)  # total number of images
    imgs_per_class = [0] * len(dataset.classes)
    for _, cl in dataset.imgs:
        imgs_per_class[cl] += 1
    # Weight each image inversely to its class frequency.
    weight_per_img = [0] * n_imgs
    for idx, (_, cl) in enumerate(dataset.imgs):
        weight_per_img[idx] = n_imgs / imgs_per_class[cl]
    return sampler.WeightedRandomSampler(torch.DoubleTensor(weight_per_img),
                                         n_imgs)
def sample_inference(self, token_seqs):
    softmax = nn.Softmax()
    batch_size = len(token_seqs)
    seq_len = len(token_seqs[0])

    encoder_hidden = Variable(self.encoder.initHidden(batch_size)).double()
    encoder_output = Variable(self.encoder.initHidden(batch_size)).double()
    if use_cuda:
        encoder_hidden = encoder_hidden.cuda()
        encoder_output = encoder_output.cuda()
    hidden = (encoder_output, encoder_hidden)

    # Encode the input sequence in reverse order.
    for i in np.arange(seq_len - 1, 0, -1):
        token_batch = np.array(self.embeddings[token_seqs[:, i]])
        encoder_input = Variable(torch.from_numpy(token_batch)).view(
            1, batch_size, -1).double()
        encoder_input = encoder_input.cuda() if use_cuda else encoder_input
        hidden = self.encoder(encoder_input, hidden)

    encoder_output, encoder_hidden = hidden
    token_batch = np.array(self.embeddings[[SOS_TOKEN] * batch_size])
    decoder_output = Variable(self.decoder.initHidden(batch_size)).double()
    if use_cuda:
        decoder_output = decoder_output.cuda()
    hidden = (decoder_output, encoder_hidden)

    pred_seqs = None
    for i in range(250):
        decoder_input = Variable(torch.from_numpy(token_batch)).double()
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        decoder_input = decoder_input.squeeze().view(1, batch_size, -1)
        decoder_output, hidden = self.decoder(decoder_input, hidden)
        output = softmax(decoder_output).data.cpu()
        output = output.numpy()
        output = output.tolist()
        # Draw one token index according to the softmax distribution.
        out_iter = sampler.WeightedRandomSampler(output, len(output))
        for idx in out_iter:
            ni = np.array([[idx]])
            break
        if pred_seqs is None:
            pred_seqs = ni
        else:
            pred_seqs = np.concatenate((pred_seqs, ni), axis=1)
        token_batch = np.array(self.embeddings[ni])
    return pred_seqs.tolist()
def train(self, path, verbose=True):
    self.data = SplitData(path, self.index)
    self.index = self.data.index_dict  # new index from non-pretrained version
    train_set = self.data('train', random_crop_size=self.random_crop_size)
    if self.sample_amount:
        self.sampler = sampler.WeightedRandomSampler(
            torch.ones(len(train_set)), self.sample_amount)
    else:
        self.sampler = None  # no resampling requested
    self.train_loader = DataLoader(train_set,
                                   batch_size=self.batchsize[0],
                                   sampler=self.sampler,
                                   num_workers=self.numloader)
    self.val_loader = DataLoader(self.data('val'),
                                 batch_size=self.batchsize[1],
                                 num_workers=self.numloader)

    start_time = time.time()
    for epoch in range(self.epochs):
        self.current_epoch = epoch
        epoch_time = time.time()
        train_loss = self.model_iter(self.train_loader)
        self.output_list = torch.empty([0]).to(device)  # reset
        val_loss = self.model_iter(self.val_loader)
        epoch_time = time.time() - epoch_time

        # codes below are for printing
        if (epoch + 1) % 1 == 0 and verbose:
            print("Epoch {} finished in {}, current validation loss is "
                  "{:1.5f}, current train loss is {:1.5f}".format(
                      epoch + 1, timeformat(epoch_time), val_loss,
                      train_loss))
            print('It takes {} from beginning'.format(
                timeformat(time.time() - start_time)))

        # codes below are for sustainability
        if val_loss < self.minimum_val_loss:
            self.minimum_val_loss = val_loss
            save_model(self.model, epoch + 1, train_loss, val_loss,
                       self.index, self.optimizer.param_groups[0]['lr'],
                       time.strftime('%dd-%Hh-%Mm',
                                     time.localtime(start_time)))

        # codes below are for visualization
        loss_dict = {'{}_loss'.format(self.model_name):
                     {'train': train_loss, 'validation': val_loss}}
        visualize(loss_dict, epoch, mode='scalar_dict')
        img_dict = {'{} results'.format(self.model_name): self.output_list}
        if self.model_name == 'all':
            nrow = 10
        elif self.model_name == 'spn':
            nrow = 8
        else:
            nrow = 3
        visualize(img_dict, epoch, mode='image', nrow=nrow)
    self.test_unknown('./test')
def test_weight_sampler():
    from torch.utils.data import sampler

    # Indices 0-9 get weight 3; the remaining 20 indices get weight 1.
    weight = [1] * 30
    weight[:10] = [3] * 10
    weight_sampler = sampler.WeightedRandomSampler(weight,
                                                   num_samples=len(weight))
    batch_sampler = sampler.BatchSampler(weight_sampler,
                                         batch_size=4,
                                         drop_last=False)
    for indices in batch_sampler:
        print(indices)
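# Added sanity check (not part of the original test): indices 0-9 carry
# weight 3 and the remaining 20 indices weight 1, so the first ten indices
# should receive about 30 / (30 + 20) = 60% of the draws.
from collections import Counter
from torch.utils.data import sampler

weight = [3] * 10 + [1] * 20
ws = sampler.WeightedRandomSampler(weight, num_samples=30000)
counts = Counter(ws)
print(sum(counts[i] for i in range(10)) / 30000)  # approximately 0.6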
def init_dataloader(self, dset):
    class_sample_count = [len(c) for c in dset.classes]
    weights = 1 / torch.Tensor(class_sample_count)
    # Zero out infinite weights (classes with zero samples): for finite w,
    # (w + 1) != w holds, so the negated mask selects only the inf entries.
    weights[~((weights + 1) != weights)] = 0
    weight_per_sample = [0] * len(dset)
    for i in range(len(dset)):
        c, cind = dset.index_to_sample_d[i]
        weight_per_sample[i] = weights[c]
    self.trainsampler = sampler.WeightedRandomSampler(weight_per_sample,
                                                      len(dset))
    self.trainloader = DataLoader(dset,
                                  batch_size=self.opt.bSz,
                                  pin_memory=True,
                                  num_workers=self.opt.n_workers,
                                  sampler=self.trainsampler,
                                  drop_last=True)
def generate_sampler(dataset, sampler_option='random', step=1):
    """
    Returns sampler according to the wanted options

    :param dataset: (MRIDataset) the dataset to sample from
    :param sampler_option: (str) choice of sampler
    :param step: (int) step to discretize ages and give a weight per class
    :return: (Sampler)
    """
    df = dataset.df
    min_age = np.min(df.age)
    max_age = np.max(df.age)
    if (max_age - min_age) % step == 0:
        max_age += step
    bins = np.arange(min_age, max_age, step)
    count = np.zeros(len(bins))

    for idx in df.index:
        age = df.loc[idx, "age"]
        key = np.argmax(np.logical_and(age - step < bins,
                                       age >= bins)).astype(int)
        count[key] += 1

    # weight_per_class = (1 / np.array(count)) if count.any() != 0 else 0.
    weight_per_class = np.zeros_like(count).astype(float)
    np.divide(1., count, out=weight_per_class, where=count != 0)

    weights = [0] * len(df)
    for idx, age in enumerate(df.age.values):
        # Use the same binning condition as in the counting loop above.
        key = np.argmax(np.logical_and(age - step < bins,
                                       age >= bins)).astype(int)
        weights[idx] = weight_per_class[key]
    weights = torch.FloatTensor(weights)

    if sampler_option == 'random':
        s = sampler.RandomSampler(dataset, replacement=False)
    elif sampler_option == 'weighted':
        s = sampler.WeightedRandomSampler(weights, len(weights))
    else:
        raise NotImplementedError(
            "The option %s for sampler is not implemented" % sampler_option)
    return s
def get_dataloader(dataset, balance_data, batch_size, num_workers,
                   shuffle=True):
    if balance_data:
        weights = dataset.get_data_weights(balance_data)
        sampler_ = sampler.WeightedRandomSampler(weights, len(weights))
    elif shuffle:
        sampler_ = sampler.RandomSampler(dataset)
    else:
        sampler_ = None
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             sampler=sampler_,
                                             num_workers=num_workers)
    return dataloader
def ClassificationDatasetTest():
    root = '/media/shehabk/E_DRIVE/processed_db/expw/cropped_images/cropped_alligned_orig_256'
    image_list = '/media/shehabk/E_DRIVE/processed_db/expw/partition/seven_neutral_alligned_orig/train.txt'
    dataset = ClassificationDataset(
        root,
        image_list,
        transform=transforms.Compose([
            transforms.RandomRotation(3),
            transforms.Resize((118, 100)),
            transforms.RandomCrop((112, 96)),
            # transforms.RandomResizedCrop(224, scale=(0.74, 0.78),
            #                              ratio=(1.0, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ]))

    # Balancing the classes !!!!!!
    prob = np.zeros(3)
    for i in range(len(dataset)):
        cur_class = dataset[i][1]
        prob[cur_class] += 1
    prob = 1.0 / prob

    reciprocal_weights = np.zeros(len(dataset))
    epoch_length = 2000
    for i in range(len(dataset)):
        _, label = dataset[i]
        reciprocal_weights[i] = prob[label]

    weights = torch.from_numpy(reciprocal_weights)
    weighted_sampler = sampler.WeightedRandomSampler(weights, epoch_length)
    loader = DataLoader(dataset, batch_size=10, sampler=weighted_sampler)
    # loader = DataLoader(dataset, batch_size=5, shuffle=True)

    for i_batch, data in enumerate(loader):
        images, labels = data
        print(labels)
        out = utils.make_grid(images, nrow=1)
        imshow(out)
        # vs.imshow(out, mean=[0.524462], std=[0.285962])
        # vs.imshow(out, mean=[0.524462, 0.524462, 0.524462],
        #           std=[0.285962, 0.285962, 0.285962])
        break
def train_dataloader(self):
    if self.hparams.weight_sample:
        spl = sampler.WeightedRandomSampler(
            self.train_ds.target_weights,
            int(len(self.train_ds) / self.hparams.epoch_cut),
            replacement=True
        )
        return DataLoader(
            self.train_ds,
            batch_size=self.hparams.batch_size,
            sampler=spl,
            num_workers=self.hparams.num_workers,
            pin_memory=True,
        )
    else:
        return DataLoader(
            self.train_ds,
            batch_size=self.hparams.batch_size,
            shuffle=True,
            num_workers=self.hparams.num_workers,
            pin_memory=True,
        )
def load_RM_data(path, batch_size, length, use_embedding,
                 balanced_sampler=False):
    train = RMdata(path, use_embedding=use_embedding, length=length,
                   mode='train')
    valid = RMdata(path, use_embedding=use_embedding, length=length,
                   mode='valid')
    if not balanced_sampler:
        train_loader = DataLoader(dataset=train, batch_size=batch_size,
                                  shuffle=True)
    else:
        weights_train = make_weights_for_balanced_classes(train)
        # weights_valid = make_weights_for_balanced_classes(valid)
        weights_train = torch.cuda.DoubleTensor(weights_train)
        # weights_valid = torch.cuda.DoubleTensor(weights_valid)
        sampler_train = sampler.WeightedRandomSampler(weights_train,
                                                      len(weights_train))
        # sampler_valid = sampler.WeightedRandomSampler(weights_valid,
        #                                               len(weights_valid))
        train_loader = DataLoader(dataset=train, batch_size=batch_size,
                                  sampler=sampler_train)
        # valid_loader = DataLoader(dataset=valid, batch_size=batch_size,
        #                           sampler=sampler_valid)
    valid_loader = DataLoader(dataset=valid, batch_size=batch_size,
                              shuffle=True)
    return train_loader, valid_loader
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset, sampler


def dataloader(x_mal, x_ben):
    y_mal = np.ones(x_mal.shape[0])
    y_ben = np.zeros(x_ben.shape[0])
    x = np.concatenate([x_mal, x_ben])
    y = np.concatenate([y_mal, y_ben])

    # Per-class weights: inverse of each class's frequency.
    class_sample_count = np.array(
        [len(np.where(y == t)[0]) for t in np.unique(y)])
    weight = 1. / class_sample_count
    samples_weight = []
    for t in range(len(y)):  # one weight per sample, covering every index
        samples_weight.append(weight[int(y[t])])

    weighted_sampler = sampler.WeightedRandomSampler(samples_weight,
                                                     len(samples_weight))
    data = TensorDataset(torch.from_numpy(x), torch.from_numpy(y))
    data_loader = DataLoader(data,
                             batch_size=batch_size,
                             sampler=weighted_sampler,
                             drop_last=True)
    return data_loader
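# Added smoke test (illustrative values; relies on the imports added to the
# snippet above): the weighted sampler should roughly balance a 1:9 class
# ratio within each batch.
batch_size = 64  # module-level global read by dataloader()
x_mal = np.random.randn(100, 8).astype(np.float32)   # minority (label 1)
x_ben = np.random.randn(900, 8).astype(np.float32)   # majority (label 0)
loader = dataloader(x_mal, x_ben)
_, y_batch = next(iter(loader))
print(y_batch.float().mean())  # close to 0.5 despite the skewed inputs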
def make_train_valid_loaders(self, distributed=False
                             ) -> Tuple[DataLoader, DataLoader]:
    train_dataset, valid_dataset = self.make_train_valid_datasets()
    train_weights = torch.DoubleTensor(
        [1.0] * len(train_dataset))  # uniform sampling
    # num_samples fixes the epoch length independently of the dataset size.
    train_sampler = sampler.WeightedRandomSampler(
        weights=train_weights,
        num_samples=self._data_params['batch_size'] *
        self._data_params['steps_per_epoch'],
    )
    train_loader = self._make_loader(train_dataset,
                                     train_sampler,
                                     mode='train',
                                     distributed=distributed)
    valid_loader = self._make_loader(
        valid_dataset,
        sampler.SequentialSampler(valid_dataset),
        mode='valid',
        distributed=distributed,
    )
    return train_loader, valid_loader
def get_dataloader_obj(data_dir, data_transforms, weights, num_samples,
                       is_slr=False, data_types=['train', 'test', 'val'],
                       bs=4):
    try:
        slr = sampler.WeightedRandomSampler(weights, num_samples)
        image_datasets = {
            x: datasets.ImageFolder(os.path.join(data_dir, x),
                                    data_transforms[x])
            for x in data_types
        }
        dataloaders = {
            x: DataLoader(image_datasets[x],
                          batch_size=bs,
                          shuffle=True,
                          num_workers=bs)
            for x in data_types
        }
        if is_slr:
            dataloaders = {
                x: DataLoader(image_datasets[x],
                              batch_size=bs,
                              sampler=slr,
                              num_workers=bs)
                for x in data_types
            }
        dsizes = {x: len(image_datasets[x]) for x in data_types}
        class_names = image_datasets['train'].classes
        dev = device("cuda:0" if cuda.is_available() else "cpu")
        return dataloaders, dsizes, class_names, dev
    except Exception as e:
        print(traceback.format_exc())
        raise e
def main(train_root, train_csv, train_split, val_root, val_csv, val_split,
         epochs, aug, model_name, batch_size, num_workers, val_samples,
         early_stopping_patience, n_classes, weighted_loss, balanced_loader,
         _run):
    assert model_name in ('inceptionv4', 'resnet152', 'densenet161',
                          'senet154')

    AUGMENTED_IMAGES_DIR = os.path.join(fs_observer.dir, 'images')
    CHECKPOINTS_DIR = os.path.join(fs_observer.dir, 'checkpoints')
    BEST_MODEL_PATH = os.path.join(CHECKPOINTS_DIR, 'model_best.pth')
    LAST_MODEL_PATH = os.path.join(CHECKPOINTS_DIR, 'model_last.pth')
    for directory in (AUGMENTED_IMAGES_DIR, CHECKPOINTS_DIR):
        os.makedirs(directory)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if model_name == 'inceptionv4':
        model = ptm.inceptionv4(num_classes=1000, pretrained='imagenet')
        model.last_linear = nn.Linear(model.last_linear.in_features,
                                      n_classes)
        aug['size'] = 299
        aug['mean'] = model.mean
        aug['std'] = model.std
    elif model_name == 'resnet152':
        model = models.resnet152(pretrained=True)
        model.fc = nn.Linear(model.fc.in_features, n_classes)
        aug['size'] = 224
        aug['mean'] = [0.485, 0.456, 0.406]
        aug['std'] = [0.229, 0.224, 0.225]
    elif model_name == 'densenet161':
        model = models.densenet161(pretrained=True)
        model.classifier = nn.Linear(model.classifier.in_features, n_classes)
        aug['size'] = 224
        aug['mean'] = [0.485, 0.456, 0.406]
        aug['std'] = [0.229, 0.224, 0.225]
    elif model_name == 'senet154':
        model = ptm.senet154(num_classes=1000, pretrained='imagenet')
        model.last_linear = nn.Linear(model.last_linear.in_features,
                                      n_classes)
        aug['size'] = model.input_size[1]
        aug['mean'] = model.mean
        aug['std'] = model.std
    model.to(device)

    augs = Augmentations(**aug)
    model.aug_params = aug

    train_ds = CSVDatasetWithName(train_root, train_csv, 'image', 'label',
                                  transform=augs.tf_transform,
                                  add_extension='.jpg', split=train_split)
    val_ds = CSVDatasetWithName(val_root, val_csv, 'image', 'label',
                                transform=augs.tf_transform,
                                add_extension='.jpg', split=val_split)

    datasets = {'train': train_ds, 'val': val_ds}

    if balanced_loader:
        data_sampler = sampler.WeightedRandomSampler(train_ds.sampler_weights,
                                                     len(train_ds))
        shuffle = False
    else:
        data_sampler = None
        shuffle = True

    dataloaders = {
        'train': DataLoader(datasets['train'],
                            batch_size=batch_size,
                            shuffle=shuffle,
                            num_workers=num_workers,
                            sampler=data_sampler,
                            worker_init_fn=set_seeds),
        'val': DataLoader(datasets['val'],
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=num_workers,
                          worker_init_fn=set_seeds),
    }

    if weighted_loss:
        criterion = nn.CrossEntropyLoss(
            weight=torch.Tensor(datasets['train'].class_weights_list).cuda())
    else:
        criterion = nn.CrossEntropyLoss()

    optimizer = optim.SGD(model.parameters(),
                          lr=0.001,
                          momentum=0.9,
                          weight_decay=0.001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.1,
                                                     min_lr=1e-5,
                                                     patience=10)

    metrics = {
        'train': pd.DataFrame(columns=['epoch', 'loss', 'acc']),
        'val': pd.DataFrame(columns=['epoch', 'loss', 'acc'])
    }

    best_val_loss = 1000.0
    epochs_without_improvement = 0
    batches_per_epoch = None

    for epoch in range(epochs):
        print('train epoch {}/{}'.format(epoch + 1, epochs))
        epoch_train_result = train_epoch(device, model, dataloaders,
                                         criterion, optimizer, 'train',
                                         batches_per_epoch)
        metrics['train'] = metrics['train'].append(
            {**epoch_train_result, 'epoch': epoch}, ignore_index=True)
        print('train', epoch_train_result)

        epoch_val_result = train_epoch(device, model, dataloaders, criterion,
                                       optimizer, 'val', batches_per_epoch)
        metrics['val'] = metrics['val'].append(
            {**epoch_val_result, 'epoch': epoch}, ignore_index=True)
        print('val', epoch_val_result)

        scheduler.step(epoch_val_result['loss'])

        if epoch_val_result['loss'] < best_val_loss:
            best_val_loss = epoch_val_result['loss']
            epochs_without_improvement = 0
            torch.save(model, BEST_MODEL_PATH)
            print('Best loss at epoch {}'.format(epoch))
        else:
            epochs_without_improvement += 1

        print('-' * 40)

        if epochs_without_improvement > early_stopping_patience:
            torch.save(model, LAST_MODEL_PATH)
            break
        if epoch == (epochs - 1):
            torch.save(model, LAST_MODEL_PATH)

    for phase in ['train', 'val']:
        metrics[phase].epoch = metrics[phase].epoch.astype(int)
        metrics[phase].to_csv(os.path.join(fs_observer.dir, phase + '.csv'),
                              index=False)

    print('Best validation loss: {}'.format(best_val_loss))

    # TODO: return more metrics
    return {'max_val_acc': metrics['val']['acc'].max()}
def get_dataloader_train(args, root, image_list):
    # kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    kwargs = {'num_workers': args.num_workers, 'pin_memory': args.pin_memory}

    if args.architecture in ['resnet_i48_18']:
        dataset = ds.ClassificationDataset(
            root,
            image_list,
            transform=transforms.Compose([
                transforms.Resize(60),
                transforms.RandomRotation(3),
                transforms.RandomCrop(48),
                # transforms.RandomResizedCrop(48, scale=(0.72, 0.76),
                #                              ratio=(1.0, 1.0)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize((0.5, ), (0.28, ))
            ]),
            is_grey=False)
    elif args.architecture in ['resnet_i24_34']:
        dataset = ds.ClassificationDataset(
            root,
            image_list,
            transform=transforms.Compose([
                transforms.Resize(28),
                transforms.RandomRotation(3),
                transforms.RandomCrop(24),
                # transforms.RandomResizedCrop(48, scale=(0.72, 0.76),
                #                              ratio=(1.0, 1.0)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])
            ]),
            is_grey=False)
    else:
        dataset = ds.ClassificationDataset(
            root,
            image_list,
            transform=transforms.Compose([
                transforms.Resize(256),
                transforms.RandomRotation(3),
                # transforms.RandomCrop(224),
                transforms.RandomResizedCrop(224, scale=(0.74, 0.85),
                                             ratio=(1.0, 1.0)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])
            ]),
        )

    # Balancing the classes !!!!!!
    prob = np.zeros(args.num_classes)
    for i in range(len(dataset)):
        cur_class = dataset.labels[i]
        prob[cur_class] += 1
    prob = 1.0 / prob

    reciprocal_weights = np.zeros(len(dataset))
    epoch_length = args.epoch_length
    for i in range(len(dataset)):
        label = dataset.labels[i]
        reciprocal_weights[i] = prob[label]

    weights = torch.from_numpy(reciprocal_weights)
    weighted_sampler = sampler.WeightedRandomSampler(weights, epoch_length)

    loader = DataLoader(dataset,
                        batch_size=args.batch_size,
                        sampler=weighted_sampler,
                        **kwargs)
    # loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
    return loader
    }
else:
    dsets = {
        x: MyImageFolder(root=args.data_root,
                         idx_file=idx_files[x],
                         transform=data_transforms[x])
        for x in data_splits
    }

shuffle_options = {'train': True, 'val': False, 'test': False}
if args.weighted_sample:
    sample_weight = dsets['train'].get_sample_weights()
    samplers = {
        'train': sampler.WeightedRandomSampler(
            weights=sample_weight, num_samples=len(dsets['train'])),
        'val': None,
        'test': None
    }
else:
    # RandomSampler in train phase, SequentialSampler in val and test phase.
    samplers = {'train': None, 'val': None, 'test': None}

dset_loaders = {
    x: torch.utils.data.DataLoader(
        dsets[x],
        batch_size=args.batch_size,
        # DataLoader treats shuffle and sampler as mutually exclusive,
        # so shuffling is disabled whenever an explicit sampler is given.
        shuffle=shuffle_options[x] if samplers[x] is None else False,
        sampler=samplers[x],
def train(arguments):
    # Parse input arguments
    json_filename = arguments.config
    network_debug = arguments.debug

    # Load options
    json_opts = json_file_to_pyobj(json_filename)
    train_opts = json_opts.training

    # Architecture type
    arch_type = train_opts.arch_type

    # Setup Dataset and Augmentation
    ds_class = get_dataset(arch_type)
    ds_path = get_dataset_path(arch_type, json_opts.data_path)
    ds_transform = get_dataset_transformation(arch_type,
                                              opts=json_opts.augmentation)

    # Setup the NN Model
    model = get_model(json_opts.model)
    if network_debug:
        print('# of pars: ', model.get_number_parameters())
        print('fp time: {0:.3f} sec\tbp time: {1:.3f} sec per sample'.format(
            *model.get_fp_bp_time()))
        exit()

    # Setup Data Loader
    num_workers = train_opts.num_workers if hasattr(train_opts,
                                                    'num_workers') else 16
    train_dataset = ds_class(ds_path,
                             split='train',
                             transform=ds_transform['train'],
                             preload_data=train_opts.preloadData)
    valid_dataset = ds_class(ds_path,
                             split='val',
                             transform=ds_transform['valid'],
                             preload_data=train_opts.preloadData)
    test_dataset = ds_class(ds_path,
                            split='test',
                            transform=ds_transform['valid'],
                            preload_data=train_opts.preloadData)

    # create sampler
    if train_opts.sampler == 'stratified':
        print('stratified sampler')
        train_sampler = StratifiedSampler(train_dataset.labels,
                                          train_opts.batchSize)
        batch_size = 52
    elif train_opts.sampler == 'weighted2':
        print('weighted sampler with background weight={}x'.format(
            train_opts.bgd_weight_multiplier))
        # modify and increase background weight
        weight = train_dataset.weight
        bgd_weight = np.min(weight)
        weight[abs(weight - bgd_weight) < 1e-8] = \
            bgd_weight * train_opts.bgd_weight_multiplier
        train_sampler = sampler.WeightedRandomSampler(
            weight, len(train_dataset.weight))
        batch_size = train_opts.batchSize
    else:
        print('weighted sampler')
        train_sampler = sampler.WeightedRandomSampler(
            train_dataset.weight, len(train_dataset.weight))
        batch_size = train_opts.batchSize

    # loader
    train_loader = DataLoader(dataset=train_dataset,
                              num_workers=num_workers,
                              batch_size=batch_size,
                              sampler=train_sampler)
    valid_loader = DataLoader(dataset=valid_dataset,
                              num_workers=num_workers,
                              batch_size=train_opts.batchSize,
                              shuffle=True)
    test_loader = DataLoader(dataset=test_dataset,
                             num_workers=num_workers,
                             batch_size=train_opts.batchSize,
                             shuffle=True)

    # Visualisation Parameters
    visualizer = Visualiser(json_opts.visualisation, save_dir=model.save_dir)
    error_logger = ErrorLogger()

    # Training Function
    track_labels = np.arange(len(train_dataset.label_names))
    model.set_labels(track_labels)
    model.set_scheduler(train_opts)
    if hasattr(model, 'update_state'):
        model.update_state(0)

    for epoch in range(model.which_epoch, train_opts.n_epochs):
        print('(epoch: %d, total # iters: %d)' % (epoch, len(train_loader)))

        # # # --- Start ---
        # import matplotlib.pyplot as plt
        # plt.ion()
        # plt.figure()
        # target_arr = np.zeros(14)
        # # # --- End ---

        # Training Iterations
        for epoch_iter, (images, labels) in tqdm(enumerate(train_loader, 1),
                                                 total=len(train_loader)):
            # Make a training update
            model.set_input(images, labels)
            model.optimize_parameters()
            if epoch == (train_opts.n_epochs - 1):
                import time
                time.sleep(36000)
            if train_opts.max_it == epoch_iter:
                break

            # # # --- visualise distribution ---
            # for lab in labels.numpy():
            #     target_arr[lab] += 1
            # plt.clf(); plt.bar(train_dataset.label_names, target_arr); plt.pause(0.01)
            # # # --- End ---

            # Visualise predictions
            if epoch_iter <= 100:
                visuals = model.get_current_visuals()
                visualizer.display_current_results(visuals,
                                                   epoch=epoch,
                                                   save_result=False)

            # Error visualisation
            errors = model.get_current_errors()
            error_logger.update(errors, split='train')

        # Validation and Testing Iterations
        pr_lbls = []
        gt_lbls = []
        for loader, split in zip([valid_loader, test_loader],
                                 ['validation', 'test']):
            model.reset_results()
            for epoch_iter, (images, labels) in tqdm(enumerate(loader, 1),
                                                     total=len(loader)):
                # Make a forward pass with the model
                model.set_input(images, labels)
                model.validate()

                # Visualise predictions
                visuals = model.get_current_visuals()
                visualizer.display_current_results(visuals,
                                                   epoch=epoch,
                                                   save_result=False)
                if train_opts.max_it == epoch_iter:
                    break

            # Error visualisation
            errors = model.get_accumulated_errors()
            stats = model.get_classification_stats()
            error_logger.update({**errors, **stats}, split=split)

            # HACK save validation error
            if split == 'validation':
                valid_err = errors['CE']

        # Update the plots
        for split in ['train', 'validation', 'test']:
            # exclude background
            # track_labels = np.delete(track_labels, 3)
            # show_labels = train_dataset.label_names[:3] + train_dataset.label_names[4:]
            show_labels = train_dataset.label_names
            visualizer.plot_current_errors(epoch,
                                           error_logger.get_errors(split),
                                           split_name=split,
                                           labels=show_labels)
            visualizer.print_current_errors(epoch,
                                            error_logger.get_errors(split),
                                            split_name=split)
        error_logger.reset()

        # Save the model parameters
        if epoch % train_opts.save_epoch_freq == 0:
            model.save(epoch)
            if hasattr(model, 'update_state'):
                model.update_state(epoch)

        # Update the model learning rate
        model.update_learning_rate(metric=valid_err, epoch=epoch)
print('#vehicles found: ', data_splitter.num_of_vehicles)

# Setup training and validation data loaders.
flag_data_augmentation = False
flag_with_intensities = False
hdf5_dataset = HDF5Dataset(data_splitter,
                           'training',
                           flag_data_augmentation=flag_data_augmentation,
                           flag_with_intensities=flag_with_intensities)
hdf5_dataset_val = HDF5Dataset(data_splitter,
                               'validation',
                               flag_data_augmentation=False,
                               flag_with_intensities=flag_with_intensities)

# Weighted sampling: sample pedestrians more than vehicles. The instance is
# named so it does not shadow the imported `sampler` module.
weighted_sampler = sampler.WeightedRandomSampler(sample_weights,
                                                 len(sample_weights))
train_loader = DataLoader(hdf5_dataset,
                          batch_size=opt.batchSize,
                          sampler=weighted_sampler,
                          shuffle=False)  # , num_workers=2)
val_loader = DataLoader(hdf5_dataset_val,
                        batch_size=opt.batchSize,
                        shuffle=False)
# print(len(dataset), len(test_dataset))

num_classes = 2
# print('classes', num_classes)

try:
    os.makedirs(opt.outf)
except OSError:
print("Training set distribution:") print_distribution(ids_train) print("Validation set distribution:") print_distribution(ids_val) classes_train = [get_class(idx.split('/')[-2]) for idx in ids_train] class_weight = class_weight.compute_class_weight('balanced', np.unique(classes_train), classes_train) classes_val = [get_class(idx.split('/')[-2]) for idx in ids_val] weights = [class_weight[i_class] for i_class in classes_train] weights = torch.DoubleTensor(weights) train_sampler = sampler.WeightedRandomSampler(weights, len(weights)) weights = [class_weight[i_class] for i_class in classes_val] weights = torch.DoubleTensor(weights) val_sampler = sampler.WeightedRandomSampler(weights, len(weights)) train_dataset = IEEECameraDataset(ids_train, crop_size=CROP_SIZE, training=True, model=m_names) val_dataset = IEEECameraDataset(ids_val, crop_size=CROP_SIZE, training=False, model=m_names) train_loader = DataLoader(train_dataset,
def build(task, config, task_subset_name):
    """
    Static method returning a particular sampler, depending on the name \
    provided in the list of parameters & the specified task class.

    :param task: Instance of an object derived from the Task class.
    :type task: ``tasks.Task``

    :param config: Parameters used to instantiate the sampler.
    :type config: :py:class:`ptp.configuration.ConfigInterface`

    :param task_subset_name: Name of task subset (and associated TaskManager object)

    .. note::
        ``config`` should contain the exact (case-sensitive) class name of the sampler to instantiate.

    .. warning::
        ``torch.utils.data.sampler.BatchSampler``, \
        ``torch.utils.data.sampler.DistributedSampler`` are not supported yet.

    .. note::
        ``torch.utils.data.sampler.SubsetRandomSampler`` expects 'indices' to index a subset of the dataset. \
        Currently, the user can specify these indices using one of the following options:

        - Option 1: range.
        >>> indices = range(20)

        - Option 2: range as str.
        >>> range_str = '0, 20'

        - Option 3: list of indices.
        >>> yaml_list = yaml.load('[0, 2, 5, 10]')

        - Option 4: name of the file containing indices.
        >>> filename = "~/data/mnist/training_indices.txt"

    .. note::
        ``torch.utils.data.sampler.WeightedRandomSampler`` expects an additional parameter 'weights'.

    :return: Instance of a given sampler or ``None`` if the section is not present or the sampler couldn't be built.
    """
    # Initialize logger.
    logger = logging.initialize_logger('SamplerFactory')

    try:
        # Check presence of the typename attribute.
        if 'type' not in config:
            raise ConfigurationError(
                "The sampler configuration section does not contain the key 'type'"
            )

        # Get the class typename.
        typename = config['type']
        logger.info(
            'Trying to instantiate the {} sampler object'.format(typename))

        ###########################################################################
        # Handle first special case: SubsetRandomSampler.
        if typename == 'SubsetRandomSampler':

            # Check presence of the typename attribute.
            if 'indices' not in config:
                raise ConfigurationError(
                    "The sampler configuration section does not contain the key 'indices' "
                    "required by SubsetRandomSampler")

            # Get and process the indices.
            indices = config['indices']

            # Analyze the type.
            if type(indices) == str:
                # Try to open the file.
                try:
                    # from expanduser()'s doc: If the expansion fails or if the path does not begin
                    # with a tilde, the path is returned unchanged. -> So operation below should be safe.
                    file = open(os.path.expanduser(indices), "r")
                    # Read the file.
                    indices = file.readline()
                    file.close()
                except Exception:
                    # Ok, this is not a file.
                    pass
                finally:
                    # Try to process it as a string.
                    # Get the digits.
                    digits = indices.split(',')
                    indices = [int(x) for x in digits]
            else:
                # Assume that type(indices) is a list of ints.
                digits = indices

            # Finally, we got the list of digits.
            if len(digits) == 2:
                # Create a range.
                indices = range(int(digits[0]), int(digits[1]))
            # Else: use them as they are, including single index.

            # Check if indices are within range.
            if max(indices) >= len(task):
                raise ConfigurationError(
                    "SubsetRandomSampler cannot work properly when indices are out of range ({}) "
                    "considering that there are {} samples in the task".format(
                        max(indices), len(task)))

            # Create the sampler object.
            sampler = pt_samplers.SubsetRandomSampler(indices)

        ###########################################################################
        # Handle second special case: WeightedRandomSampler.
        elif typename == 'WeightedRandomSampler':

            # Check presence of the attribute.
            if 'weights' not in config:
                raise ConfigurationError(
                    "The sampler configuration section does not contain the key 'weights' "
                    "required by WeightedRandomSampler")

            # Load weights from file.
            weights = np.fromfile(os.path.expanduser(config['weights']),
                                  dtype=float,
                                  count=-1,
                                  sep=',')

            # Create sampler class.
            sampler = pt_samplers.WeightedRandomSampler(weights,
                                                        len(task),
                                                        replacement=True)

        ###########################################################################
        # Handle third special case: kFoldRandomSampler.
        elif typename == 'kFoldRandomSampler':

            # Check presence of the attribute.
            if 'folds' not in config:
                raise ConfigurationError(
                    "The sampler configuration section does not contain the key 'folds' "
                    "required by kFoldRandomSampler")

            # Create indices, depending on the fold.
            folds = config["folds"]
            if folds < 2:
                raise ConfigurationError(
                    "kFoldRandomSampler requires at least two 'folds'")

            # Get epochs per fold (default: 1).
            epochs_per_fold = config.get("epochs_per_fold", 1)

            # Create the sampler object.
            sampler = ptp_samplers.kFoldRandomSampler(
                len(task), folds, epochs_per_fold,
                task_subset_name == 'training')

        ###########################################################################
        # Handle fourth special case: kFoldWeightedRandomSampler.
        elif typename == 'kFoldWeightedRandomSampler':

            # Check presence of the attribute.
            if 'weights' not in config:
                raise ConfigurationError(
                    "The sampler configuration section does not contain the key 'weights' "
                    "required by kFoldWeightedRandomSampler")

            # Load weights from file.
            weights = np.fromfile(os.path.expanduser(config['weights']),
                                  dtype=float,
                                  count=-1,
                                  sep=',')

            # Check presence of the attribute.
            if 'folds' not in config:
                raise ConfigurationError(
                    "The sampler configuration section does not contain the key 'folds' "
                    "required by kFoldWeightedRandomSampler")

            # Create indices, depending on the fold.
            folds = config["folds"]
            if folds < 2:
                raise ConfigurationError(
                    "kFoldWeightedRandomSampler requires at least two 'folds'")

            # Get epochs per fold (default: 1).
            epochs_per_fold = config.get("epochs_per_fold", 1)

            # Create the sampler object.
            sampler = ptp_samplers.kFoldWeightedRandomSampler(
                weights, len(task), folds, epochs_per_fold,
                task_subset_name == 'training')

        elif typename in ['BatchSampler', 'DistributedSampler']:
            # Sorry, don't support those. Yet ;)
            raise ConfigurationError(
                "Sampler Factory currently does not support the '{}' sampler. Please pick one of the others "
                "or use default random sampling".format(typename))

        else:
            # Verify that the specified class is in the samplers package.
            if typename not in dir(pt_samplers):
                raise ConfigurationError(
                    "Could not find the specified class '{}' in the samplers package"
                    .format(typename))

            # Get the sampler class.
            sampler_class = getattr(pt_samplers, typename)
            # Create "regular" sampler.
            sampler = sampler_class(task)

        # Return sampler.
        return sampler

    except ConfigurationError as e:
        logger.error(e)
        # Do not continue with invalid sampler.
        exit(-1)