def main():
    # Move up and down: ten vertically shifted copies per image,
    # with random offsets of 1-3 pixels.
    train_data_sub_translation = []
    for i in range(3000):
        train_data_sub_translation.extend(
            map(lambda x: img_translation(trainset_import.train_data[i], x),
                np.random.randint(1, 4, 10)))

    # Add noise: ten noisy copies per image.
    train_data_sub_noise = []
    for i in range(3000):
        train_data_sub_noise.extend(
            map(lambda x: add_noise(trainset_import.train_data[i], x),
                range(10)))

    # Rotate: rotate_random returns a batch of rotated copies per image.
    train_data_sub_rotate = []
    for i in range(3000):
        train_data_sub_rotate.extend(
            rotate_random(trainset_import.train_data[i]))

    train_data_sub = (train_data_sub_rotate + train_data_sub_translation
                      + train_data_sub_noise)

    # Label array: each source image contributes ten augmented copies, and
    # the three augmentation blocks above share the same label layout.
    train_labels_sub = []
    for i in range(3000):
        train_labels_sub.extend([trainset_import.train_labels[i]] * 10)
    train_labels = train_labels_sub * 3
    train_labels = torch.from_numpy(np.array(train_labels))

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    trainset_new = subMNIST(root='./data/', train=True, download=True,
                            transform=transform, k=90000)
    trainset_new.train_data = train_data_sub
    trainset_new.train_labels = train_labels
    pickle.dump(trainset_new, open("./data/train_labeled_allmethod.p", "wb"))
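
# img_translation, add_noise and rotate_random are called above but not
# defined in this file. Below is a minimal sketch of what they might look
# like, assuming 28x28 uint8 MNIST images and scipy; the signatures follow
# the call sites above, but the bodies are illustrative guesses only.
import numpy as np
from scipy import ndimage


def img_translation(img, offset):
    # Shift the image up/down by `offset` pixels, filling with background.
    return ndimage.shift(np.asarray(img), shift=(offset, 0), cval=0)


def add_noise(img, seed):
    # Add clipped Gaussian pixel noise; `seed` makes each copy distinct.
    arr = np.asarray(img, dtype=np.float64)
    rng = np.random.RandomState(seed)
    noisy = arr + rng.normal(0, 10, arr.shape)
    return np.clip(noisy, 0, 255).astype(np.uint8)


def rotate_random(img):
    # Return ten copies rotated by random angles in [-30, 30] degrees.
    angles = np.random.uniform(-30, 30, 10)
    return [ndimage.rotate(np.asarray(img), a, reshape=False) for a in angles]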
def loadDataForLabeling():
    # Careful: the unlabeled loader must NOT be shuffled, so that the
    # predicted labels line up with the underlying samples.
    train_loader2 = torch.utils.data.DataLoader(trainset_new_unl,
                                                batch_size=1,
                                                shuffle=False)

    # Predict labels for the unlabeled set with the trained model.
    testAndLabel(1, trainedModel, train_loader2)

    # Train on the full, newly labeled set.
    print('Beginning training loops phase 2')

    # Load the new labels.
    train_labels_sub_unl = torch.from_numpy(
        np.load("trainset_np_unlLabels.npy"))
    train_labels_sub_unl = train_labels_sub_unl.type(torch.long)
    trainset_new_unl.train_labels = train_labels_sub_unl

    trainset_full = subMNIST(root='./data', train=True,
                             transform=transform, k=50000)
    trainset_full.train_data = torch.cat(
        (trainset_new_unl.train_data, trainset_new.train_data), 0)
    trainset_full.train_labels = torch.cat(
        (trainset_new_unl.train_labels, trainset_new.train_labels), 0)

    train_loader3 = torch.utils.data.DataLoader(trainset_full,
                                                batch_size=64,
                                                shuffle=False)
    print('Trainset train_data ' + str(trainset_full.train_data.size()))
    return train_loader3
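
# testAndLabel is called above but defined elsewhere. A sketch of the
# pseudo-labeling step it presumably performs, assuming the model outputs
# class scores; the .npy file name matches the np.load call above, the
# rest is an assumption.
def testAndLabel(epoch, model, loader):
    model.eval()
    preds = []
    with torch.no_grad():
        for data, _ in loader:
            output = model(data)
            preds.append(output.argmax(dim=1).item())  # batch_size is 1
    np.save("trainset_np_unlLabels.npy", np.array(preds))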
def join_MNIST_tensors(in1, in2):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    joint_data = torch.cat((in1.train_data, in2.train_data), 0)
    joint_labels = torch.cat((in1.train_labels, in2.train_labels), 0)
    joint_total = subMNIST(root='./data', train=True, download=True,
                           transform=transform,
                           k=in1.train_data.size()[0] + in2.train_data.size()[0])
    joint_total.train_data = joint_data.clone()
    joint_total.train_labels = joint_labels.clone()
    return joint_total
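
# A usage sketch for join_MNIST_tensors: merge the hand-labeled set with a
# pseudo-labeled set and build one loader over the union. The argument
# names are the ones used elsewhere in this file; the batch size is
# arbitrary.
def build_joint_loader(trainset_new, trainset_new_unl):
    joint = join_MNIST_tensors(trainset_new, trainset_new_unl)
    return torch.utils.data.DataLoader(joint, batch_size=64, shuffle=True)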
# Augment every labeled sample with DataAug and append the augmented
# copies to the original arrays.
da = DataAug()
augmented_data, augmented_label = da.dataaug(train_data_sub[0],
                                             train_labels_sub[0])
for i in range(1, train_data_sub.shape[0]):
    tdata, tlabel = da.dataaug(train_data_sub[i], train_labels_sub[i])
    augmented_data = np.append(augmented_data, tdata, axis=0)
    augmented_label = np.append(augmented_label, tlabel)

train_data_sub = np.append(train_data_sub, augmented_data, axis=0)
train_labels_sub = np.append(train_labels_sub, augmented_label, axis=0)

augdata = train_data_sub
auglabel = train_labels_sub
print(augdata.shape)
print(auglabel.shape)

train_data_sub = torch.from_numpy(augdata)
train_labels_sub = torch.from_numpy(auglabel)
print(train_labels_sub.size())
print(train_data_sub.size())

trainset_new = subMNIST(root='./data', train=True, download=True,
                        transform=transform, k=18000)
trainset_new.train_data = train_data_sub.clone()
trainset_new.train_labels = train_labels_sub.clone()
pickle.dump(trainset_new, open("data/train_labeled_aug.p", "wb"))
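
# Note: np.append above copies the whole accumulated array on every
# iteration, which makes the loop quadratic in the dataset size. A linear
# alternative (a sketch, assuming the same DataAug interface) collects the
# pieces in lists and concatenates once:
def augment_all(da, data, labels):
    data_parts, label_parts = [], []
    for i in range(data.shape[0]):
        tdata, tlabel = da.dataaug(data[i], labels[i])
        data_parts.append(tdata)
        label_parts.append(tlabel)
    return np.concatenate(data_parts, axis=0), np.concatenate(label_parts)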
def dataLoader():
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    trainset_original = datasets.MNIST('../data', train=True,
                                       download=True, transform=transform)

    # Pick 300 labeled training samples and 1000 validation samples per class.
    train_label_index = []
    valid_label_index = []
    for i in range(10):
        train_label_list = trainset_original.train_labels.numpy()
        label_index = np.where(train_label_list == i)[0]
        label_subindex = list(label_index[:300])
        valid_subindex = list(label_index[300:1000 + 300])
        train_label_index += label_subindex
        valid_label_index += valid_subindex

    # Train set
    trainset_np = trainset_original.train_data.numpy()
    trainset_label_np = trainset_original.train_labels.numpy()
    train_data_sub = torch.from_numpy(trainset_np[train_label_index])
    train_labels_sub = torch.from_numpy(trainset_label_np[train_label_index])

    trainset_new = subMNIST(root='./data', train=True, download=True,
                            transform=transform, k=3000)
    trainset_new.train_data = train_data_sub.clone()
    trainset_new.train_labels = train_labels_sub.clone()
    pickle.dump(trainset_new, open("train_labeled.p", "wb"))

    #### Augmenting the training set
    ## Initialize the trainset as usual.
    trainset_aug = subMNIST(root='./data', train=True, download=True,
                            transform=transform, k=30000)

    ## Turns out you can just repeat a tensor, cool:
    ## http://pytorch.org/docs/tensors.html#torch.Tensor.repeat
    trainset_aug.train_data = train_data_sub.clone().repeat(
        10, 1, 1)  ## 10x along the first axis
    print(train_data_sub.size())
    print(trainset_aug.train_data.size())

    trainset_aug.train_labels = train_labels_sub.clone().repeat(
        10)  ## only one axis
    print(train_labels_sub.size())
    print(trainset_aug.train_labels.size())
    ## Dims look correct!

    ## Load scipy image tools and seed the RNG used for sampling.
    from scipy import ndimage
    from scipy import stats
    import random
    random.seed(1337)

    ## Iterate through the first (unmodified) copy and overwrite the other
    ## nine copies with augmented versions: three zooms, three
    ## translations, three rotations per image.
    n = trainset_aug.train_data.size()[0] // 10  # integer division, or range() fails
    print(n)
    for i in range(n):
        image_in = trainset_aug.train_data[i]
        trainset_aug.train_data[i + n] = zoom_image(image_in)
        trainset_aug.train_data[i + (n * 2)] = zoom_image(image_in)
        trainset_aug.train_data[i + (n * 3)] = zoom_image(image_in)
        trainset_aug.train_data[i + (n * 4)] = translate_image(image_in)
        trainset_aug.train_data[i + (n * 5)] = translate_image(image_in)
        trainset_aug.train_data[i + (n * 6)] = translate_image(image_in)
        trainset_aug.train_data[i + (n * 7)] = rotate_image(image_in)
        trainset_aug.train_data[i + (n * 8)] = rotate_image(image_in)
        trainset_aug.train_data[i + (n * 9)] = rotate_image(image_in)

    ## Dump to pickle.
    pickle.dump(trainset_aug, open("train_labeled_aug.p", "wb"))

    # Validation set
    validset_np = trainset_original.train_data.numpy()
    validset_label_np = trainset_original.train_labels.numpy()
    valid_data_sub = torch.from_numpy(validset_np[valid_label_index])
    valid_labels_sub = torch.from_numpy(validset_label_np[valid_label_index])

    validset = subMNIST(root='./data', train=False, download=True,
                        transform=transform, k=10000)
    validset.test_data = valid_data_sub.clone()
    validset.test_labels = valid_labels_sub.clone()
    pickle.dump(validset, open("validation.p", "wb"))

    # Unlabeled data: everything not in the labeled or validation splits.
    train_unlabel_index = []
    for i in range(60000):
        if i not in train_label_index and i not in valid_label_index:
            train_unlabel_index.append(i)
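
# zoom_image, translate_image and rotate_image are used in dataLoader()
# but not defined here. A sketch of plausible implementations with
# scipy.ndimage, assuming 28x28 ByteTensors in and out; the parameter
# ranges are illustrative guesses.
import random
import numpy as np
import torch
from scipy import ndimage


def zoom_image(img):
    # Zoom around the center, then crop or pad back to 28x28.
    arr = img.numpy()
    factor = random.uniform(0.9, 1.1)
    zoomed = ndimage.zoom(arr, factor)
    h, w = zoomed.shape
    if factor >= 1.0:  # crop the center
        top, left = (h - 28) // 2, (w - 28) // 2
        out = zoomed[top:top + 28, left:left + 28]
    else:              # pad around the center
        out = np.zeros((28, 28), dtype=arr.dtype)
        top, left = (28 - h) // 2, (28 - w) // 2
        out[top:top + h, left:left + w] = zoomed
    return torch.from_numpy(np.ascontiguousarray(out))


def translate_image(img):
    # Shift by up to 3 pixels in each direction, filling with background.
    dx, dy = random.randint(-3, 3), random.randint(-3, 3)
    return torch.from_numpy(ndimage.shift(img.numpy(), (dy, dx), cval=0))


def rotate_image(img):
    # Rotate by a random angle, keeping the original 28x28 shape.
    angle = random.uniform(-20, 20)
    return torch.from_numpy(ndimage.rotate(img.numpy(), angle, reshape=False))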
trainset_np = trainset_original.train_data.numpy()
trainset_label_np = trainset_original.train_labels.numpy()
train_data_sub_unl = torch.from_numpy(trainset_np[train_unlabel_index])
# train_labels_sub_unl = torch.from_numpy(trainset_label_np[train_unlabel_index])

# Mark the 47000 unlabeled samples with a -1 dummy label (int64, so the
# resulting tensor is a LongTensor like the real labels).
train_labels_sub_unl = torch.from_numpy(np.full(47000, -1, dtype=np.int64))

trainset_new_unl = subMNIST(root='./data', train=True, download=True,
                            transform=transform, k=47000)
trainset_new_unl.train_data = train_data_sub_unl.clone()
trainset_new_unl.train_labels = train_labels_sub_unl.clone()
pickle.dump(trainset_new_unl, open("train_unlabeled.p", "wb"))
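
# A usage sketch: load the pickled splits back and wrap them in
# DataLoaders (the batch size here is arbitrary).
def load_split_loaders():
    trainset_labeled = pickle.load(open("train_labeled.p", "rb"))
    validset = pickle.load(open("validation.p", "rb"))
    trainset_unlabeled = pickle.load(open("train_unlabeled.p", "rb"))
    train_loader = torch.utils.data.DataLoader(trainset_labeled,
                                               batch_size=64, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(validset,
                                               batch_size=64, shuffle=False)
    unlabeled_loader = torch.utils.data.DataLoader(trainset_unlabeled,
                                                   batch_size=64,
                                                   shuffle=False)
    return train_loader, valid_loader, unlabeled_loader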
def split_dataset(trainset_original, n_train_labels_pc=10,
                  n_train_unlabeled_pc=None, n_validation_pc=1000):
    """
    Split a dataset into labeled training, unlabeled training and
    validation subsets.

    Parameters
    ----------
    trainset_original : torch.utils.data.Dataset
        A dataset object as defined by torch.
    n_train_labels_pc : int
        Number of labeled samples per class to use for training.
    n_train_unlabeled_pc : int
        Number of unlabeled samples per class to use for training.
        If None, all remaining samples of each class are used.
    n_validation_pc : int
        Number of labeled samples per class to use for validation.

    Returns
    -------
    trainset_new, trainset_new_unl, validset : torch.utils.data.Dataset
    """
    train_label_index = []
    train_unlabel_index = []
    valid_label_index = []
    classes = np.unique(trainset_original.train_labels.numpy())
    n_classes = len(classes)

    for i in range(n_classes):
        train_label_list = trainset_original.train_labels.numpy()
        label_index = np.where(train_label_list == i)[0]
        n_class_samples = len(label_index)

        n_tv = n_train_labels_pc + n_validation_pc
        if n_train_unlabeled_pc is not None:
            n_tv += n_train_unlabeled_pc
        if n_tv > n_class_samples:
            raise ValueError('Class {} does not have enough samples ({}) to '
                             'split into training labeled, training unlabeled '
                             'and validation sets'.format(classes[i],
                                                          n_class_samples))

        label_subindex = list(label_index[:n_train_labels_pc])
        ind_end = n_train_labels_pc + n_validation_pc
        valid_subindex = list(label_index[n_train_labels_pc:ind_end])
        ind_start = ind_end
        if n_train_unlabeled_pc is not None:
            ind_end += n_train_unlabeled_pc  # was `+= n_train_labels_pc`, a bug
        else:
            ind_end = n_class_samples
        unlabel_subindex = list(label_index[ind_start:ind_end])

        train_label_index += label_subindex
        valid_label_index += valid_subindex
        train_unlabel_index += unlabel_subindex

    trainset_np = trainset_original.train_data.numpy()
    trainset_label_np = trainset_original.train_labels.numpy()
    train_data_sub = torch.from_numpy(trainset_np[train_label_index])
    train_labels_sub = torch.from_numpy(trainset_label_np[train_label_index])

    trainset_new = subMNIST(root='./../data', train=True, download=True,
                            transform=mnist_transform,
                            k=n_train_labels_pc * n_classes)
    trainset_new.train_data = train_data_sub.clone()
    trainset_new.train_labels = train_labels_sub.clone()
    # pickle.dump(trainset_new, open("./../data/train_labeled.p", "wb"))

    validset_np = trainset_original.train_data.numpy()
    validset_label_np = trainset_original.train_labels.numpy()
    valid_data_sub = torch.from_numpy(validset_np[valid_label_index])
    valid_labels_sub = torch.from_numpy(validset_label_np[valid_label_index])

    validset = subMNIST(root='./../data', train=False, download=True,
                        transform=mnist_transform,
                        k=n_validation_pc * n_classes)
    validset.test_data = valid_data_sub.clone()
    validset.test_labels = valid_labels_sub.clone()
    # pickle.dump(validset, open("./../data/validation.p", "wb"))

    n_unlabeled_set = len(train_unlabel_index)
    trainset_np = trainset_original.train_data.numpy()
    train_data_sub_unl = torch.from_numpy(trainset_np[train_unlabel_index])

    trainset_new_unl = subMNIST(root='./../data', train=True, download=True,
                                transform=mnist_transform, k=n_unlabeled_set)
    trainset_new_unl.train_data = train_data_sub_unl.clone()
    trainset_new_unl.train_labels = None  # unlabeled
    # pickle.dump(trainset_new_unl, open("./../data/train_unlabeled.p", "wb"))

    return trainset_new, trainset_new_unl, validset
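
# A usage sketch for split_dataset on MNIST; mnist_transform is assumed
# to be the same ToTensor + Normalize((0.1307,), (0.3081,)) pipeline used
# elsewhere in this file.
def make_mnist_splits():
    raw = datasets.MNIST('./../data', train=True, download=True,
                         transform=mnist_transform)
    trainset, unlabeled, validset = split_dataset(raw,
                                                  n_train_labels_pc=10,
                                                  n_validation_pc=1000)
    return trainset, unlabeled, validset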
train_label_index = []
valid_label_index = []
for i in range(10):
    train_label_list = trainset_original.train_labels.numpy()
    label_index = np.where(train_label_list == i)[0]
    label_subindex = list(label_index[:300])
    valid_subindex = list(label_index[300:1000 + 300])
    train_label_index += label_subindex
    valid_label_index += valid_subindex

trainset_np = trainset_original.train_data.numpy()
trainset_label_np = trainset_original.train_labels.numpy()
train_data_sub = torch.from_numpy(trainset_np[train_label_index])
train_labels_sub = torch.from_numpy(trainset_label_np[train_label_index])

trainset_new = subMNIST(root='data', train=True, download=True,
                        transform=transform, k=3000)
trainset_new.data = train_data_sub.clone()
trainset_new.targets = train_labels_sub.clone()
pickle.dump(trainset_new, open("data/train_labeled.p", "wb"))

validset_np = trainset_original.train_data.numpy()
validset_label_np = trainset_original.train_labels.numpy()
valid_data_sub = torch.from_numpy(validset_np[valid_label_index])
valid_labels_sub = torch.from_numpy(validset_label_np[valid_label_index])

validset = subMNIST(root='data', train=False, download=True,
                    transform=transform, k=10000)
validset.data = valid_data_sub.clone()
validset.targets = valid_labels_sub.clone()
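
# subMNIST is used throughout but defined elsewhere; it is presumably a
# thin datasets.MNIST subclass whose reported length is capped at k,
# roughly as below (a sketch; the real class may differ). Note that this
# snippet assigns .data/.targets while the earlier ones assign
# .train_data/.train_labels: newer torchvision versions renamed the
# attributes and keep the old names only as deprecated aliases.
class subMNIST(datasets.MNIST):
    def __init__(self, root, train=True, transform=None,
                 target_transform=None, download=False, k=3000):
        super(subMNIST, self).__init__(root, train=train,
                                       transform=transform,
                                       target_transform=target_transform,
                                       download=download)
        self.k = k

    def __len__(self):
        return self.k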
def all_transformations(dataset_pickle, data_type,
                        type_transformation=['rotation'],
                        value_rotation=45.0, value_scale=0.8,
                        distance_translation=0.1,
                        direction_translation='right',
                        horizontal_translation=True,
                        elastic_alpha=34, elastic_sigma=4):
    '''
    Takes pickled data (train or train_unlabel) and pre-processes it with
    up to four data-augmentation techniques. The default values of each
    transformation can be overridden.

    INPUT: dataset_pickle (pickled pytorch dataset),
           data_type = 'train' or 'train_unlabel',
           type_transformation = ['rotation'], ['scale'], ['translation'],
           ['elastic'], any combination (e.g. ['rotation', 'scale']), or
           ['all'] (rotation + scale + translation).
    RETURNS: transformed data, ready to load with
             torch.utils.data.DataLoader
    '''
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    dataset_loader_numpy = dataset_pickle.train_data.numpy()
    dataset_label_numpy = dataset_pickle.train_labels.numpy()
    dataset_import = subMNIST(root='./data', train=True, transform=transform,
                              download=True,
                              k=dataset_pickle.train_data.size()[0])

    # Apply the requested transformations in a fixed order:
    # rotation -> scale -> translation -> elastic. 'all' is shorthand for
    # rotation + scale + translation (without elastic). This replaces the
    # original chain of one branch per combination and behaves identically
    # for every combination that chain handled.
    ops = set(type_transformation)
    if 'all' in ops:
        ops = {'rotation', 'scale', 'translation'}

    transformed = dataset_loader_numpy
    if 'rotation' in ops:
        transformed = np.array([rotation(x, value_rotation)
                                for x in transformed])
    if 'scale' in ops:
        transformed = np.array([scale(x, value_scale) for x in transformed])
    if 'translation' in ops:
        transformed = np.array([
            translation(x, distance_translation,
                        direction=direction_translation,
                        horizontal=horizontal_translation)
            for x in transformed
        ])
    if 'elastic' in ops:
        transformed = np.array([
            elastic_transform(x, elastic_alpha, elastic_sigma)
            for x in transformed
        ])

    dataset_loader_preprocessed = torch.from_numpy(transformed)
    dataset_loader_preprocessed2 = dataset_loader_preprocessed.type(
        torch.ByteTensor)

    if data_type == 'train':
        print("TRAIN TYPE")
        dataset_import.train_data = dataset_loader_preprocessed2.clone()
        dataset_import.train_labels = torch.from_numpy(
            dataset_label_numpy).clone()
    if data_type == 'train_unlabel':
        dataset_import.train_data = dataset_loader_preprocessed2.clone()
        dataset_import.train_labels = torch.from_numpy(
            np.repeat(-1, dataset_import.train_data.size()[0])).clone()

    return dataset_import
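
# A usage sketch for all_transformations, starting from a pickled labeled
# set such as the one produced above; the file path and batch size are
# illustrative.
def make_rotated_loader():
    trainset = pickle.load(open("data/train_labeled.p", "rb"))
    rotated = all_transformations(trainset, 'train',
                                  type_transformation=['rotation'],
                                  value_rotation=30.0)
    return torch.utils.data.DataLoader(rotated, batch_size=64, shuffle=True)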