from os.path import join

from torchvision import transforms
from torchvision.datasets import EMNIST, KMNIST


def get_single_task(dataroot, task):
    tf = transforms.ToTensor()
    if task.startswith('EMNIST'):
        # Task strings look like 'EMNIST/<split>', e.g. 'EMNIST/letters'.
        split = task.split('/', maxsplit=2)[1]
        dataroot = join(dataroot, 'emnist')
        # EMNIST 'letters' labels are 1-indexed, so shift them down to 0-25.
        tf_target = (lambda x: x - 1) if split == 'letters' else None
        output_size = 26 if split == 'letters' else 10
        trainset = EMNIST(dataroot, split=split, train=True, transform=tf,
                          target_transform=tf_target)
        # stratified_subset is a project-local helper that draws a class-stratified subset.
        trainset = stratified_subset(trainset, trainset.targets.tolist(), 500)
        testset = EMNIST(dataroot, split=split, train=False, transform=tf,
                         target_transform=tf_target)
    elif task == 'KMNIST':
        dataroot = join(dataroot, 'kmnist')
        output_size = 10
        trainset = KMNIST(dataroot, train=True, transform=tf)
        trainset = stratified_subset(trainset, trainset.targets.tolist(), 500)
        testset = KMNIST(dataroot, train=False, transform=tf)
    else:
        raise ValueError(task)
    return trainset, testset, output_size
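# Usage sketch (hypothetical dataroot; assumes the EMNIST/KMNIST files already
# exist under it, since these constructors don't pass download=True, and that
# stratified_subset is importable from the surrounding project):
# trainset, testset, output_size = get_single_task('./data', 'EMNIST/letters')
# trainset, testset, output_size = get_single_task('./data', 'KMNIST')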
from torch import float32, zeros
from torchvision.datasets import KMNIST
from torchvision.transforms import Compose, ConvertImageDtype, Normalize


def load_kmnist_data():
    transforms = Compose([
        ConvertImageDtype(float32),  # rescales uint8 [0, 255] to float [0, 1]
        Normalize(zeros(28, 28) + 0.5, zeros(28, 28) + 0.5),
    ])
    train_set = KMNIST(root="./data/", download=True, train=True, transform=transforms)
    test_set = KMNIST(root="./data/", train=False, download=True, transform=transforms)
    # Apply the pipeline to the raw uint8 tensors directly, then add a
    # channel dimension: (N, 28, 28) -> (N, 1, 28, 28).
    train_data = transforms(train_set.data)
    test_data = transforms(test_set.data)
    return (
        train_data.unsqueeze(1),
        test_data.unsqueeze(1),
        train_set.targets,
        test_set.targets,
        train_set.classes,
    )
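# Sketch of the returned values (assumes the download above succeeds):
# train_x (60000, 1, 28, 28) and test_x (10000, 1, 28, 28), float32 roughly
# in [-1, 1] after normalization; targets are int64 class indices 0-9.
if __name__ == '__main__':
    train_x, test_x, train_y, test_y, classes = load_kmnist_data()
    print(train_x.shape, test_x.shape, len(classes))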
from torchvision import transforms


def kmnist(root):
    from torchvision.datasets import KMNIST
    transform = transforms.Compose([
        lambda x: x.convert("RGB"),  # replicate the grayscale image to 3 channels
        transforms.Resize(224),      # upscale 28x28 to an ImageNet-style input size
        transforms.ToTensor(),
    ])
    trainset = KMNIST(root, train=True, transform=transform, download=True)
    testset = KMNIST(root, train=False, transform=transform)
    return trainset, testset
@classmethod
def create(cls, args):
    trainset = KMNIST(args.dataset_root, train=True,
                      transform=cls.get_transforms(args, True), download=True)
    testset = KMNIST(args.dataset_root, train=False,
                     transform=cls.get_transforms(args, False), download=True)
    # Carve a validation split out of the training set.
    train, valid = _split(args, trainset)
    return train, valid, testset
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import KMNIST


def get_loaders_kmnist(train_translation_rotation_list,
                       test_translation_rotation_list, batch_size):
    """Load the KMNIST dataset.

    The data is divided by 255, then the mean is subtracted and the result
    is divided by the standard deviation.
    """
    train_loaders_desc = []
    test_loaders_desc = []
    for (translation, rotation) in train_translation_rotation_list:
        # Encode the augmentation parameters in the name, for identifying in logs.
        train_desc = ('train_' + str(translation[0]) + '_' + str(translation[1])
                      + '_' + str(rotation) + '_' + 'KMNIST')
        train_transform = transforms.Compose([
            # transforms.Resize(31),
            transforms.RandomAffine(rotation, translation, (0.9, 1.1)),
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,)),
        ])
        # DATAPATH is a module-level constant defined elsewhere in the project.
        training_set = KMNIST(DATAPATH, train=True, download=True,
                              transform=train_transform)
        training_data_loader = DataLoader(training_set, batch_size=batch_size,
                                          shuffle=True)
        train_loaders_desc.append((training_data_loader, train_desc))
    for (translation, rotation) in test_translation_rotation_list:
        test_desc = ('test_' + str(translation[0]) + '_' + str(translation[1])
                     + '_' + str(rotation) + '_' + 'KMNIST')
        test_transform = transforms.Compose([
            # transforms.Resize(31),
            transforms.RandomAffine(rotation, translation),
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,)),
        ])
        testing_set = KMNIST(DATAPATH, train=False, download=True,
                             transform=test_transform)
        testing_data_loader = DataLoader(testing_set, batch_size=batch_size,
                                         shuffle=True)
        test_loaders_desc.append((testing_data_loader, test_desc))
    return train_loaders_desc, test_loaders_desc
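# Hypothetical usage sketch: each list entry pairs a (max_dx, max_dy)
# translation fraction with a max rotation in degrees, matching torchvision's
# RandomAffine(degrees, translate) arguments as used above.
# train_loaders, test_loaders = get_loaders_kmnist(
#     train_translation_rotation_list=[((0.1, 0.1), 15)],
#     test_translation_rotation_list=[((0.0, 0.0), 0)],
#     batch_size=64,
# )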
from torch.utils.data import DataLoader
from torchvision.datasets import KMNIST
from torchvision.transforms import ToTensor


def get_kmnist_loaders(data_dir, b_sz, shuffle=True):
    """Helper function that deserializes KMNIST data and returns the relevant
    data loaders.

    params:
        data_dir: string - root directory where the data will be saved
        b_sz: integer - the batch size
        shuffle: boolean - whether to shuffle the training set or not
    """
    train_loader = DataLoader(KMNIST(data_dir, transform=ToTensor(), download=True),
                              shuffle=shuffle, batch_size=b_sz)
    test_loader = DataLoader(KMNIST(data_dir, train=False, transform=ToTensor(),
                                    download=True),
                             shuffle=False, batch_size=b_sz)
    return train_loader, test_loader
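# Minimal usage sketch (example directory and batch size; downloads on first call):
# train_loader, test_loader = get_kmnist_loaders('./data', b_sz=128)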
from typing import Tuple, Union

import torch
from torch.utils.data import DataLoader
from torchvision.datasets import KMNIST
from torchvision.transforms import ToTensor


def kmnist(root: str,
           batch_size: int = 32,
           workers: int = 6,
           splits: Union[str, Tuple[str]] = ('train', 'val')) -> LoaderTypes:
    """Wrapper for loading the `KMNIST` dataset.

    Args:
        root: The root directory where the dataset is stored.
            Usually ~/.torch/datasets.
        batch_size: The batch size.
        workers: The number of CPUs to use when loading the data from disk.
        splits: Which splits of the data to return. Possible values are
            `train`, `val` and `test`.

    Returns:
        A list of data loaders for the chosen splits.
    """
    loader_list = list()
    if 'train' in splits or 'val' in splits:
        train_val_set = KMNIST(root, train=True, download=True, transform=ToTensor())
        # Hold out 10,000 training images for validation.
        val_set, train_set = torch.utils.data.random_split(
            train_val_set, [10000, len(train_val_set) - 10000])
        if 'train' in splits:
            train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                                      num_workers=workers, pin_memory=True)
            loader_list.append(train_loader)
        if 'val' in splits:
            # Memory is a project-local caching wrapper; iterating once fills
            # the cache before it is switched on and pinned.
            val_set = Memory(val_set, img_size=28, channels=1)
            for _ in val_set:
                pass
            val_set.set_use_cache(True)
            val_set.pin_memory()
            loader_list.append(val_set)
    if 'test' in splits:
        test_set = KMNIST(root, train=False, download=True, transform=ToTensor())
        test_set = Memory(test_set, img_size=28, channels=1)
        for _ in test_set:
            pass
        test_set.set_use_cache(True)
        test_set.pin_memory()
        loader_list.append(test_set)
    if len(loader_list) == 1:
        return loader_list[0]
    return loader_list
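# Usage sketch (assumes the project-local Memory wrapper and LoaderTypes alias
# are importable; requesting a single split returns one object, not a list):
# train_loader, val_set = kmnist('~/.torch/datasets', batch_size=64)
# test_set = kmnist('~/.torch/datasets', splits=('test',))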
def _get_mnist(self, train, transform=None, download=False):
    # Despite the name, this helper returns the KMNIST drop-in replacement.
    return KMNIST(self.data_dir, train=train, transform=transform, download=download)
from torchvision.datasets import KMNIST

# Download KMNIST; without a transform, indexing returns a (PIL.Image, label) pair.
train = KMNIST('./kmnist_folder', train=True, download=True)
train
train[0]
image, label = train[0]
image
label

from PIL import Image

# Enlarge the 28x28 glyph for easier viewing. Image.ANTIALIAS was removed in
# Pillow 10; Image.LANCZOS is the same resampling filter.
new_image = image.resize((200, 200), Image.LANCZOS)
new_image

import torchvision.datasets as dsets
import torchvision.transforms as transforms
import os
import pwd

from torch.utils.data import TensorDataset
from torchvision.datasets import KMNIST

import properties as prop
from data.data_helpers import concat_datasets, split_dataset, stratified_split_dataset

DATA_PATH = pwd.getpwuid(os.getuid()).pw_dir + '/time_series_data/kMNIST'


def transform_data(data):
    # Add a channel dimension and rescale uint8 [0, 255] to float [0, 1].
    data = data.unsqueeze(1).float().div(255)
    return data


train_dataset = KMNIST(DATA_PATH, train=True, download=True)
trainX, trainy = transform_data(train_dataset.data), train_dataset.targets
# validation_dataset, train_dataset = split_dataset(TensorDataset(trainX, trainy), prop.VAL_SIZE)
# trainX, trainy = train_dataset.tensors[0], train_dataset.tensors[1]
train_dataset = TensorDataset(trainX, trainy)
# train_size = 2000
# train_dataset, _ = split_dataset(train_dataset, train_size)

# ---------------- test dataset ----------------
test_dataset = KMNIST(DATA_PATH, train=False, download=True)
testX, testy = transform_data(test_dataset.data), test_dataset.targets
test_dataset = TensorDataset(testX, testy)
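# Sketch: the TensorDatasets above can be consumed with a standard DataLoader
# (batch size is an arbitrary example value, not from the original module).
# from torch.utils.data import DataLoader
# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)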
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.datasets import (fetch_20newsgroups, fetch_kddcup99, make_blobs,
                              make_circles, make_moons)
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from torchvision.datasets import EMNIST, MNIST, USPS, FashionMNIST, KMNIST

# file_path and feature_tranformers are project-local (module directory and
# feature-encoding helpers, respectively).


def select_dataset(dataset_name, input_dim=2, n_samples=10000):
    """
    :param n_samples: number of points returned. If 0, all datapoints will be
        returned; for artificial data, 0 will throw an error.
    """
    if dataset_name == 'fmnist':
        f_mnist = FashionMNIST(root="./datasets", download=True)
        data = f_mnist.data.numpy()
        vec_data = np.reshape(data, (data.shape[0], -1))
        vec_data = np.float32(vec_data)
        labels = f_mnist.targets.numpy()
    elif dataset_name == 'emnist':
        f_mnist = EMNIST(root="./datasets", download=True, split='byclass')
        data = f_mnist.data.numpy()
        vec_data = np.reshape(data, (data.shape[0], -1))
        vec_data = np.float32(vec_data)
        labels = f_mnist.targets.numpy()
    elif dataset_name == 'kmnist':
        f_mnist = KMNIST(root="./datasets", download=True)
        data = f_mnist.data.numpy()
        vec_data = np.reshape(data, (data.shape[0], -1))
        vec_data = np.float32(vec_data)
        labels = f_mnist.targets.numpy()
    elif dataset_name == 'usps':
        f_mnist = USPS(root="./datasets", download=True)
        data = f_mnist.data
        vec_data = np.reshape(data, (data.shape[0], -1))
        vec_data = np.float32(vec_data)
        labels = np.float32(f_mnist.targets)
    elif dataset_name == 'news':
        newsgroups_train = fetch_20newsgroups(data_home='./datasets', subset='train',
                                              remove=('headers', 'footers', 'quotes'))
        vectorizer = TfidfVectorizer()
        vec_data = vectorizer.fit_transform(newsgroups_train.data).toarray()
        vec_data = np.float32(vec_data)
        labels = newsgroups_train.target
        labels = np.float32(labels)
    elif dataset_name == 'cover_type':
        file_name = file_path + "/datasets/covtype.data"
        train_data = np.array(pd.read_csv(file_name, sep=','))
        vec_data = np.float32(train_data[:, :-1])
        labels = np.float32(train_data[:, -1])
    elif dataset_name == 'char':
        digits = datasets.load_digits()
        n_samples = len(digits.images)  # note: overwrites the n_samples argument
        data = digits.images.reshape((n_samples, -1))
        vec_data = np.float32(data)
        labels = digits.target
    elif dataset_name == 'charx':
        file_name = file_path + "/datasets/char_x.npy"
        data = np.load(file_name, allow_pickle=True)
        vec_data, labels = data[0], data[1]
    elif dataset_name == 'kdd_cup':
        cover_train = fetch_kddcup99(data_home='./datasets', download_if_missing=True)
        vec_data = cover_train.data
        string_labels = cover_train.target
        vec_data, labels = feature_tranformers.vectorizer_kdd(data=vec_data,
                                                              labels=string_labels)
    elif dataset_name == 'aggregation':
        file_name = file_path + "/2d_data/Aggregation.csv"
        a = np.array(pd.read_csv(file_name, sep=';'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'compound':
        file_name = file_path + "/2d_data/Compound.txt"
        a = np.array(pd.read_csv(file_name, sep='\t'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'd31':
        file_name = file_path + "/2d_data/D31.txt"
        a = np.array(pd.read_csv(file_name, sep='\t'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'flame':
        file_name = file_path + "/2d_data/flame.txt"
        a = np.array(pd.read_csv(file_name, sep='\t'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'path_based':
        file_name = file_path + "/2d_data/pathbased.txt"
        a = np.array(pd.read_csv(file_name, sep='\t'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'r15':
        file_name = file_path + "/2d_data/R15.txt"
        a = np.array(pd.read_csv(file_name, sep='\t'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'spiral':
        file_name = file_path + "/2d_data/spiral.txt"
        a = np.array(pd.read_csv(file_name, sep='\t'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'birch1':
        file_name = file_path + "/2d_data/birch1.txt"
        a = np.array(pd.read_csv(file_name, delimiter=r"\s+"))
        vec_data = a[:, :]
        labels = np.ones((vec_data.shape[0]))
    elif dataset_name == 'birch2':
        file_name = file_path + "/2d_data/birch2.txt"
        a = np.array(pd.read_csv(file_name, delimiter=r"\s+"))
        vec_data = a[:, :]
        labels = np.ones((vec_data.shape[0]))
    elif dataset_name == 'birch3':
        file_name = file_path + "/2d_data/birch3.txt"
        a = np.array(pd.read_csv(file_name, delimiter=r"\s+"))
        vec_data = a[:, :]
        labels = np.ones((vec_data.shape[0]))
    elif dataset_name == 'worms':
        file_name = file_path + "/2d_data/worms/worms_2d.txt"
        a = np.array(pd.read_csv(file_name, sep=' '))
        vec_data = a[:, :]
        labels = np.ones((vec_data.shape[0]))
    elif dataset_name == 't48k':
        file_name = file_path + "/2d_data/t4.8k.txt"
        a = np.array(pd.read_csv(file_name, sep=' '))
        vec_data = a[1:, :]
        labels = np.ones((vec_data.shape[0]))
    elif dataset_name == 'moons':
        data, labels = make_moons(n_samples=5000)
        vec_data = np.float32(data)
        labels = np.float32(labels)
    elif dataset_name == 'circles':
        data, labels = make_circles(n_samples=5000)
        vec_data = np.float32(data)
        labels = np.float32(labels)
    elif dataset_name == 'blobs':
        data, labels = make_blobs(n_samples=n_samples, centers=3)
        vec_data = np.float32(data)
        labels = np.float32(labels)
    elif dataset_name == 'gmm':
        # Two well-separated isotropic Gaussians.
        mean_1 = np.zeros(input_dim)
        mean_2 = 100 * np.ones(input_dim)
        cov = np.eye(input_dim)
        data_1 = np.random.multivariate_normal(mean_1, cov, int(n_samples / 2))
        labels_1 = np.ones(int(n_samples / 2))
        labels_2 = 2 * np.ones(int(n_samples / 2))
        data_2 = np.random.multivariate_normal(mean_2, cov, int(n_samples / 2))
        vec_data = np.concatenate([data_1, data_2], axis=0)
        labels = np.concatenate([labels_1, labels_2], axis=0)
    elif dataset_name == 'uniform':
        vec_data = np.random.uniform(0, 1, size=(n_samples, input_dim)) * 10
        labels = np.ones(n_samples)
    elif dataset_name == 'mnist_pc':
        d_mnist = MNIST(root="./datasets", download=True)
        mnist = d_mnist.data.numpy()
        data = np.float32(np.reshape(mnist, (mnist.shape[0], -1)))
        pca_data = PCA(n_components=input_dim).fit_transform(data)
        n_indices = np.random.randint(pca_data.shape[0], size=n_samples)
        vec_data = pca_data[n_indices]
        labels = d_mnist.targets.numpy()[n_indices]
    elif dataset_name == 'usps_pc':
        d_mnist = USPS(root="./datasets", download=True)
        mnist = d_mnist.data
        data = np.float32(np.reshape(mnist, (mnist.shape[0], -1)))
        pca_data = PCA(n_components=input_dim).fit_transform(data)
        n_indices = np.random.randint(pca_data.shape[0], size=n_samples)
        vec_data = pca_data[n_indices]
        # Index the labels with the same subsample so they stay aligned with vec_data.
        labels = np.float32(d_mnist.targets)[n_indices]
    elif dataset_name == 'char_pc':
        digits = datasets.load_digits()
        n_samples = len(digits.images)
        data = digits.images.reshape((n_samples, -1))
        data = np.float32(data)
        targets = digits.target
        pca_data = PCA(n_components=input_dim).fit_transform(data)
        n_indices = np.random.randint(pca_data.shape[0], size=n_samples)
        vec_data = pca_data[n_indices]
        labels = targets[n_indices]
    else:
        # Default to MNIST for unrecognized names.
        d_mnist = MNIST(root="./datasets", download=True)
        data = d_mnist.data.numpy()
        vec_data = np.reshape(data, (data.shape[0], -1))
        vec_data = np.float32(vec_data)
        labels = d_mnist.targets.numpy()

    if 0 < n_samples < vec_data.shape[0]:
        # Subsample without replacement.
        rand_indices = np.random.choice(vec_data.shape[0], size=(n_samples,),
                                        replace=False)
        return vec_data[rand_indices], labels[rand_indices]
    else:
        return vec_data, labels
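# Usage sketch (dataset names as handled above; 'kmnist' returns flattened
# 28x28 images, so each row has 784 features):
# vec_data, labels = select_dataset('kmnist', n_samples=10000)
# assert vec_data.shape == (10000, 784)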
fmnist_train_dataset = FashionMNIST(
    DATA_DIR,
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.Pad(2),  # pad 28x28 images to 32x32
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]),
)
fmnist_test_dataset = FashionMNIST(
    DATA_DIR,
    train=False,
    transform=transforms.Compose([
        transforms.Pad(2),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]),
)
kmnist_train_dataset = KMNIST(
    DATA_DIR,
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.Pad(2),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]),
)
kmnist_test_dataset = KMNIST(
    DATA_DIR,
    train=False,
    download=True,
    transform=transforms.Compose([
        transforms.Pad(2),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]),
)
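# Sketch: wrap the padded datasets in loaders (batch size is an example value,
# not from the original module).
# from torch.utils.data import DataLoader
# kmnist_train_loader = DataLoader(kmnist_train_dataset, batch_size=128, shuffle=True)
# kmnist_test_loader = DataLoader(kmnist_test_dataset, batch_size=128, shuffle=False)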
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import KMNIST

# Create an instance of the model to train.
model = MLP()

# -----------------------------------------------------------------------------
# Prepare the training data.
# print('---------- preparing the training data ----------')
data_folder = '~/data'
transform = transforms.Compose([
    # Convert the data to Tensor type.
    transforms.ToTensor()
])

# Training data
train_data_with_labels = KMNIST(data_folder, train=True, download=True,
                                transform=transform)
train_data_loader = DataLoader(train_data_with_labels, batch_size=BATCH_SIZE,
                               shuffle=True)

# Validation data
test_data_with_labels = KMNIST(data_folder, train=False, download=True,
                               transform=transform)
test_data_loader = DataLoader(test_data_with_labels, batch_size=BATCH_SIZE,
                              shuffle=True)
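# A minimal training-loop sketch over the loaders above. Assumptions: MLP
# accepts the (B, 1, 28, 28) batches (e.g. flattens internally) and outputs
# 10-class logits; the optimizer, learning rate, and epoch count are example
# choices, not taken from the original script.
import torch.nn as nn
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(2):
    for images, labels in train_data_loader:
        optimizer.zero_grad()
        loss = criterion(model(images), labels)  # forward pass + loss
        loss.backward()                          # backpropagate
        optimizer.step()                         # update weights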