def read_data_ml100k():
    """Load the MovieLens 100K ratings table.

    Obtains the data directory from ``d2l.download_extract('ml-100k')`` and
    parses the ``u.data`` file inside it as tab-separated values. Because
    explicit column names are supplied, every row of the file is treated as
    data (no header row is consumed).

    Returns:
        A ``(data, num_users, num_items)`` tuple: the ratings ``DataFrame``
        with columns ``user_id``, ``item_id``, ``rating`` and ``timestamp``,
        followed by the number of distinct user ids and distinct item ids.
    """
    data_dir = d2l.download_extract('ml-100k')
    names = ['user_id', 'item_id', 'rating', 'timestamp']
    # `sep` must be passed by keyword: pandas >= 2.0 made every argument
    # after the file path keyword-only, so a positional '\t' raises TypeError.
    data = pd.read_csv(os.path.join(data_dir, 'u.data'), sep='\t',
                       names=names, engine='python')
    num_users = data.user_id.unique().shape[0]
    num_items = data.item_id.unique().shape[0]
    return data, num_users, num_items
def load_data_wiki(batch_size, max_len):
    """Build a shuffled training iterator over the WikiText-2 paragraphs.

    Args:
        batch_size: Batch size handed to ``gluon.data.DataLoader``.
        max_len: Forwarded to ``_WikiTextDataset`` together with the
            paragraphs read from disk.

    Returns:
        A ``(train_iter, vocab)`` pair, where ``vocab`` is the ``vocab``
        attribute of the dataset backing the iterator.
    """
    worker_count = d2l.get_dataloader_workers()
    wiki_dir = d2l.download_extract('wikitext-2', 'wikitext-2')
    dataset = _WikiTextDataset(_read_wiki(wiki_dir), max_len)
    loader = gluon.data.DataLoader(
        dataset,
        batch_size,
        shuffle=True,
        num_workers=worker_count,
    )
    return loader, dataset.vocab
def read_data_ml100k(dir=None):
    """Load the MovieLens 100K ratings table and optionally export it as CSV.

    Obtains the data directory from ``d2l.download_extract('ml-100k')`` and
    parses the ``u.data`` file inside it as tab-separated values. Because
    explicit column names are supplied, every row of the file is treated as
    data (no header row is consumed).

    Args:
        dir: Optional output folder. When given, the folder is created if
            missing and the table is written to ``<dir>/MovieLens.csv``.
            (The name shadows the ``dir`` builtin; it is kept because it is
            part of the public signature.)

    Returns:
        A ``(data, num_users, num_items)`` tuple: the ratings ``DataFrame``
        with columns ``user_id``, ``item_id``, ``rating`` and ``timestamp``,
        followed by the number of distinct user ids and distinct item ids.
    """
    data_dir = d2l.download_extract('ml-100k')
    names = ['user_id', 'item_id', 'rating', 'timestamp']
    # `sep` must be passed by keyword: pandas >= 2.0 made every argument
    # after the file path keyword-only, so a positional '\t' raises TypeError.
    data = pd.read_csv(os.path.join(data_dir, 'u.data'), sep='\t',
                       names=names, engine='python')
    num_users = data.user_id.unique().shape[0]
    num_items = data.item_id.unique().shape[0]
    if dir is not None:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(dir, exist_ok=True)
        # Join instead of string concatenation so the file is placed inside
        # `dir` even when the caller omits the trailing path separator.
        data.to_csv(os.path.join(dir, 'MovieLens.csv'))
    return data, num_users, num_items
def read_data_ml25m(data_path, download=False):
    """Load a MovieLens-25M style ``ratings.csv`` file.

    Args:
        data_path: Directory containing ``ratings.csv``. Ignored when
            ``download`` is truthy.
        download: When truthy, the directory is obtained from
            ``d2l.download_extract('ml-25m.zip')`` instead of ``data_path``.

    Returns:
        A ``(data, num_users, num_items)`` tuple: the ratings ``DataFrame``
        with columns ``user_id``, ``item_id``, ``rating`` and ``timestamp``,
        followed by the number of distinct user ids and distinct item ids.
    """
    data_dir = d2l.download_extract('ml-25m.zip') if download else data_path
    ratings_path = os.path.join(data_dir, 'ratings.csv')
    names = ['user_id', 'item_id', 'rating', 'timestamp']

    # Passing `names` makes pandas treat every row as data, so a textual
    # header line (as shipped in the official ratings.csv) would otherwise be
    # counted as an extra user/item and force all columns to object dtype.
    # Peek at the first field: a non-numeric value means a header is present.
    # utf-8-sig transparently drops a leading byte-order mark, if any.
    with open(ratings_path, encoding='utf-8-sig') as f:
        first_field = f.readline().split(',', 1)[0].strip()
    has_header = not first_field.isdigit()

    # `sep` and friends must be keywords: pandas >= 2.0 rejects positional
    # arguments after the file path.
    data = pd.read_csv(ratings_path, sep=',', names=names,
                       header=0 if has_header else None, engine='python')
    num_users = data.user_id.unique().shape[0]
    num_items = data.item_id.unique().shape[0]
    return data, num_users, num_items
def read_data_bananas(is_train=True):
    """Read the bananas dataset images and labels.

    Args:
        is_train: Selects the ``bananas_train`` sub-folder when true and
            ``bananas_val`` otherwise.

    Returns:
        A pair ``(images, targets)``. ``images`` is a list with one entry per
        row of ``label.csv``, loaded via ``image.imread``. ``targets`` is the
        array of the remaining CSV columns with an extra axis inserted at
        position 1 and every value divided by 256.
    """
    data_dir = d2l.download_extract('banana-detection')
    # Both the label file and the image folder live under the same split dir.
    split_dir = os.path.join(
        data_dir, 'bananas_train' if is_train else 'bananas_val')
    label_table = pd.read_csv(os.path.join(split_dir, 'label.csv'))
    label_table = label_table.set_index('img_name')

    images, targets = [], []
    for img_name, target in label_table.iterrows():
        img_path = os.path.join(split_dir, 'images', f'{img_name}')
        images.append(image.imread(img_path))
        # Since all images have same object class i.e. category '0',
        # the `label` column corresponds to the only object i.e. banana
        # The target is as follows : (`label`, `xmin`, `ymin`, `xmax`, `ymax`)
        targets.append(list(target))

    stacked = np.expand_dims(np.array(targets), axis=1)
    return images, stacked / 256
self.embedding = nn.Embedding(num_inputs, num_factors) self.fc = nn.Embedding(num_inputs, 1) self.linear_layer = nn.Dense(1, use_bias=True) def forward(self, x): square_of_sum = np.sum(self.embedding(x), axis=1)**2 sum_of_square = np.sum(self.embedding(x)**2, axis=1) x = self.linear_layer(self.fc(x).sum(1)) \ + 0.5 * (square_of_sum - sum_of_square).sum(1, keepdims=True) x = npx.sigmoid(x) return x # %% batch_size = 2048 data_dir = d2l.download_extract('ctr') train_data = d2l.CTRDataset(os.path.join(data_dir, 'train.csv')) test_data = d2l.CTRDataset(os.path.join(data_dir, 'test.csv'), feat_mapper=train_data.feat_mapper, defaults=train_data.defaults) train_iter = gluon.data.DataLoader( train_data, shuffle=True, last_batch='rollover', batch_size=batch_size, # num_workers=d2l.get_dataloader_workers()) num_workers=1) test_iter = gluon.data.DataLoader( test_data, shuffle=False, last_batch='rollover',
# Move the parameters onto the training devices and train with SGD.
net.collect_params().reset_ctx(devices)
trainer = gluon.Trainer(
    net.collect_params(),
    'sgd',
    {'learning_rate': lr, 'wd': wd},
)
d2l.train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices)


def predict(img):
    """Run ``net`` on one image and return the per-position argmax.

    The image is normalized with the test dataset's ``normalize_image``,
    its axes are permuted with ``transpose(2, 0, 1)`` (presumably
    height-width-channel to channel-first; confirm against the dataset), and
    a leading axis of size one is added before the forward pass on
    ``devices[0]``.

    Returns:
        The argmax over axis 1 of the network output, reshaped to the
        output's last two dimensions.
    """
    normalized = test_iter._dataset.normalize_image(img)
    batch = np.expand_dims(normalized.transpose(2, 0, 1), axis=0)
    class_ids = net(batch.as_in_ctx(devices[0])).argmax(axis=1)
    return class_ids.reshape(class_ids.shape[1], class_ids.shape[2])


def label2image(pred):
    """Look up the ``d2l.VOC_COLORMAP`` row for every index in ``pred``."""
    palette = np.array(d2l.VOC_COLORMAP, ctx=devices[0], dtype='uint8')
    indices = pred.astype('int32')
    return palette[indices, :]


voc_dir = d2l.download_extract('voc2012', 'VOCdevkit/VOC2012')
test_images, test_labels = d2l.read_voc_images(voc_dir, False)
n, imgs = 4, []
for i in range(n):
    crop_rect = (0, 0, 480, 320)
    X = image.fixed_crop(test_images[i], *crop_rect)
    pred = label2image(predict(X))
    # Stored as (input, prediction, label) triples ...
    imgs += [X, pred, image.fixed_crop(test_labels[i], *crop_rect)]
# ... and regrouped so each kind forms one row of the 3 x n grid.
d2l.show_images(imgs[::3] + imgs[1::3] + imgs[2::3], 3, n, scale=2)
def read_data_ml100k():
    """Invoke ``d2l.download_extract`` for the ``'ml-100k'`` entry.

    The helper's return value is discarded, so this function itself returns
    ``None``.
    """
    # NOTE(review): presumably this downloads and unpacks the MovieLens 100K
    # archive as a side effect — confirm against d2l.download_extract.
    d2l.download_extract('ml-100k')
You may have noticed that the above structure is quite similar to that of the CIFAR-10 competition in :numref:`sec_kaggle_cifar10`, where folders `train/` and `test/` contain training and testing dog images respectively, and `labels.csv` has the labels for the training images. Similarly, to make it easier to get started, we provide a small-scale sample of the dataset mentioned above, "train_valid_test_tiny.zip". If you are going to use the full dataset for the Kaggle competition, you will also need to change the `demo` variable below to `False`. """ #@save d2l.DATA_HUB['dog_tiny'] = (d2l.DATA_URL + 'kaggle_dog_tiny.zip', '0cb91d09b814ecdc07b50f31f8dcad3e81d6a86d') # If you use the full dataset downloaded for the Kaggle competition, change # the variable below to False demo = True if demo: data_dir = d2l.download_extract('dog_tiny') else: data_dir = os.path.join('..', 'data', 'dog-breed-identification') """### Organizing the Dataset We can organize the dataset similarly to what we did in :numref:`sec_kaggle_cifar10`, namely separating a validation set from the training set, and moving images into subfolders grouped by labels. The `reorg_dog_data` function below is used to read the training data labels, segment the validation set, and organize the training set. """ def reorg_dog_data(data_dir, valid_ratio): labels = d2l.read_csv_labels(os.path.join(data_dir, 'labels.csv')) d2l.reorg_train_valid(data_dir, labels, valid_ratio) d2l.reorg_test(data_dir)
import pandas as pd
import shutil
import time

npx.set_np()

#@save
d2l.DATA_HUB['cifar10_tiny'] = (d2l.DATA_URL + 'kaggle_cifar10_tiny.zip',
                                '2068874e4b9a9f0fb07ebe0ad2b29754449ccacd')

# If you use the full dataset downloaded for the Kaggle competition, set
# `demo` to False
demo = False

# The tiny sample is fetched on demand; the full dataset is expected to be
# present locally already.
data_dir = d2l.download_extract('cifar10_tiny') if demo else '../data/cifar-10/'


#@save
def read_csv_labels(fname):
    """Read fname to return a name to label dictionary.

    The first line of the file is skipped as a column header. Every other
    line must split on ',' into exactly two fields (name, label); otherwise
    unpacking raises ``ValueError``.
    """
    with open(fname, 'r') as f:
        # Skip the file header line (column name)
        next(f, None)
        rows = (row.rstrip().split(',') for row in f)
        return {name: label for name, label in rows}


labels = read_csv_labels(os.path.join(data_dir, 'trainLabels.csv'))