from concurrent.futures import ProcessPoolExecutor, as_completed
import logging

import tqdm
from d2l import mxnet as d2l
from mxnet import gluon, np, npx

log = logging.getLogger(__name__)


def load_data_fashion_mnist(batch_size, resize=None):
    """Download the Fashion-MNIST dataset and then load it into memory."""
    dataset = gluon.data.vision
    trans = [dataset.transforms.Resize(resize)] if resize else []
    trans.append(dataset.transforms.ToTensor())
    trans = dataset.transforms.Compose(trans)
    mnist_train = dataset.FashionMNIST(train=True).transform_first(trans)
    mnist_test = dataset.FashionMNIST(train=False).transform_first(trans)
    return (gluon.data.DataLoader(mnist_train, batch_size, shuffle=True,
                                  num_workers=d2l.get_dataloader_workers()),
            gluon.data.DataLoader(mnist_test, batch_size, shuffle=False,
                                  num_workers=d2l.get_dataloader_workers()))
def load_data(file_path, batch_size):
    """Load tabular feature data for valid/invalid classification."""
    invalid_thd = analyze_valid_threshold(file_path)
    log.info('Invalid throughput is set to %.1f GFLOP/s', invalid_thd)

    log.info('Parsing file...')
    with open(file_path, 'r') as filep:
        next(filep)  # Skip the header row.

        # Parse each CSV row: all columns but the last are features; the
        # last column (throughput) becomes a valid/invalid label.
        features = []
        valids = []
        for line in tqdm.tqdm(filep):
            tokens = line.replace('\n', '').split(',')
            features.append([float(v) for v in tokens[:-1]])
            valids.append(1 if float(tokens[-1]) > invalid_thd else 0)
    log.info('Total data size %d', len(features))

    # 70% for training, 10% for validation, 20% for testing.
    splitter1 = int(len(features) * 0.7)
    splitter2 = int(len(features) * 0.8)
    train_feats = np.array(features[:splitter1])
    train_valid = np.array(valids[:splitter1])

    # Calculate the class-imbalance weight (invalid-to-valid ratio) so the
    # loss can up-weight the minority class.
    num_valid = len(train_valid.nonzero()[0])
    num_invalid = len(train_valid) - num_valid
    pos_weight = num_invalid / num_valid

    train_iter = gluon.data.DataLoader(
        gluon.data.ArrayDataset(train_feats, train_valid), batch_size,
        shuffle=True, num_workers=d2l.get_dataloader_workers())
    validate_feats = np.array(features[splitter1:splitter2])
    validate_valids = np.array(valids[splitter1:splitter2])
    test_feats = np.array(features[splitter2:])
    test_valids = np.array(valids[splitter2:])
    return (train_iter, pos_weight, validate_feats, validate_valids,
            test_feats, test_valids)
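# `analyze_valid_threshold` is called above but not defined in this excerpt.
# Below is a minimal sketch of one plausible implementation, assuming the
# cutoff is a low percentile of the throughput column; the percentile rule
# and its default value are assumptions, not from the source.
def analyze_valid_threshold(file_path, percentile=5):
    """Hypothetical sketch: derive the invalid-throughput cutoff (GFLOP/s)
    as a low percentile of the observed throughputs."""
    import numpy  # Plain NumPy; this helper does not need MXNet.
    thrpts = []
    with open(file_path, 'r') as filep:
        next(filep)  # Skip the header row.
        for line in filep:
            thrpts.append(float(line.strip().rsplit(',', 1)[-1]))
    return float(numpy.percentile(thrpts, percentile))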
npx.set_np()
d2l.use_svg_display()

mnist_train = gluon.data.vision.FashionMNIST(train=True)
mnist_test = gluon.data.vision.FashionMNIST(train=False)
print("train length: {}, test length: {}".format(len(mnist_train),
                                                 len(mnist_test)))

X, y = mnist_train[:18]
# d2l.show_images(X.squeeze(axis=-1), 2, 9,
#                 titles=d2l.get_fashion_mnist_labels(y))
# plt.show()

# Time one full pass over the training set.
batch_size = 256
transformer = gluon.data.vision.transforms.ToTensor()
train_iter = gluon.data.DataLoader(mnist_train.transform_first(transformer),
                                   batch_size, shuffle=True,
                                   num_workers=d2l.get_dataloader_workers())
timer = d2l.Timer()
for X, y in train_iter:
    continue
print("loading data takes {:.2f} sec".format(timer.stop()))

train_iter, test_iter = d2l.load_data_fashion_mnist(32, (64, 64))
for X, y in train_iter:
    print(X.shape)
    break
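# The `load_data` variants below reference a module-level INVALID_THD (the
# invalid-throughput cutoff) and num_feature (the number of feature columns)
# that are not defined in this excerpt. Hypothetical placeholders so the
# snippets run standalone; the real values come from the surrounding project.
INVALID_THD = 0.0  # Assumed cutoff in GFLOP/s.
num_feature = 0    # Assumed number of feature columns in the CSV.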
def load_data(file_path, batch_size, num_hiddens):
    """Load tabular feature data for throughput regression."""
    log.info('Parsing file...')
    with open(file_path, 'r') as filep:
        next(filep)  # Skip the header row.

        # Parse features to a sequence of length num_feature + 1.
        num_seq = num_feature + 1
        features = []
        thrpts = []
        for line in tqdm.tqdm(filep):
            tokens = line.replace('\n', '').split(',')
            # Prepend an initial <CLS> token set to 0.
            features.append([0] + [float(v) for v in tokens[:-1]])
            thrpts.append(float(tokens[-1]))

    # Expand features to (batch, sequence, hidden).
    log.info('Expanding features...')
    with ProcessPoolExecutor(max_workers=8) as pool:
        expand_features = []
        for start in tqdm.tqdm(
                range(0, len(features), 8),
                bar_format='{desc}{percentage:3.0f}%|{bar:50}{r_bar}'):
            futures = [
                pool.submit(expand_hidden, feature=feature,
                            num_hiddens=num_hiddens)
                for feature in features[start:min(start + 8, len(features))]
            ]
            # Collect results in submission order so they stay aligned with
            # thrpts (as_completed would scramble the order within a chunk).
            for future in futures:
                expand_features.append(future.result())
    features = expand_features
    log.info('Total data size %d', len(features))

    # 70% for training, 10% for validation, 20% for testing.
    splitter1 = int(len(features) * 0.7)
    splitter2 = int(len(features) * 0.8)
    train_feats = np.array(features[:splitter1])
    train_thrpts = np.array(thrpts[:splitter1])

    # Make valid labels.
    labels = np.array(
        [1 if thrpt >= INVALID_THD else 0 for thrpt in train_thrpts])

    # Standardize training thrpts (overwritten by the min-max statistics
    # below; the standardization path is disabled).
    thrpt_avg, thrpt_std = (train_thrpts.mean().tolist(),
                            train_thrpts.std().tolist())
    # log.info('Train thrpt avg std: %.2f %.2f', thrpt_avg, thrpt_std)
    # train_thrpts = (train_thrpts - thrpt_avg) / thrpt_std
    # log.info('Standardized thrpt range %.2f, %.2f',
    #          min(train_thrpts), max(train_thrpts))

    # Min-max scale training thrpts. Despite the names (kept to match the
    # return signature), thrpt_avg holds the smallest nonzero throughput
    # and thrpt_std holds the value range.
    thrpt_avg = min(t for t in thrpts[:splitter1] if t > 0)
    thrpt_std = max(train_thrpts).tolist() - thrpt_avg
    log.info('Train thrpt min range: %.2f %.2f', thrpt_avg, thrpt_std)
    train_thrpts = (train_thrpts - thrpt_avg) / thrpt_std

    # Statistics: histogram of the scaled throughputs in 0.1-wide buckets.
    buckets = [0 for _ in range(11)]
    for thrpt in train_thrpts:
        buckets[int(thrpt * 10)] += 1
    log.info('Training thrpt distributions')
    for idx, bucket in enumerate(buckets):
        print('%d: %d' % (idx, bucket))

    # Calculate the class-imbalance weight.
    num_valid = sum(labels).tolist()
    num_invalid = len(train_thrpts) - num_valid
    pos_weight = num_invalid / num_valid
    log.info('Valid %.2f : Invalid %.2f', num_valid / len(train_thrpts),
             num_invalid / len(train_thrpts))

    # Take the log of training outputs (disabled).
    # train_thrpts = np.log(train_thrpts + 1e-6)

    train_iter = gluon.data.DataLoader(
        gluon.data.ArrayDataset(train_feats, train_thrpts, labels),
        batch_size, shuffle=True,
        num_workers=d2l.get_dataloader_workers())
    validate_feats = np.array(features[splitter1:splitter2])
    validate_thrpts = np.array(thrpts[splitter1:splitter2])
    test_feats = np.array(features[splitter2:])
    test_thrpts = np.array(thrpts[splitter2:])
    return (train_iter, pos_weight, validate_feats, validate_thrpts,
            test_feats, test_thrpts, thrpt_avg, thrpt_std)
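# `expand_hidden` runs in the worker processes above but is not defined in
# this excerpt. From its call site it must map a length-(num_feature + 1)
# row to a (sequence, hidden) array. A minimal sketch, assuming each scalar
# is simply broadcast across the hidden axis; the real encoding may differ.
def expand_hidden(feature, num_hiddens):
    """Hypothetical sketch: expand a flat feature row to the shape
    (len(feature), num_hiddens) by repeating each scalar along the
    hidden axis."""
    import numpy  # Plain NumPy keeps the worker independent of MXNet.
    row = numpy.asarray(feature, dtype='float32').reshape(-1, 1)
    return numpy.repeat(row, num_hiddens, axis=1)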
def load_data(file_path, batch_size, num_cls):
    """Load tabular feature data for throughput classification."""
    log.info('Parsing file...')
    with open(file_path, 'r') as filep:
        next(filep)  # Skip the header row.

        # Parse features to a sequence of length num_feature + 1.
        num_seq = num_feature + 1
        features = []
        thrpts = []
        for line in tqdm.tqdm(filep):
            tokens = line.replace('\n', '').split(',')
            # Filter out invalid records.
            thrpt = float(tokens[-1])
            if thrpt <= INVALID_THD:
                continue
            # Prepend an initial <CLS> token set to 0.
            features.append([0] + [float(v) for v in tokens[:-1]])
            thrpts.append(thrpt)
    log.info('Total data size %d', len(features))

    # Data balancing:
    # 70% for training, 10% for validation, 20% for testing.
    splitter1 = int(len(features) * 0.7)
    splitter2 = int(len(features) * 0.8)
    train_feats = np.array(features[:splitter1])
    train_thrpts = np.array(thrpts[:splitter1])
    log.info('Train thrpt min max: %.2f %.2f',
             min(train_thrpts).tolist(), max(train_thrpts).tolist())

    # Identify throughput class boundaries so each class covers an equal
    # share of the training data; boundaries[i] is the (inclusive) upper
    # bound of class i.
    sorted_thrpts = [e.tolist() for e in sorted(train_thrpts)]
    cls_size = len(sorted_thrpts) // num_cls
    boundaries = [sorted_thrpts[-1]]
    for ridx in range(num_cls - 1, 0, -1):
        boundaries.append(sorted_thrpts[ridx * cls_size])
    boundaries.reverse()

    # Transform throughputs to classes.
    log.info('Transforming throughput to class...')
    cls_thrpts = []
    with ProcessPoolExecutor(max_workers=8) as pool:
        for start in tqdm.tqdm(
                range(0, len(thrpts), 8),
                bar_format='{desc}{percentage:3.0f}%|{bar:50}{r_bar}'):
            futures = [
                pool.submit(find_class, thrpt=thrpt, boundaries=boundaries)
                for thrpt in thrpts[start:min(start + 8, len(thrpts))]
            ]
            # Collect results in submission order so the class labels stay
            # aligned with features (as_completed would scramble the order).
            for future in futures:
                cls_thrpts.append(future.result())
    train_thrpts = np.array(cls_thrpts[:splitter1], dtype='int32')

    # Statistics: per-class counts on the training split.
    buckets = [0 for _ in range(num_cls)]
    for thrpt_cls in train_thrpts:
        buckets[thrpt_cls.tolist()] += 1
    log.debug('Training throughput distributions')
    for idx, (boundary, bucket) in enumerate(zip(boundaries, buckets)):
        log.debug('\t%02d (<=%.2f): %d', idx, boundary, bucket)

    train_iter = gluon.data.DataLoader(
        gluon.data.ArrayDataset(train_feats, train_thrpts), batch_size,
        shuffle=True, num_workers=d2l.get_dataloader_workers())
    validate_feats = np.array(features[splitter1:splitter2])
    validate_thrpts = np.array(cls_thrpts[splitter1:splitter2], dtype='int32')
    test_feats = np.array(features[splitter2:])
    test_thrpts = np.array(cls_thrpts[splitter2:], dtype='int32')
    return (train_iter, validate_feats, validate_thrpts, test_feats,
            test_thrpts, boundaries)
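# `find_class` is not defined in this excerpt. Given the ascending
# `boundaries` list built above (boundaries[i] is the inclusive upper bound
# of class i), a binary-search sketch; the bisect-based implementation is an
# assumption consistent with that construction, not the source's code.
import bisect


def find_class(thrpt, boundaries):
    """Hypothetical sketch: return the index of the first boundary that is
    >= thrpt; anything above the last boundary maps to the top class."""
    idx = bisect.bisect_left(boundaries, thrpt)
    return min(idx, len(boundaries) - 1)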