def sequential_synthetic_dataset(root_path, dataset_name, data_type):
    """Generate a synthetic dataset for binary classification."""
    if data_type == "dense":
        X, y = make_classification(
            n_samples=10000,
            n_features=100,
            n_informative=90,
            n_classes=2,
            random_state=42,
        )
    else:
        raise NotImplementedError(
            "{} synthetic dataset is not supported.".format(data_type)
        )

    data = LIBSVMDataset(X, y, False)
    lmdb_file_path = os.path.join(
        root_path, "{}_{}.lmdb".format(dataset_name, data_type)
    )

    ds1 = PrefetchDataZMQ(data)
    LMDBSerializer.save(ds1, lmdb_file_path)
    print("Dumped dataflow to {} for {}".format(lmdb_file_path, dataset_name))
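Once the dump finishes, the LMDB can be read back with the matching loader. A minimal sketch, assuming tensorpack's `LMDBSerializer` is importable and `lmdb_file_path` is the path produced above:

from tensorpack.dataflow import LMDBSerializer

ds = LMDBSerializer.load(lmdb_file_path, shuffle=False)
ds.reset_state()  # tensorpack DataFlows must be reset before iteration
for dp in ds:
    ...  # each dp is one serialized datapoint, presumably a (features, label) pair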
def sequential_epsilon_or_rcv1(root_path, name, data_type, is_sparse):
    data = LIBSVMDataset(root_path, name, data_type, is_sparse)
    lmdb_file_path = join(root_path, '{}_{}.lmdb'.format(name, data_type))
    print('dump_dataflow_to_lmdb for {}'.format(lmdb_file_path))

    ds1 = PrefetchDataZMQ(data, nr_proc=1)
    LMDBSerializer.save(ds1, lmdb_file_path)
def sequential_epsilon_or_rcv1(root_path, name, data_type):
    features, labels, is_sparse = _load_libsvm_data(root_path, name, data_type)
    data = LIBSVMDataset(features, labels, is_sparse)
    lmdb_file_path = os.path.join(root_path, "{}_{}.lmdb".format(name, data_type))

    ds1 = PrefetchDataZMQ(data)
    LMDBSerializer.save(ds1, lmdb_file_path)
    print("Dumped dataflow to {} for {}".format(lmdb_file_path, name))
def sequential_downsampled_imagenet(args):
    data = DownsampledImageNet(args.data_dir, args.data_type, args.img_size)
    lmdb_file_path = os.path.join(
        args.data_dir, f"imagenet{args.img_size}_{args.data_type}.lmdb")

    # Delete the target file if it already exists.
    if os.path.exists(lmdb_file_path) and args.force_delete == 1:
        os.remove(lmdb_file_path)

    # Serialize to the target path.
    ds1 = PrefetchDataZMQ(data, num_proc=1)
    LMDBSerializer.save(ds1, lmdb_file_path)
def sequential_synthetic_dataset(root_path, dataset_name):
    """Generate a synthetic dataset for binary classification."""
    if 'dense' in dataset_name:
        X, y = make_classification(n_samples=10000, n_features=100,
                                   n_informative=90, n_classes=2,
                                   random_state=42)
    else:
        raise NotImplementedError(
            "{} synthetic dataset is not supported.".format(dataset_name))

    data = SyntheticLIBSVMDataset(X, y)
    lmdb_file_path = join(root_path, '{}.lmdb'.format(dataset_name))
    print('dump_dataflow_to_lmdb for {}'.format(lmdb_file_path))

    ds1 = PrefetchDataZMQ(data, nr_proc=1)
    LMDBSerializer.save(ds1, lmdb_file_path)
def compute_mean_std(db, fname):
    ds = LMDBSerializer.load(db, shuffle=False)
    ds.reset_state()
    o = OnlineMoments()
    for dp in get_tqdm(ds):
        feat = dp[0]  # len x dim
        for f in feat:
            o.feed(f)
    logger.info("Writing to {} ...".format(fname))
    with open(fname, 'wb') as f:
        f.write(serialize.dumps([o.mean, o.std]))
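The statistics file written above can be consumed later to normalize features. A minimal sketch, assuming `serialize` is tensorpack's `tensorpack.utils.serialize` as in the snippet, and that `fname` was `'stats.data'`:

from tensorpack.utils import serialize

with open('stats.data', 'rb') as f:
    mean, std = serialize.loads(f.read())
normalized = (feat - mean) / std  # `feat` is a (len x dim) feature array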
def getdata(path, isTrain):
    ds = LMDBSerializer.load(path, shuffle=isTrain)
    # Graph benchmark (synthetic stand-in for the real data):
    # ds = FakeData([[10, 10], [10, 10], [10, 10], [10, 10], [10], [10], [10, 10], [1], [1], [1]],
    #               1000, random=False,
    #               dtype=['int32'] * 10,
    #               domain=[(0, 100), (0, 120), (0, 120), (0, 1), (0, 100),
    #                       (0, 100), (0, 100), (0, 52), (0, 115), (0, 115)])
    ds = getbatch(ds, 32, isTrain)
    if isTrain:
        ds = MultiProcessRunnerZMQ(ds, 4)
    return ds
def __init__(self, file_location, batch_size, train=True, shuffle=True,
             full=False, batch_from_disk=150):
    self.batch_size = batch_size
    self.train = train
    if train:
        self.ds = MyLMDBSerializer.load(file_location, shuffle=shuffle,
                                        batch_from_disk=batch_from_disk)
        self.ds = MyLocallyShuffleData(self.ds, buffer_size=10000,
                                       shuffle_interval=500)
        self.ds = MultiProcessRunnerZMQ(self.ds, num_proc=1, hwm=10000)
        self.len_ = 1281167  # number of ImageNet training images
    else:
        self.ds = LMDBSerializer.load(file_location, shuffle=False)
        self.ds = MultiProcessRunnerZMQ(self.ds, num_proc=1, hwm=10000)
        self.len_ = 50000  # number of ImageNet validation images
    self.ds.reset_state()
    self.batches_in_epoch = int(math.ceil(self.len_ / self.batch_size))
def __init__(self, mode, batch_size=256, shuffle=False, num_workers=25,
             cache=50000, device='cuda'):
    # Enumerate the standard ImageNet augmentors.
    imagenet_augmentors = fbresnet_augmentor(mode == 'train')

    # Load the lmdb if we can find it.
    base_dir = '/userhome/cs/u3003679/'
    lmdb_loc = os.path.join(base_dir, 'ILSVRC-{}.lmdb'.format(mode))
    ds = LMDBSerializer.load(lmdb_loc, shuffle=shuffle)
    ds = LocallyShuffleData(ds, cache)

    def f(dp):
        # Decode the stored JPEG bytes, then apply the augmentors in order.
        x, label = dp
        x = cv2.imdecode(x, cv2.IMREAD_COLOR)
        for aug in imagenet_augmentors:
            x = aug.augment(x)
        return x, label

    ds = MultiProcessMapDataZMQ(ds, num_proc=8, map_func=f)
    self.ds = BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
    self.device = device
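A usage sketch for this loader; `ImageNetLoader` is a hypothetical name for the class this `__init__` belongs to (the fragment does not show it):

loader = ImageNetLoader('train', batch_size=256, shuffle=True)  # hypothetical class name
for images, labels in loader.ds:
    ...  # each iteration yields one augmented, batched datapoint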
def getdata(path, isTrain):
    ds = LMDBSerializer.load(path, shuffle=isTrain)
    ds = getbatch(ds, 64, isTrain)
    if isTrain:
        ds = MultiProcessRunnerZMQ(ds, 4)
    return ds
            TailLabel,
        ]
        yield output


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    parser_db = subparsers.add_parser("build", help="build train/test database")
    parser_db.add_argument("--dataset", help="path to train/test data", required=True)
    parser_db.add_argument("--db", help="output lmdb file", required=True)
    parser_eval = subparsers.add_parser("eval", help="build p@n eval database")
    parser_eval.add_argument("--dataset", help="path to eval data", required=True)
    parser_eval.add_argument("--db", help="output eval lmdb file", required=True)
    args = parser.parse_args()

    if args.command == "build":
        data = pickle.load(open(args.dataset, "rb"))
        ds = Raw(data)
        LMDBSerializer.save(ds, args.db)
    elif args.command == "eval":
        data = pickle.load(open(args.dataset, "rb"))
        ds = Raw(data)
        LMDBSerializer.save(ds, args.db)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(title='command', dest='command')
    parser_db = subparsers.add_parser('build', help='build a LMDB database')
    parser_db.add_argument('--dataset', help='path to TIMIT TRAIN or TEST directory',
                           required=True)
    parser_db.add_argument('--db', help='output lmdb file', required=True)
    parser_stat = subparsers.add_parser('stat',
                                        help='compute statistics (mean/std) of dataset')
    parser_stat.add_argument('--db', help='input lmdb file', required=True)
    parser_stat.add_argument('-o', '--output', help='output statistics file',
                             default='stats.data')
    args = parser.parse_args()

    if args.command == 'build':
        ds = RawTIMIT(args.dataset)
        LMDBSerializer.save(ds, args.db)
    elif args.command == 'stat':
        compute_mean_std(args.db, args.output)
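Example invocations of this CLI (the script filename `create_timit_db.py` is hypothetical):

# python create_timit_db.py build --dataset /path/to/TIMIT/TRAIN --db train.lmdb
# python create_timit_db.py stat --db train.lmdb -o stats.data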
    'char': sentiment_c,
    'num': m
}, open('./data/sentiment.pkl', 'wb'))

# Make the train/dev/test splits for 10-fold cross-validation.
for i in range(10):
    train_ori = get_train(data, 10, i)
    test_ori = get_test(data, 10, i)
    train = []
    dev = []
    test = []
    for j in range(2):
        random.shuffle(train_ori[j])
        x = len(train_ori[j]) * 9 // 10
        train += train_ori[j][:x]
        dev += train_ori[j][x:]
    test += test_ori
    random.shuffle(train)
    random.shuffle(dev)
    random.shuffle(test)

    train_ = process_data(train, word2id, char2id)
    dev_ = process_data(dev, word2id, char2id)
    test_ = process_data(test, word2id, char2id)
    train_data = MEANdata(train_)
    dev_data = MEANdata(dev_)
    test_data = MEANdata(test_)

    os.makedirs('mdb%s' % i, exist_ok=True)
    LMDBSerializer.save(train_data, './mdb{}/train.mdb'.format(i))
    LMDBSerializer.save(dev_data, './mdb{}/dev.mdb'.format(i))
    LMDBSerializer.save(test_data, './mdb{}/test.mdb'.format(i))
# --------------------
parser.add_argument('--imagenet_folder')
parser.add_argument('--val', action='store_true')
parser.add_argument('--train', action='store_true')
parser.add_argument('--lmdb_file', type=str)
args = parser.parse_args()

if args.val and args.train:
    print("Train and validation options are mutually exclusive! Choose only one.")

if args.val:
    print("Generating the lmdb file containing the validation images of ImageNet.")
    print(f"The file will be saved at {args.lmdb_file}.lmdb")
    ds0 = BinaryILSVRC12(os.path.expanduser(args.imagenet_folder), 'val')
    ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
    LMDBSerializer.save(ds1, f"{os.path.expanduser(args.lmdb_file)}.lmdb")
elif args.train:
    print("Generating the lmdb file containing the training images of ImageNet.")
    print(f"The file will be saved at {args.lmdb_file}.lmdb")
    ds0 = BinaryILSVRC12(os.path.expanduser(args.imagenet_folder), 'train')
    ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
    LMDBSerializer.save(ds1, f"{os.path.expanduser(args.lmdb_file)}.lmdb")
        self.count = 0

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        for bag in self.data:
            # The full datapoint also carries X, Pos1, Pos2, DepMask, DepLabel,
            # ReLabel, HeadLabel and TailLabel; only the positions are kept here.
            HeadPos = bag["HeadPos"]
            TailPos = bag["TailPos"]
            if max(max(bag["HeadPos"]), max(bag["TailPos"])) > 100:
                self.count += 1
            output = [HeadPos, TailPos]
            yield output


if __name__ == "__main__":
    data = pickle.load(open("/data/PKL/train.pkl", "rb"))
    ds = Raw(data)
    LMDBSerializer.save(ds, "/data/MLRE/testpkl")
    print(ds.count)
def getdata(path, batchsize, isTrain):
    ds = LMDBSerializer.load(path, shuffle=isTrain)
    ds = getbatch(ds, batchsize, isTrain)
    # if isTrain:
    #     ds = MultiProcessRunnerZMQ(ds, 2)
    return ds
parser.add_argument('database_dir', type=str, default=None,
                    help='location to save output database')
args = parser.parse_args()


class BinaryILSVRC12(dataset.ILSVRC12Files):
    def get_data(self):
        # Store the raw JPEG bytes instead of decoded arrays to keep the LMDB small.
        for fname, label in super(BinaryILSVRC12, self).__iter__():
            with open(fname, 'rb') as f:
                jpeg = f.read()
            jpeg = np.asarray(bytearray(jpeg), dtype='uint8')
            yield [jpeg, label]


if args.database_dir is None:
    lmdb_path = args.imagenet
else:
    lmdb_path = args.database_dir

os.environ['TENSORPACK_DATASET'] = os.path.join(lmdb_path, "tensorpack_data")
if not os.path.exists(os.environ['TENSORPACK_DATASET']):
    os.mkdir(os.environ['TENSORPACK_DATASET'])

for name in ['train', 'val']:
    db_filename = 'ILSVRC-%s.lmdb' % name
    db_loc = os.path.join(lmdb_path, db_filename)
    print(f"Processing {args.imagenet} {name} to {db_loc}...")
    ds0 = BinaryILSVRC12(args.imagenet, name)
    ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
    LMDBSerializer.save(ds1, db_loc)
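Saving is done with a single process (`num_proc=1`), likely because serialization should see each datapoint exactly once and in a deterministic order. A minimal sketch of consuming the resulting LMDB for training, assuming tensorpack's dataflow API; the buffer and batch sizes are illustrative:

import cv2
from tensorpack.dataflow import (BatchData, LMDBSerializer, LocallyShuffleData,
                                 MapDataComponent)

ds = LMDBSerializer.load('ILSVRC-train.lmdb', shuffle=False)
ds = LocallyShuffleData(ds, 50000)  # approximate shuffling within a sliding buffer
ds = MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
ds = BatchData(ds, 64)
ds.reset_state()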