def sequential_synthetic_dataset(root_path, dataset_name, data_type):
    """Generate a synthetic classification dataset and dump it to an LMDB file."""
    if data_type == "dense":
        X, y = make_classification(
            n_samples=10000,
            n_features=100,
            n_informative=90,
            n_classes=2,
            random_state=42,
        )
    else:
        raise NotImplementedError(
            "{} synthetic dataset is not supported.".format(data_type)
        )

    data = LIBSVMDataset(X, y, False)
    lmdb_file_path = os.path.join(
        root_path, "{}_{}.lmdb".format(dataset_name, data_type)
    )

    ds1 = PrefetchDataZMQ(data)
    LMDBSerializer.save(ds1, lmdb_file_path)
    print("Dumped dataflow to {} for {}".format(lmdb_file_path, dataset_name))
def sequential_epsilon_or_rcv1(root_path, name, data_type, is_sparse):
    data = LIBSVMDataset(root_path, name, data_type, is_sparse)
    lmdb_file_path = join(root_path, '{}_{}.lmdb'.format(name, data_type))

    print('dump_dataflow_to_lmdb for {}'.format(lmdb_file_path))
    ds1 = PrefetchDataZMQ(data, nr_proc=1)
    LMDBSerializer.save(ds1, lmdb_file_path)
def sequential_epsilon_or_rcv1(root_path, name, data_type):
    features, labels, is_sparse = _load_libsvm_data(root_path, name, data_type)
    data = LIBSVMDataset(features, labels, is_sparse)
    lmdb_file_path = os.path.join(root_path, "{}_{}.lmdb".format(name, data_type))

    ds1 = PrefetchDataZMQ(data)
    LMDBSerializer.save(ds1, lmdb_file_path)
    print("Dumped dataflow to {} for {}".format(lmdb_file_path, name))
def sequential_downsampled_imagenet(args):
    data = DownsampledImageNet(args.data_dir, args.data_type, args.img_size)
    lmdb_file_path = os.path.join(
        args.data_dir, f"imagenet{args.img_size}_{args.data_type}.lmdb"
    )

    # delete file if exists.
    if os.path.exists(lmdb_file_path) and args.force_delete == 1:
        os.remove(lmdb_file_path)

    # serialize to the target path.
    ds1 = PrefetchDataZMQ(data, num_proc=1)
    LMDBSerializer.save(ds1, lmdb_file_path)
def sequential_synthetic_dataset(root_path, dataset_name):
    """Generate a synthetic classification dataset and dump it to an LMDB file."""
    if 'dense' in dataset_name:
        X, y = make_classification(
            n_samples=10000, n_features=100, n_informative=90,
            n_classes=2, random_state=42)
    else:
        raise NotImplementedError(
            "{} synthetic dataset is not supported.".format(dataset_name))

    data = SyntheticLIBSVMDataset(X, y)
    lmdb_file_path = join(root_path, '{}.lmdb'.format(dataset_name))

    print('dump_dataflow_to_lmdb for {}'.format(lmdb_file_path))
    ds1 = PrefetchDataZMQ(data, nr_proc=1)
    LMDBSerializer.save(ds1, lmdb_file_path)
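# ---------------------------------------------------------------------------
# Read-side sketch for the lmdb files written above (not part of the original
# scripts): LMDBSerializer.load restores the serialized dataflow, and the usual
# tensorpack wrappers can be stacked on top of it. The file name
# "epsilon_dense.lmdb" and the [features, labels] datapoint layout are
# assumptions for illustration.
# ---------------------------------------------------------------------------
from tensorpack.dataflow import BatchData, LMDBSerializer, LocallyShuffleData

ds = LMDBSerializer.load("epsilon_dense.lmdb", shuffle=False)
ds = LocallyShuffleData(ds, buffer_size=1024)  # approximate shuffling within a buffer
ds = BatchData(ds, batch_size=32, use_list=True)

ds.reset_state()
for dp in ds:
    # dp is one batch; its components follow whatever layout the writer-side
    # dataflow (e.g. LIBSVMDataset) yielded, assumed here to be [features, labels].
    break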
class Raw(DataFlow):
    # NOTE: the original __init__ is truncated in the source; it is assumed to
    # store the loaded pickle in self.data before initializing the counter.
    def __init__(self, data):
        self.data = data
        self.count = 0

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        for bag in self.data:
            # X = bag['X']
            # Pos1 = bag['Pos1']
            # Pos2 = bag['Pos2']
            # DepMask = bag['DepMask']
            HeadPos = bag["HeadPos"]
            TailPos = bag["TailPos"]
            if max(max(bag["HeadPos"]), max(bag["TailPos"])) > 100:
                self.count += 1
            # DepLabel = bag['Dep']
            # ReLabel = bag['Y']
            # HeadLabel = bag['HeadLabel']
            # TailLabel = bag['TailLabel']
            # output = [X, Pos1, Pos2, DepMask, HeadPos, TailPos,
            #           DepLabel, ReLabel, HeadLabel, TailLabel]
            output = [HeadPos, TailPos]
            yield output


if __name__ == "__main__":
    data = pickle.load(open("/data/PKL/train.pkl", "rb"))
    ds = Raw(data)
    LMDBSerializer.save(ds, "/data/MLRE/testpkl")
    print(ds.count)
                TailLabel,
            ]
            yield output


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(title="command", dest="command")

    parser_db = subparsers.add_parser("build", help="build train/test database")
    parser_db.add_argument("--dataset", help="path to train/test data", required=True)
    parser_db.add_argument("--db", help="output lmdb file", required=True)

    parser_eval = subparsers.add_parser("eval", help="build p@n eval database")
    parser_eval.add_argument("--dataset", help="path to eval data", required=True)
    parser_eval.add_argument("--db", help="output eval lmdb file", required=True)

    args = parser.parse_args()

    if args.command == "build":
        data = pickle.load(open(args.dataset, "rb"))
        ds = Raw(data)
        LMDBSerializer.save(ds, args.db)
    elif args.command == "eval":
        data = pickle.load(open(args.dataset, "rb"))
        ds = Raw(data)
        LMDBSerializer.save(ds, args.db)
    'char': sentiment_c,
    'num': m
}, open('./data/sentiment.pkl', 'wb'))

# make train/dev/test
for i in range(10):
    train_ori = get_train(data, 10, i)
    test_ori = get_test(data, 10, i)
    train = []
    dev = []
    test = []
    for j in range(2):
        random.shuffle(train_ori[j])
        x = len(train_ori[j]) * 9 // 10
        train += train_ori[j][:x]
        dev += train_ori[j][x:]
    test += test_ori

    random.shuffle(train)
    random.shuffle(dev)
    random.shuffle(test)

    train_ = process_data(train, word2id, char2id)
    dev_ = process_data(dev, word2id, char2id)
    test_ = process_data(test, word2id, char2id)

    train_data = MEANdata(train_)
    dev_data = MEANdata(dev_)
    test_data = MEANdata(test_)

    os.system('mkdir mdb%s' % i)
    LMDBSerializer.save(train_data, './mdb{}/train.mdb'.format(i))
    LMDBSerializer.save(dev_data, './mdb{}/dev.mdb'.format(i))
    LMDBSerializer.save(test_data, './mdb{}/test.mdb'.format(i))
# --------------------
parser.add_argument('--imagenet_folder')
parser.add_argument('--val', action='store_true')
parser.add_argument('--train', action='store_true')
parser.add_argument('--lmdb_file', type=str)

args = parser.parse_args()

if args.val and args.train:
    print("Train and Validation options are mutually exclusive! Choose only one.")
    # Exit here so the two flags are actually mutually exclusive instead of
    # silently falling through to the --val branch.
    raise SystemExit(1)

if args.val:
    print("We are generating the lmdb file containing validation images of imagenet.")
    print(f"The file will be saved at {args.lmdb_file}.lmdb")

    ds0 = BinaryILSVRC12(os.path.expanduser(args.imagenet_folder), 'val')
    ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
    LMDBSerializer.save(ds1, f"{os.path.expanduser(args.lmdb_file)}.lmdb")
elif args.train:
    print("We are generating the lmdb file containing training images of imagenet.")
    print(f"The file will be saved at {args.lmdb_file}.lmdb")

    ds0 = BinaryILSVRC12(os.path.expanduser(args.imagenet_folder), 'train')
    ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
    LMDBSerializer.save(ds1, f"{os.path.expanduser(args.lmdb_file)}.lmdb")
def compute_mean_std(db, fname):
    # NOTE: the head of this function is truncated in the source; loading the
    # dataflow back from the lmdb file is assumed here.
    ds = LMDBSerializer.load(db, shuffle=False)
    ds.reset_state()
    o = OnlineMoments()
    for dp in get_tqdm(ds):
        feat = dp[0]  # len x dim
        for f in feat:
            o.feed(f)
    logger.info("Writing to {} ...".format(fname))
    with open(fname, 'wb') as f:
        f.write(serialize.dumps([o.mean, o.std]))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(title='command', dest='command')

    parser_db = subparsers.add_parser('build', help='build an LMDB database')
    parser_db.add_argument('--dataset',
                           help='path to TIMIT TRAIN or TEST directory', required=True)
    parser_db.add_argument('--db', help='output lmdb file', required=True)

    parser_stat = subparsers.add_parser('stat',
                                        help='compute statistics (mean/std) of dataset')
    parser_stat.add_argument('--db', help='input lmdb file', required=True)
    parser_stat.add_argument('-o', '--output',
                             help='output statistics file', default='stats.data')

    args = parser.parse_args()
    if args.command == 'build':
        ds = RawTIMIT(args.dataset)
        LMDBSerializer.save(ds, args.db)
    elif args.command == 'stat':
        compute_mean_std(args.db, args.output)
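# ---------------------------------------------------------------------------
# Sketch (not part of the original script): reading back the statistics file
# written by compute_mean_std above. It assumes `serialize` refers to
# tensorpack.utils.serialize and that the default "stats.data" path was used.
# ---------------------------------------------------------------------------
from tensorpack.utils import serialize

with open("stats.data", "rb") as f:
    mean, std = serialize.loads(f.read())
print(mean, std)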
parser.add_argument('database_dir', type=str, default=None,
                    help='location to save output database')
args = parser.parse_args()


class BinaryILSVRC12(dataset.ILSVRC12Files):
    def get_data(self):
        for fname, label in super(BinaryILSVRC12, self).__iter__():
            with open(fname, 'rb') as f:
                jpeg = f.read()
            jpeg = np.asarray(bytearray(jpeg), dtype='uint8')
            yield [jpeg, label]


if args.database_dir is None:
    lmdb_path = args.imagenet
else:
    lmdb_path = args.database_dir

os.environ['TENSORPACK_DATASET'] = os.path.join(lmdb_path, "tensorpack_data")
if not os.path.exists(os.environ['TENSORPACK_DATASET']):
    os.mkdir(os.environ['TENSORPACK_DATASET'])

for name in ['train', 'val']:
    db_filename = 'ILSVRC-%s.lmdb' % name
    db_loc = os.path.join(lmdb_path, db_filename)
    print(f"Processing {args.imagenet} {name} to {db_loc}...")
    ds0 = BinaryILSVRC12(args.imagenet, name)
    ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
    LMDBSerializer.save(ds1, db_loc)
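# ---------------------------------------------------------------------------
# Read-side sketch (not part of the original script): the datapoints stored by
# BinaryILSVRC12 are [jpeg_bytes, label], so a consumer has to decode the jpeg
# itself. cv2 availability and the file name "ILSVRC-val.lmdb" are assumptions.
# ---------------------------------------------------------------------------
import cv2
from tensorpack.dataflow import LMDBSerializer, MapDataComponent

ds = LMDBSerializer.load("ILSVRC-val.lmdb", shuffle=False)
# Component 0 holds the raw jpeg byte array written above; decode it to a BGR image.
ds = MapDataComponent(ds, lambda jpeg: cv2.imdecode(jpeg, cv2.IMREAD_COLOR), index=0)

ds.reset_state()
for img, label in ds:
    print(img.shape, label)
    break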