def __setstate__(self, d):
    self.rootdir = d['rootdir']
    self.columns = d['columns']
    self.blocks = dict(
        (col, bcolz.carray(rootdir=os.path.join(self.rootdir, '%s.bcolz' % col)))
        for col in self.columns)
    self.index = bcolz.carray(rootdir=os.path.join(self.rootdir, 'index.bcolz'))
    self._explicitly_given_path = True
def factorize_groupby_cols(self, groupby_cols):
    """
    :type self: ctable
    """
    # first check if the factorized arrays already exist
    # unless we need to refresh the cache
    factor_list = []
    values_list = []

    # factorize the groupby columns
    for col in groupby_cols:
        if self.cache_valid(col):
            col_rootdir = self[col].rootdir
            col_factor_rootdir = col_rootdir + '.factor'
            col_values_rootdir = col_rootdir + '.values'
            col_factor_carray = \
                bcolz.carray(rootdir=col_factor_rootdir, mode='r')
            col_values_carray = \
                bcolz.carray(rootdir=col_values_rootdir, mode='r')
        else:
            col_factor_carray, values = ctable_ext.factorize(self[col])
            col_values_carray = \
                bcolz.carray(values.values(), dtype=self[col].dtype)

        factor_list.append(col_factor_carray)
        values_list.append(col_values_carray)

    return factor_list, values_list
def resource_bcolz(uri, dshape=None, expected_dshape=None, **kwargs):
    if os.path.exists(uri):
        try:
            return ctable(rootdir=uri)
        except IOError:  # __rootdirs__ doesn't exist because we aren't a ctable
            return carray(rootdir=uri)
    else:
        if not dshape:
            raise ValueError("Must specify either existing bcolz directory or"
                             " valid datashape")
        dshape = datashape.dshape(dshape)
        dt = datashape.to_numpy_dtype(dshape)
        shape_tail = tuple(map(int, dshape.shape[1:]))  # tail of shape
        if dshape.shape[0] == datashape.var:
            shape = (0,) + shape_tail
        else:
            shape = (int(dshape.shape[0]),) + shape_tail

        x = np.empty(shape=shape, dtype=dt)
        kwargs = keyfilter(keywords.__contains__, kwargs)
        expectedlen = kwargs.pop(
            'expectedlen',
            int(expected_dshape[0])
            if expected_dshape is not None and
            isinstance(expected_dshape[0], datashape.Fixed)
            else None)

        if datashape.predicates.isrecord(dshape.measure):
            return ctable(x, rootdir=uri, expectedlen=expectedlen, **kwargs)
        else:
            return carray(x, rootdir=uri, expectedlen=expectedlen, **kwargs)
def _from_carray(path, format_categories=None, format_codes=None, format_values=None): meta = json.load(open(os.path.join(path, 'meta'), 'r')) if meta['type'] == 'category': if format_categories in ['npz', 'npy']: filename = os.path.join(path, 'categories.%s' % format_categories) with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)): categories_values = numpy.load(filename, mmap_mode='r+') # TODO npz not memmap? if format_categories == 'npz': categories_values = categories_values['arr_0'] elif format_categories == 'pickle': filename = os.path.join(path, 'categories.pickle') with log.timedlogger("reading [%s] %s" % (meta['name'], filename)): categories_values = pickle.load(open(filename, 'rb')) elif format_categories == 'bcolz': rootdir = os.path.join(path, 'categories.bcolz') with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)): categories_values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r') # categories_values = bcolz.carray(rootdir=rootdir, mode='r')[:] else: raise NotImplementedError("uh oh %s" % (meta['type'],)) if format_codes == 'bcolz': rootdir = os.path.join(path, 'codes.bcolz') with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)): codes_values = bcolz.carray(rootdir=rootdir, mode='r')[:] # , categories=categories_values) # codes_values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r') # , categories=categories_values) elif format_codes == 'npy': filename = os.path.join(path, 'codes.npy') with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)): codes_values = numpy.load(filename, mmap_mode='r+') else: raise Exception("unknown format_codes type %s" % (format_codes,)) with log.timedlogger("FastCat construction"): s = FastCat(codes_values, categories_values) else: if format_values == 'bcolz': rootdir = os.path.join(path, 'values.bcolz') with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)): # values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r') s = bcolz.carray(rootdir=rootdir, mode='r')[:] elif format_values == 'npy': filename = os.path.join(path, 'values.npy') with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)): s = numpy.load(filename, mmap_mode='r+') elif format_values == 'pickle': filename = os.path.join(path, 'values.pickle') with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)): s = pickle.load(open(filename, 'rb')) # with log.timedlogger("FastSeries construction"): # index = pandas.Index(numpy.arange(len(values)), copy=False) # values = SingleBlockManager(values, index, fastpath=True) # s = pandas.Series(data=values, fastpath=True, copy=False, dtype=meta['type']) # s = values # [:] # logging.warning('Constructing categorical for %s' % meta['name']) # s = pandas.Categorical.from_codes(codes_values, categories_values, name=meta['name']) return meta, s # codes_values, categories_values
def build_carray(array, rootdir):
    """ Used in ctable.__reduce__

    Pickling functions can't be in pyx files.  Putting this tiny helper
    function here instead.
    """
    from bcolz import carray
    if rootdir:
        return carray(rootdir=rootdir)
    else:
        return carray(array)
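# Hedged usage sketch (not part of the original sources): build_carray is the
# helper that ctable.__reduce__ points pickle at, so unpickling exercises one
# of its two branches. Assumes only numpy is available alongside bcolz.
import numpy as np

in_memory = build_carray(np.arange(10), rootdir=None)  # no rootdir -> plain carray(array)
print(len(in_memory), in_memory.dtype)

# For a disk-backed carray only the rootdir is needed to rebuild it, e.g.:
# on_disk = build_carray(None, rootdir='/path/to/existing.bcolz')  # illustrative path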
def to_dict_of_blocks(d, rootdir):
    """ deprecated.

    for pure numpy things like {'X_train': X_train, 'X_test': X_test}
    """
    if os.path.exists(rootdir):
        _move_and_remove_nonblocking(rootdir)
    _mkdir(rootdir)
    meta = {'keys': list(d.keys())}
    json.dump(meta, open(os.path.join(rootdir, 'meta'), 'w'))
    for i, k in enumerate(meta['keys']):
        filename = os.path.join(rootdir, str(i))
        with log.timedlogger('writing {} ({}.shape = {})'.format(filename, k, d[k].shape)):
            bcolz.carray(d[k], rootdir=filename)
def handle_frame(self, i, frame):
    if i == 0:
        self.frames = bcolz.carray(
            np.zeros((0,) + frame.coords.shape, dtype="float32"),
            rootdir=os.path.join(self.rootdir, "coords"), mode='w')
        self.frames.attrs['timestamp'] = self.timestamp
        self.boxes = bcolz.carray(
            np.zeros((0,) + frame.box.shape, dtype="float32"),
            rootdir=os.path.join(self.rootdir, "boxes"), mode='w')
        self.times = []
    self.frames.append(frame.coords)
    self.boxes.append(frame.box)
    self.times.append(frame.time)
def unique(self, col_or_col_list):
    """
    Return the unique values of a column, or a list of per-column unique
    values when given a list of columns.

    :param col_or_col_list: a column or a list of columns
    :return:
    """
    if isinstance(col_or_col_list, list):
        col_is_list = True
        col_list = col_or_col_list
    else:
        col_is_list = False
        col_list = [col_or_col_list]

    output = []

    for col in col_list:
        if self.cache_valid(col):
            # retrieve values from existing disk-based factorization
            col_values_rootdir = self[col].rootdir + '.values'
            carray_values = bcolz.carray(rootdir=col_values_rootdir, mode='r')
            values = list(carray_values)
        else:
            # factorize on-the-fly
            _, values = ctable_ext.factorize(self[col])
            values = values.values()

        output.append(values)

    if not col_is_list:
        output = output[0]

    return output
def open(rootdir, mode='a'):
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object or None (if no objects are found)

    """
    # First try with a carray
    obj = None
    try:
        obj = bcolz.carray(rootdir=rootdir, mode=mode)
    except IOError:
        # Not a carray.  Now try with a ctable
        try:
            obj = bcolz.ctable(rootdir=rootdir, mode=mode)
        except IOError:
            # Not a ctable
            pass
    return obj
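# Hedged usage sketch (not from the original sources): round-tripping a small
# carray through disk and reopening it with open(); 'demo.bcolz' is an
# illustrative path.
import numpy as np
import bcolz

bcolz.carray(np.arange(1000), rootdir='demo.bcolz', mode='w').flush()
a = bcolz.open('demo.bcolz', mode='r')
print(type(a), len(a), a[:5])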
def setChannelData(self, channelName, data, compression=False):
    """Modifies data of channel

    Parameters
    ----------------
    channelName : str
        channel name
    data : numpy array
        channel data
    compression : bool or str
        trigger for data compression
    """
    if compression and CompressionPossible:
        if not isinstance(compression, str):
            if isinstance(compression, int):
                comp = compression
            else:
                comp = self._compression_level
            temp = carray(data, cparams=cparams(clevel=comp),
                          expectedlen=int(getsizeof(data) / 10))
        else:
            temp = compressed_data()
            temp.compression(data)
        self._setChannel(channelName, temp, field=dataField)
    else:
        self._setChannel(channelName, data, field=dataField)
def test_create_unsafe_carray_with_unsafe_data(self):
    """ We introduce a safe keyword arg which removes dtype checking.

    We don't want this to interfere with creation.
    """
    b = bcolz.carray([1, 2, 3], dtype='i4', safe=False)
    self.assertEqual(b.safe, False)
    self.assertEqual(b[0], 1)
def open(rootdir, mode='a'):
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object or IOError (if no objects are found)

    """
    # A ctable rootdir carries a ROOTDIRS file; otherwise assume a carray
    rootsfile = os.path.join(rootdir, ROOTDIRS)
    if os.path.exists(rootsfile):
        return bcolz.ctable(rootdir=rootdir, mode=mode)
    else:
        return bcolz.carray(rootdir=rootdir, mode=mode)
def fill(shape, dflt=None, dtype=np.float, **kwargs): """ fill(shape, dtype=float, dflt=None, **kwargs) Return a new carray object of given shape and type, filled with `dflt`. Parameters ---------- shape : int Shape of the new array, e.g., ``(2,3)``. dflt : Python or NumPy scalar The value to be used during the filling process. If None, values are filled with zeros. Also, the resulting carray will have this value as its `dflt` value. dtype : data-type, optional The desired data-type for the array, e.g., `numpy.int8`. Default is `numpy.float64`. kwargs : list of parameters or dictionary Any parameter supported by the carray constructor. Returns ------- out : carray Array filled with `dflt` values with the given shape and dtype. See Also -------- ones, zeros """ dtype = np.dtype(dtype) if type(shape) in _inttypes + (float,): shape = (int(shape),) else: shape = tuple(shape) if len(shape) > 1: # Multidimensional shape. # The atom will have shape[1:] dims (+ the dtype dims). dtype = np.dtype((dtype.base, shape[1:]+dtype.shape)) length = shape[0] # Create the container expectedlen = kwargs.pop("expectedlen", length) if dtype.kind == "V" and dtype.shape == (): raise ValueError("fill does not support ctables objects") obj = bcolz.carray([], dtype=dtype, dflt=dflt, expectedlen=expectedlen, **kwargs) chunklen = obj.chunklen # Then fill it # We need an array for the default so as to keep the atom info dflt = np.array(obj.dflt, dtype=dtype) # Making strides=(0,) below is a trick to create the array fast and # without memory consumption chunk = np.ndarray(length, dtype=dtype, buffer=dflt, strides=(0,)) obj.append(chunk) obj.flush() return obj
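# Hedged usage sketch (not from the original sources): fill() as a compressed
# analogue of np.full(); the shape and dtype below are arbitrary.
import numpy as np
import bcolz

c = bcolz.fill((1000, 3), dflt=-1, dtype='i4')
print(c.shape, c.dtype, c[0])  # each row holds the default value -1
print(np.array_equal(c[:], np.full((1000, 3), -1, dtype='i4')))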
def test00(self):
    """Testing unicode types (creation)"""
    a = np.array([[u"aŀle", u"eñe"], [u"açò", u"áèâë"]], dtype="U4")
    b = bcolz.carray(a)
    # print "b.dtype-->", b.dtype
    # print "b->", `b`
    self.assertTrue(a.dtype == b.dtype.base)
    assert_array_equal(a, b[:], "Arrays are not equal")
def set_type(self, dtype):
    if self.dtype != dtype:
        self.fill_nan(self.nan_value(dtype=dtype))
        carray = bcolz.carray(self.carray, dtype=np.dtype(dtype).name)  # TODO do it chunk by chunk + check data
        ct = self._table._ctable
        col_pos = self.position
        ct.delcol(self._name)
        ct.addcol(carray, name=self._name, pos=col_pos)
def into(a, b, **kwargs):
    if isinstance(a, type):
        kwargs = keyfilter(carray_keywords.__contains__, kwargs)
        return carray(b, **kwargs)
    else:
        a.append(b)
        a.flush()
        return a
def test00(self):
    """Testing string types (creation)"""
    a = np.array([["ale", "ene"], ["aco", "ieie"]], dtype="S4")
    b = bcolz.carray(a)
    # print "b.dtype-->", b.dtype
    # print "b->", `b`
    self.assertTrue(a.dtype == b.dtype.base)
    assert_array_equal(a, b[:], "Arrays are not equal")
def __init__(self, val):
    super().__init__()

    if isinstance(val, bcolz.carray):
        self._carray = val
    elif isinstance(val, list) or isinstance(val, np.ndarray):
        self._carray = bcolz.carray(val, expectedlen=Column.DEFAULT_BLOCK_LEN)
    else:
        raise DazzleError("Invalid argument in ResultColumn.%s()" % method_name())
def _write_internal(self, filename, calendar, iterator): """ Internal implementation of write. `iterator` should be an iterator yielding pairs of (asset, ctable). """ total_rows = 0 first_row = {} last_row = {} calendar_offset = {} # Maps column name -> output carray. columns = {k: carray(array([], dtype=uint32)) for k in US_EQUITY_PRICING_BCOLZ_COLUMNS} for asset_id, table in iterator: nrows = len(table) for column_name in columns: if column_name == "id": # We know what the content of this column is, so don't # bother reading it. columns["id"].append(full((nrows,), asset_id, uint32)) continue columns[column_name].append(self.to_uint32(table[column_name][:], column_name)) # Bcolz doesn't support ints as keys in `attrs`, so convert # assets to strings for use as attr keys. asset_key = str(asset_id) # Calculate the index into the array of the first and last row # for this asset. This allows us to efficiently load single # assets when querying the data back out of the table. first_row[asset_key] = total_rows last_row[asset_key] = total_rows + nrows - 1 total_rows += nrows # Calculate the number of trading days between the first date # in the stored data and the first date of **this** asset. This # offset used for output alignment by the reader. # HACK: Index with a list so that we get back an array we can pass # to self.to_uint32. We could try to extract this in the loop # above, but that makes the logic a lot messier. asset_first_day = self.to_uint32(table["day"][[0]], "day")[0] calendar_offset[asset_key] = calendar.get_loc(Timestamp(asset_first_day, unit="s", tz="UTC")) # This writes the table to disk. full_table = ctable( columns=[columns[colname] for colname in US_EQUITY_PRICING_BCOLZ_COLUMNS], names=US_EQUITY_PRICING_BCOLZ_COLUMNS, rootdir=filename, mode="w", ) full_table.attrs["first_row"] = first_row full_table.attrs["last_row"] = last_row full_table.attrs["calendar_offset"] = calendar_offset full_table.attrs["calendar"] = calendar.asi8.tolist() return full_table
def _open_minute_file(self, field, sid):
    sid = int(sid)

    try:
        carray = self._carrays[field][sid]
    except KeyError:
        carray = self._carrays[field][sid] = bcolz.carray(
            rootdir=self._get_carray_path(sid, field),
            mode="r")

    return carray
def test01(self):
    """Testing unicode types (append)"""
    a = np.ones((300, 4), dtype="U4")
    b = bcolz.carray([], dtype="U4").reshape((0, 4))
    b.append(a)
    # print "b.dtype-->", b.dtype
    # print "b->", `b`
    self.assertTrue(a.dtype == b.dtype.base)
    assert_array_equal(a, b[:], "Arrays are not equal")
def test_carray_record_as_object(self):
    src_data = np.empty((10,), dtype=np.dtype('u1,O'))
    src_data[:] = [(i, 's' * i) for i in range(10)]
    carr = bcolz.carray(src_data, dtype=np.dtype('O'))
    self.assertEqual(len(carr.shape), 1)
    self.assertEqual(len(src_data), carr.shape[0])
    for i in range(len(carr)):
        self.assertEqual(carr[i][0], src_data[i][0])
        self.assertEqual(carr[i][1], src_data[i][1])
def test03c(self):
    """Testing `__getitem()__` method with several slices (III)"""
    a = np.arange(120 * 1000).reshape((5 * 1000, 4, 3, 2))
    b = bcolz.carray(a, rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    sl = (slice(None, None, 3), slice(1, 3, 2), slice(1, 4, 2))
    # print "b[sl]->", `b[sl]`
    self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")
def test05c(self):
    """Testing `__getitem()__` method with fancy indexing (III)"""
    a = np.arange(2000).reshape((50, 40))
    b = bcolz.carray(a, rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    sl = (slice(None), [0, 2])
    # print "b[sl]->", `b[sl]`
    self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")
def test_carray_1d_source(self):
    """Testing carray of objects, 1d source"""
    src_data = ['s' * i for i in range(10)]
    carr = bcolz.carray(src_data, dtype=np.dtype('O'))
    self.assertEqual(len(carr.shape), 1)
    self.assertEqual(len(src_data), carr.shape[0])
    for i in range(len(carr)):
        self.assertEqual(carr[i], src_data[i])
        self.assertEqual(carr[i], src_data[i])
def test04c(self):
    """Testing `__getitem()__` method with shape reduction (III)"""
    a = np.arange(6000).reshape((50, 40, 3))
    b = bcolz.carray(a, rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    sl = (1, slice(1, 4, 2), 2)
    # print "b[sl]->", `b[sl]`
    self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")
def test00(self):
    """Testing sum()."""
    a = np.arange(1e5).reshape(10, 1e4)
    sa = a.sum()
    ac = bcolz.carray(a)
    sac = ac.sum()
    # print "numpy sum-->", sa
    # print "carray sum-->", sac
    self.assert_(sa.dtype == sac.dtype, "sum() is not working correctly.")
    self.assert_(sa == sac, "sum() is not working correctly.")
def _open_write(self, data=None):
    if self._bcolz is None:
        try:
            # append
            self._bcolz = \
                bcolz.carray(None, rootdir=self._bcolz_dir(), mode='a',
                             # bcolz conf in case mode='a' semantics change to
                             # create, otherwise innocuous
                             chunklen=self.chunklen,
                             expectedlen=self.expectedlen,
                             cparams=self.cparams)
        except:
            # create
            self._bcolz = \
                bcolz.carray(data[0:0], rootdir=self._bcolz_dir(), mode='w',
                             chunklen=self.chunklen,
                             expectedlen=self.expectedlen,
                             cparams=self.cparams)
def read_meta_and_open(self):
    """Read the meta-information and initialize structures."""
    # Get the directories of the columns
    rootsfile = os.path.join(self.rootdir, ROOTDIRS)
    with open(rootsfile, "rb") as rfile:
        data = json.loads(rfile.read())
    # JSON returns unicode (?)
    self.names = [str(name) for name in data["names"]]
    # Initialize the cols by instantiating the carrays
    for name, dir_ in data["dirs"].items():
        self._cols[str(name)] = bcolz.carray(rootdir=dir_, mode=self.mode)
def cache_factor(self, col_list, refresh=False):
    """
    Existing todos here are: these should be hidden helper carrays
    As in: not normal columns that you would normally see as a user

    The factor (label index) carray is as long as the original carray
    (and the rest of the table therefore)
    But the (unique) values carray is not as long (as long as the number
    of unique values)

    :param col_list:
    :param refresh:
    :return:
    """
    if not self.rootdir:
        raise TypeError('Only out-of-core ctables can have '
                        'factorization caching at the moment')

    if not isinstance(col_list, list):
        col_list = [col_list]

    for col in col_list:
        # create cache if needed
        if refresh or not self.cache_valid(col):
            col_rootdir = self[col].rootdir
            col_factor_rootdir = col_rootdir + '.factor'
            col_values_rootdir = col_rootdir + '.values'

            carray_factor = \
                bcolz.carray([], dtype='int64', expectedlen=self.size,
                             rootdir=col_factor_rootdir, mode='w')
            _, values = \
                ctable_ext.factorize(self[col], labels=carray_factor)
            carray_factor.flush()

            carray_values = \
                bcolz.carray(values.values(), dtype=self[col].dtype,
                             rootdir=col_values_rootdir, mode='w')
            carray_values.flush()
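# Hedged usage sketch (not part of the original sources): assuming `ct` is an
# on-disk instance of the ctable subclass that defines cache_factor() and
# unique(), the factorization cache is built once and reused afterwards.
# The rootdir and column names below are illustrative.
#
# ct = ctable(rootdir='sales.bcolz')       # out-of-core table
# ct.cache_factor(['country', 'product'])  # writes <col>.factor / <col>.values
# print(ct.unique('country'))              # now served from the cached values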
from time import time  # needed for the timings below

import numpy as np

import bcolz

N = 1e8       # the number of elements in x
clevel = 5    # the compression level
sexpr = "(x-1) < 10."  # the expression to compute
# sexpr = "((x-1) % 1000) == 0."  # the expression to compute
# sexpr = "(2*x**3+.3*y**2+z+1)<0"  # the expression to compute

cparams = bcolz.cparams(clevel)

print("Creating inputs...")

x = np.arange(N)
cx = bcolz.carray(x, cparams=cparams)
if 'y' not in sexpr:
    ct = bcolz.ctable((cx, ), names=['x'])
else:
    y = np.arange(N)
    z = np.arange(N)
    cy = bcolz.carray(y, cparams=cparams)
    cz = bcolz.carray(z, cparams=cparams)
    ct = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z'])

print("Evaluating...", sexpr)
t0 = time()
cbout = ct.eval(sexpr)
print("Time for evaluation--> %.3f" % (time() - t0, ))

print("Converting to numpy arrays")
bout = cbout[:]
# then get the coordinates
sx = 3 * i
ex = 3 * (i + 1)
xi = np.array(xyz[sx:ex])
xyzi = np.stack([c for c in xi], axis=1) / 100  # have to scale by 100 to match PDB

# lastly convert the mask to indices
msk_idx = np.where(np.array(list(masks[i])) == '+')[0]  # bracket id or get "setting an array element with a sequence"
zt = np.array([[id], seq, pssmi, xyzi, msk_idx])

if i == 0:
    bc = bcolz.carray([zt], rootdir=data_path + 'testing.bc', mode='w',
                      expectedlen=len(ids))
    bc.flush()
else:
    # reopen in append mode so previously written records are kept
    bc = bcolz.carray(rootdir=data_path + 'testing.bc', mode='a')
    bc.append([zt])
    bc.flush()

# %%
from pathlib import Path

home = str(Path.home())
pn_path = home + '/Downloads/casp7/casp7/testing'
# pn_path = os.curdir + '/../rgn_pytorch/data/text_sample'
dataset = ProteinNetDataset(pn_path)
trn_data = DataLoader(dataset, batch_size=32, shuffle=True)
def save_array(data_folder, fname, arr):
    fname = os.path.join(data_folder, fname)
    print("Saving to {0} ...".format(fname))
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
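# Hedged companion sketch (not in the original sources): a loader matching the
# save_array() above, returning a plain numpy array from the bcolz rootdir.
def load_array(data_folder, fname):
    fname = os.path.join(data_folder, fname)
    return bcolz.open(rootdir=fname, mode='r')[:]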
def load_dataset_face(args, INPUT_SIZE=[112, 112], RGB_MEAN=[0.5, 0.5, 0.5], RGB_STD=[0.5, 0.5, 0.5], val_datasets=[ 'lfw', 'cfp_ff', 'cfp_fp', 'agedb_30', 'calfw', 'cplfw', 'vgg2_fp' ]): train_transform = transforms.Compose([ transforms.Resize( [int(128 * INPUT_SIZE[0] / 112), int(128 * INPUT_SIZE[0] / 112)]), # smaller side resized transforms.RandomCrop([INPUT_SIZE[0], INPUT_SIZE[1]]), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize(mean=RGB_MEAN, std=RGB_STD), ]) train_data = dset.ImageFolder( os.path.join(args.data_path, 'CASIA-maxpy-align'), train_transform) weights = torch.DoubleTensor( make_weights_for_balanced_classes(train_data.imgs, len(train_data.classes))) if args.distributed: from catalyst.data.sampler import DistributedSamplerWrapper train_sampler = DistributedSamplerWrapper( torch.utils.data.sampler.WeightedRandomSampler( weights, len(weights))) else: train_sampler = torch.utils.data.sampler.WeightedRandomSampler( weights, len(weights)) train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler) val_transform = transforms.Compose([ transforms.ToPILImage(), transforms.Resize( [int(128 * INPUT_SIZE[0] / 112), int(128 * INPUT_SIZE[0] / 112)]), transforms.CenterCrop([INPUT_SIZE[0], INPUT_SIZE[1]]), transforms.ToTensor(), transforms.Normalize(mean=RGB_MEAN, std=RGB_STD) ]) val_loaders = [] for name in val_datasets: carray = bcolz.carray(rootdir=os.path.join(args.data_path, name), mode='r') val_data_tensor = torch.tensor(carray[:, [2, 1, 0], :, :]) * 0.5 + 0.5 val_data = TensorsDataset(val_data_tensor, val_transform) val_loader = torch.utils.data.DataLoader(val_data, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=None) issame = np.load('{}/{}_list.npy'.format(args.data_path, name)) val_loaders.append((name, val_loader, issame)) return train_loader, val_loaders
def fill(shape, dflt=None, dtype=np.float, **kwargs): """fill(shape, dtype=float, dflt=None, **kwargs) Return a new carray or ctable object of given shape and type, filled with `dflt`. Parameters ---------- shape : int Shape of the new array, e.g., ``(2,3)``. dflt : Python or NumPy scalar The value to be used during the filling process. If None, values are filled with zeros. Also, the resulting carray will have this value as its `dflt` value. dtype : data-type, optional The desired data-type for the array, e.g., `numpy.int8`. Default is `numpy.float64`. kwargs : list of parameters or dictionary Any parameter supported by the carray constructor. Returns ------- out : carray or ctable Bcolz object filled with `dflt` values with the given shape and dtype. See Also -------- ones, zeros """ def fill_helper(obj, dtype=None, length=None): """Helper function to fill a carray with default values""" assert isinstance(obj, bcolz.carray) assert dtype is not None assert length is not None if type(length) is float: length = int(length) # Then fill it # We need an array for the default so as to keep the atom info dflt = np.array(obj.dflt, dtype=dtype.base) # Fill chunk with defaults chunk = np.empty(length, dtype=dtype) chunk[:] = dflt obj.append(chunk) obj.flush() dtype = np.dtype(dtype) if type(shape) in _inttypes + (float, ): shape = (int(shape), ) else: shape = tuple(shape) if len(shape) > 1: # Multidimensional shape. # The atom will have shape[1:] dims (+ the dtype dims). dtype = np.dtype((dtype.base, shape[1:] + dtype.shape)) length = shape[0] # Create the container expectedlen = kwargs.pop("expectedlen", length) if dtype.kind == "V" and dtype.shape == (): list_ca = [] # force carrays to live in memory base_rootdir = kwargs.pop('rootdir', None) for name, col_dype in dtype.descr: dflt = np.zeros((), dtype=col_dype) ca = bcolz.carray([], dtype=col_dype, dflt=dflt, expectedlen=expectedlen, **kwargs) fill_helper(ca, dtype=ca.dtype, length=length) list_ca.append(ca) # bring rootdir back, ctable should live either on-disk or in-memory kwargs['rootdir'] = base_rootdir obj = bcolz.ctable(list_ca, names=dtype.names, **kwargs) else: obj = bcolz.carray([], dtype=dtype, dflt=dflt, expectedlen=expectedlen, **kwargs) fill_helper(obj, dtype=dtype, length=length) return obj
for ii in img_range:
    print('%d / %d' % (ii, img_range.shape[0]))
    llh_rpy = pva_interp(img_times[ii])
    lon_lat_h = llh_rpy[0:3]
    c_n_v = nu.rpy_to_cnb(*llh_rpy[3:])

    feat_df, desc = aie.extract_features(images[ii], lon_lat_h, c_n_v)
    center_wgs = tf.project_center(lon_lat_h, c_n_v).flatten()

    df_path = 'feat/df/feat_%d.hdf' % ii
    desc_path = 'feat/desc/desc_%d' % ii

    if feat_df is None:
        feat_meta.loc[ii] = [0, center_wgs[0], center_wgs[1], df_path, desc_path]
    else:
        print("%d :: %d Feat" % (ii, desc.shape[0]))
        feat_meta.loc[ii] = [desc.shape[0], center_wgs[0], center_wgs[1],
                             df_path, desc_path]
        feat_df.to_hdf(os.path.join(out_path, df_path), 'feat_df', mode='w',
                       format='table', complib='zlib', complevel=7)
        bcolz.carray(desc.astype(np.float32),
                     rootdir=os.path.join(out_path, desc_path),
                     mode='w').flush()

feat_meta.to_hdf(os.path.join(out_path, 'feat_meta.hdf'), key='feat_meta')
flight.close()
'''
import sys

import bcolz
import pickle
import numpy as np

glove_path = sys.argv[1]  # 'C:\\Users\\Jakob\\Downloads\\glove.840B.300d'
size = int(sys.argv[2])   # 840
dim = int(sys.argv[3])    # 300

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir=f'{glove_path}/{size}B.{dim}.dat', mode='w')

with open(f'{glove_path}/glove.{size}B.{dim}d.txt', 'rb') as f:
    for l in f:
        line = l.split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)

vectors = bcolz.carray(vectors[1:].reshape((-1, dim)),
                       rootdir=f'{glove_path}/{size}B.{dim}.dat', mode='w')
from time import time

import numpy as np

import bcolz

N = 1e8

# a = np.arange(N, dtype='f8')
a = np.random.randint(0, 10, N).astype('bool')

t0 = time()
sa = a.sum()
print("Time sum() numpy --> %.3f" % (time() - t0))

t0 = time()
ac = bcolz.carray(a, cparams=bcolz.cparams(9))
print("Time carray conv --> %.3f" % (time() - t0))
print("ac-->", repr(ac))

t0 = time()
sac = ac.sum()
# sac = ac.sum(dtype=np.dtype('i8'))
print("Time sum() carray --> %.3f" % (time() - t0))

# t0 = time()
# sac = sum(i for i in ac)
# print "Time sum() carray (iter) --> %.3f" % (time()-t0)

print("sa, sac-->", sa, sac, type(sa), type(sac))
assert (sa == sac)
def _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor, **kwargs): """Perform the evaluation in blocks.""" # Compute the optimal block size (in elements) # The next is based on experiments with bench/ctable-query.py # and the 'movielens-bench' repository if vm == "numexpr": bsize = 2**24 else: bsize = 2**22 bsize //= typesize # Evaluation seems more efficient if block size is a power of 2 bsize = 2 ** (int(math.log(bsize, 2))) if vlen < 100*1000: bsize //= 8 elif vlen < 1000*1000: bsize //= 4 elif vlen < 10*1000*1000: bsize //= 2 # Protection against too large atomsizes if bsize == 0: bsize = 1 vars_ = {} # Get temporaries for vars maxndims = 0 for name in vars: var = vars[name] if hasattr(var, "__len__"): ndims = len(var.shape) + len(var.dtype.shape) if ndims > maxndims: maxndims = ndims if len(var) > bsize and hasattr(var, "_getrange"): vars_[name] = np.empty(bsize, dtype=var.dtype) for i in xrange(0, vlen, bsize): # Get buffers for vars for name in vars: var = vars[name] if hasattr(var, "__len__") and len(var) > bsize: if hasattr(var, "_getrange"): if i+bsize < vlen: var._getrange(i, bsize, vars_[name]) else: vars_[name] = var[i:] else: vars_[name] = var[i:i+bsize] else: if hasattr(var, "__getitem__"): vars_[name] = var[:] else: vars_[name] = var # Perform the evaluation for this block if vm == "python": res_block = _eval(expression, vars_) else: try: res_block = bcolz.numexpr.evaluate(expression, local_dict=vars_) except ValueError: # numexpr cannot handle this. Fall back to a pure "python" VM. return _eval_blocks( expression, vars, vlen, typesize, "python", out_flavor, **kwargs) if i == 0: # Detection of reduction operations scalar = False dim_reduction = False if len(res_block.shape) == 0: scalar = True result = res_block continue elif len(res_block.shape) < maxndims: dim_reduction = True result = res_block continue # Get a decent default for expectedlen if out_flavor == "carray": nrows = kwargs.pop('expectedlen', vlen) result = bcolz.carray(res_block, expectedlen=nrows, **kwargs) else: out_shape = list(res_block.shape) out_shape[0] = vlen result = np.empty(out_shape, dtype=res_block.dtype) result[:bsize] = res_block else: if scalar or dim_reduction: result += res_block elif out_flavor == "carray": result.append(res_block) else: result[i:i+bsize] = res_block if isinstance(result, bcolz.carray): result.flush() if scalar: return result[()] return result
def test_load_array(tempdir):
    rootdir = tempdir.path
    bcolz.carray(np.arange(0, 5), mode='w', rootdir=rootdir)
    array = core.load_array(rootdir)
    np.testing.assert_equal(array, [0, 1, 2, 3, 4])
def get_val_pair(path, name):
    rootdir = os.path.join(path, name)
    carray = bcolz.carray(rootdir=rootdir, mode='r')
    np_path = os.path.join(path, '{}_list.npy'.format(name))
    issame = np.load(np_path)
    return carray, issame
This script processes and generates GloVe embeddings
'''
# coding: utf-8
import pickle

from preprocess import Vocabulary
import numpy as np
import json
from scipy import misc
import bcolz

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir='glove.6B/6B.300.dat', mode='w')

with open('glove.6B/glove.6B.300d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)

vectors = bcolz.carray(vectors[1:].reshape((400000, 300)),
                       rootdir='glove.6B/6B.300.dat', mode='w')
vectors.flush()
def fromiter(iterable, dtype, count, **kwargs): """ fromiter(iterable, dtype, count, **kwargs) Create a carray/ctable from an `iterable` object. Parameters ---------- iterable : iterable object An iterable object providing data for the carray. dtype : numpy.dtype instance Specifies the type of the outcome object. count : int The number of items to read from iterable. If set to -1, means that the iterable will be used until exhaustion (not recommended, see note below). kwargs : list of parameters or dictionary Any parameter supported by the carray/ctable constructors. Returns ------- out : a carray/ctable object Notes ----- Please specify `count` to both improve performance and to save memory. It allows `fromiter` to avoid looping the iterable twice (which is slooow). It avoids memory leaks to happen too (which can be important for large iterables). """ # Check for a true iterable if not hasattr(iterable, "next"): iterable = iter(iterable) # Try to guess the final length expected = count if count == -1: # Try to guess the size of the iterable length if hasattr(iterable, "__length_hint__"): count = iterable.__length_hint__() expected = count # First, create the container expectedlen = kwargs.pop("expectedlen", expected) dtype = np.dtype(dtype) if dtype.kind == "V": # A ctable obj = bcolz.ctable(np.array([], dtype=dtype), expectedlen=expectedlen, **kwargs) chunklen = sum(obj.cols[name].chunklen for name in obj.names) // len(obj.names) else: # A carray obj = bcolz.carray(np.array([], dtype=dtype), expectedlen=expectedlen, **kwargs) chunklen = obj.chunklen # Then fill it while True: chunk = np.fromiter(it.islice(iterable, chunklen), dtype=dtype) if len(chunk) == 0: # Iterable has been exhausted break obj.append(chunk) obj.flush() return obj
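# Hedged usage sketch (not from the original sources): fromiter() with an
# explicit count, plus a record dtype which takes the ctable branch.
import bcolz

ca = bcolz.fromiter((i * i for i in range(1000)), dtype='i8', count=1000)
print(len(ca), ca[:5])

ct = bcolz.fromiter(((i, i / 2.0) for i in range(1000)), dtype='i4,f8', count=1000)
print(ct.names, len(ct))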
log = logging.getLogger(__name__)
log.debug("fine tune all layers")
log.debug("using all_model_weight_path :" + model_path)
log.debug("using test_result :" + test_result)
log.debug("using loss_history_csv_name :" + loss_history_csv_name)

train_name = basedir + '/pp_train_data'
valid_name = basedir + '/pp_valid_data'
test_name = basedir + '/pp_test_data'

temp_dir = "/tmp/"

## load original bcolz data from disk
# read from disk and check size
valid_data = bcolz.carray(rootdir=valid_name + '_data.bclz', mode='r')
test_data = bcolz.carray(rootdir=test_name + '_data.bclz', mode='r')
train_data = bcolz.carray(rootdir=train_name + '_data.bclz', mode='r')

valid_labels = bcolz.carray(rootdir=valid_name + '_labels.bclz', mode='r')
test_labels = bcolz.carray(rootdir=test_name + '_labels.bclz', mode='r')
train_labels = bcolz.carray(rootdir=train_name + '_labels.bclz', mode='r')

log.debug("loading original data from disk")
log.debug(valid_data.shape)
log.debug(test_data.shape)
log.debug(train_data.shape)
log.debug(valid_labels.shape)
log.debug(test_labels.shape)
log.debug(train_labels.shape)
def arange(start=None, stop=None, step=None, dtype=None, **kwargs): """ arange([start,] stop[, step,], dtype=None, **kwargs) Return evenly spaced values within a given interval. Values are generated within the half-open interval ``[start, stop)`` (in other words, the interval including `start` but excluding `stop`). For integer arguments the function is equivalent to the Python built-in `range <http://docs.python.org/lib/built-in-funcs.html>`_ function, but returns a carray rather than a list. Parameters ---------- start : number, optional Start of interval. The interval includes this value. The default start value is 0. stop : number End of interval. The interval does not include this value. step : number, optional Spacing between values. For any output `out`, this is the distance between two adjacent values, ``out[i+1] - out[i]``. The default step size is 1. If `step` is specified, `start` must also be given. dtype : dtype The type of the output array. If `dtype` is not given, infer the data type from the other input arguments. kwargs : list of parameters or dictionary Any parameter supported by the carray constructor. Returns ------- out : carray Bcolz object made of evenly spaced values. For floating point arguments, the length of the result is ``ceil((stop - start)/step)``. Because of floating point overflow, this rule may result in the last element of `out` being greater than `stop`. """ # Check start, stop, step values if (start, stop) == (None, None): raise ValueError("You must pass a `stop` value at least.") elif stop is None: start, stop = 0, start elif start is None: start, stop = 0, stop if step is None: step = 1 # Guess the dtype if dtype is None: if type(stop) in _inttypes: dtype = np.dtype(np.int_) dtype = np.dtype(dtype) stop = int(stop) # Create the container expectedlen = kwargs.pop("expectedlen", stop) if dtype.kind == "V": raise ValueError("arange does not support ctables yet.") else: obj = bcolz.carray(np.array([], dtype=dtype), expectedlen=expectedlen, **kwargs) chunklen = obj.chunklen # Then fill it incr = chunklen * step # the increment for each chunk incr += step - (incr % step) # make it match step boundary bstart, bstop = start, start + incr while bstart < stop: if bstop > stop: bstop = stop chunk = np.arange(bstart, bstop, step, dtype=dtype) obj.append(chunk) bstart = bstop bstop += incr obj.flush() return obj
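# Hedged usage sketch (not from the original sources): bcolz.arange() mirrors
# np.arange() but yields a carray (optionally disk-backed via rootdir).
import numpy as np
import bcolz

ca = bcolz.arange(0, 10000, 2, dtype='i8')
print(len(ca), ca[:5])
print(np.array_equal(ca[:], np.arange(0, 10000, 2, dtype='i8')))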
def test_generator(): """Simple function to test return behavior of generator code above. This runs with and without merged model version. df_train: object_id imgpath target orig label 7 1518 /tmp/path/to/imgs/518/01/dog_1518.jpg 1 data/train/dogs/dog.1518.jpg dog 1113 1662 /tmp/path/to/imgs/662/01/cat_1662.jpg 0 data/train/cats/cat.1662.jpg cat 980 1409 /tmp/path/to/imgs/409/01/dog_1409.jpg 1 data/train/dogs/dog.1409.jpg dog 1615 1813 /tmp/path/to/imgs/813/01/cat_1813.jpg 0 data/train/cats/cat.1813.jpg cat 1029 1760 /tmp/path/to/imgs/760/01/cat_1760.jpg 0 data/train/cats/cat.1760.jpg cat df_valid: object_id imgpath target orig label 787 7747 /tmp/path/to/imgs/747/07/cat_7747.jpg 0 data/validation/cats/cat.7747.jpg cat 165 7563 /tmp/path/to/imgs/563/07/dog_7563.jpg 1 data/validation/dogs/dog.7563.jpg dog 749 7517 /tmp/path/to/imgs/517/07/cat_7517.jpg 0 data/validation/cats/cat.7517.jpg cat 458 7742 /tmp/path/to/imgs/742/07/cat_7742.jpg 0 data/validation/cats/cat.7742.jpg cat 225 7479 /tmp/path/to/imgs/479/07/dog_7479.jpg 1 data/validation/dogs/dog.7479.jpg dog """ pd.np.set_printoptions(linewidth=150) df_train, df_valid = get_demo_data() img_width, img_height = 150, 150 batch_size = 64 target_size = (img_width, img_height) print("\nTest basic generator.\n") for df in (df_train, df_valid): i = 0 for X, Y in generator_from_df(df, batch_size, target_size, features=None): print(X[:3, :3, 0]) print(Y[:3]) i += 1 if i > 1: break # Create random array for bcolz test. # # In the end, this test does not use bcolz. # But, if it did, here are some hints to get you there. print("\nTest merged generator.\n") nfeatures = 74 # features_train = pd.np.random.randn(df_train.shape[0], nfeatures) # features_valid = pd.np.random.randn(df_valid.shape[0], nfeatures) # Make a 2D array, where each row is filled with the values of its # index, which will be very convenient for testing the merged # model generator. # [[0, 0, 0, ...], # [1, 1, 1, ...], # [2, 2, 2, ...], # ... # ] features_train = np.repeat(np.arange(df_train.shape[0], dtype=float).reshape((-1, 1)), nfeatures, axis=1) features_valid = np.repeat(np.arange(df_valid.shape[0], dtype=float).reshape((-1, 1)), nfeatures, axis=1) # Add a litle noise in [0, 1] just to pretend we have "real" data. features_train += np.random.rand(*features_train.shape) features_valid += np.random.rand(*features_valid.shape) fname_train = "mm_features_train_bc" if not os.path.exists(fname_train): c = bcolz.carray(features_train, rootdir=fname_train, mode='w') c.flush() fname_valid = "mm_features_valid_bc" if not os.path.exists(fname_valid): c = bcolz.carray(features_valid, rootdir=fname_valid, mode='w') c.flush() # Big assumption here: each row of a features matrix corresponds # exactly with the image represented by the row of the associated # train or valid df. *YOU* will have to ensure this in your own # code. This is only demo code! for df, fname in ((df_train, fname_train), (df_valid, fname_valid)): nbatches = df.shape[0] / float(batch_size) for i, ((X, features), Y) in enumerate( generator_from_df(df, batch_size, target_size, features=fname, debug_merged=True)): if i == 0: print(X[:3, :3, 0]) print(features[:3, :5]) print(Y[:3]) else: if (i + 1) % 20 == 0: print("%d / %d" % (i + i, nbatches), end=', ') sys.stdout.flush() # Keras automatically breaks out of the infinite "while 1" # loop in the generator_from_df(). For this test, we need # to break manually. if i >= nbatches: break print("\nSuccessful (I think...) 
test of multithreaded read of bcolz!") print("Note that for this test, all of the above X2 rows should"\ "have the same int() values within a row.")
import bcolz
import numpy as np
import pickle
import pandas as pd
import json

'''
This file creates the matrix that is needed to convert the words in the
data set to word embedding vectors. The word embedder used is GloVe.
'''

glove_path = "glove_6B"

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir=f'{glove_path}/6B.50.dat', mode='w')

with open(f'{glove_path}/glove.6B.50d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)

vectors = bcolz.carray(vectors[1:].reshape((400001, 50)),
                       rootdir=f'{glove_path}/6B.50.dat', mode='w')
vectors.flush()

pickle.dump(words, open(f'{glove_path}/6B.50_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'{glove_path}/6B.50_idx.pkl', 'wb'))
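# Hedged follow-up sketch (not part of the original script): rebuilding a
# word -> vector lookup from the artifacts written above; paths reuse the
# script's own glove_path.
vectors = bcolz.carray(rootdir=f'{glove_path}/6B.50.dat', mode='r')[:]
words = pickle.load(open(f'{glove_path}/6B.50_words.pkl', 'rb'))
word2idx = pickle.load(open(f'{glove_path}/6B.50_idx.pkl', 'rb'))
glove = {w: vectors[word2idx[w]] for w in words}
print(len(glove))  # one 50-d vector per vocabulary word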
def get_val_pair(path, name):
    carray = bcolz.carray(rootdir=os.path.join(path, name), mode='r')
    issame = np.load('{}/{}_list.npy'.format(path, name))
    return carray, issame
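# Hedged usage sketch (not from the original sources): the validation-set name
# and data path below are illustrative placeholders.
# carray, issame = get_val_pair('/path/to/eval_dbs', 'lfw')
# print(carray.shape, issame.shape)  # stacked image pairs vs. same/different labels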
def save_array(fname, arr):
    " save np matrix or array"
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
def _write_internal(self, iterator, assets): """ Internal implementation of write. `iterator` should be an iterator yielding pairs of (asset, ctable). """ total_rows = 0 first_row = {} last_row = {} calendar_offset = {} # Maps column name -> output carray. columns = { k: carray(array([], dtype=uint32)) for k in US_EQUITY_PRICING_BCOLZ_COLUMNS } earliest_date = None sessions = self._calendar.sessions_in_range(self._start_session, self._end_session) if assets is not None: @apply def iterator(iterator=iterator, assets=set(assets)): for asset_id, table in iterator: if asset_id not in assets: raise ValueError('unknown asset id %r' % asset_id) yield asset_id, table count = 0 for asset_id, table in iterator: nrows = len(table) for column_name in columns: if column_name == 'id': # We know what the content of this column is, so don't # bother reading it. columns['id'].append( full((nrows, ), asset_id, dtype='uint32'), ) continue columns[column_name].append(table[column_name]) if earliest_date is None: earliest_date = table["day"][0] else: earliest_date = min(earliest_date, table["day"][0]) # Bcolz doesn't support ints as keys in `attrs`, so convert # assets to strings for use as attr keys. asset_key = str(asset_id) # Calculate the index into the array of the first and last row # for this asset. This allows us to efficiently load single # assets when querying the data back out of the table. first_row[asset_key] = total_rows last_row[asset_key] = total_rows + nrows - 1 total_rows += nrows table_day_to_session = compose( self._calendar.minute_to_session_label, partial(Timestamp, unit='s', tz='UTC'), ) asset_first_day = table_day_to_session(table['day'][0]) asset_last_day = table_day_to_session(table['day'][-1]) asset_sessions = sessions[sessions.slice_indexer( asset_first_day, asset_last_day)] assert len(table) == len(asset_sessions), ( 'Got {} rows for daily bars table with first day={}, last ' 'day={}, expected {} rows.\n' 'Missing sessions: {}\n' 'Extra sessions: {}'.format( len(table), asset_first_day.date(), asset_last_day.date(), len(asset_sessions), asset_sessions.difference( to_datetime( np.array(table['day']), unit='s', utc=True, )).tolist(), to_datetime( np.array(table['day']), unit='s', utc=True, ).difference(asset_sessions).tolist(), )) # Calculate the number of trading days between the first date # in the stored data and the first date of **this** asset. This # offset used for output alignment by the reader. calendar_offset[asset_key] = sessions.get_loc(asset_first_day) count = count + 1 if count == 0: return # This writes the table to disk. full_table = ctable( columns=[ columns[colname] for colname in US_EQUITY_PRICING_BCOLZ_COLUMNS ], names=US_EQUITY_PRICING_BCOLZ_COLUMNS, rootdir=self._filename, mode='w', ) full_table.attrs['first_trading_day'] = (earliest_date if earliest_date is not None else iNaT) full_table.attrs['first_row'] = first_row full_table.attrs['last_row'] = last_row full_table.attrs['calendar_offset'] = calendar_offset full_table.attrs['calendar_name'] = self._calendar.name full_table.attrs['start_session_ns'] = self._start_session.value full_table.attrs['end_session_ns'] = self._end_session.value full_table.flush() return full_table
def _write_internal(self, iterator, assets): """ Internal implementation of write. `iterator` should be an iterator yielding pairs of (asset, ctable). """ total_rows = 0 first_row = {} last_row = {} calendar_offset = {} # Maps column name -> output carray adding int64 for the greeks columns = { k: carray( array( [], dtype=(uint32_dtype if k not in GREEKS else int64_dtype))) for k in OPTION_PRICING_BCOLZ_COLUMNS } earliest_date = None sessions = self._calendar.sessions_in_range(self._start_session, self._end_session) if assets is not None: @apply def iterator(iterator=iterator, assets=set(assets)): for asset_id, table in iterator: if asset_id not in assets: raise ValueError("unknown asset id %r" % asset_id) yield asset_id, table for asset_id, table in iterator: logger.info(f"Writing asset id {asset_id} to disk") nrows = len(table) for column_name in columns: if column_name == "id": # We know what the content of this column is, so don't # bother reading it. columns["id"].append( full((nrows, ), asset_id, dtype="uint32")) continue columns[column_name].append(table[column_name]) if earliest_date is None: earliest_date = table["day"][0] else: earliest_date = min(earliest_date, table["day"][0]) # Bcolz doesn't support ints as keys in `attrs`, so convert # assets to strings for use as attr keys. asset_key = str(asset_id) # Calculate the index into the array of the first and last row # for this asset. This allows us to efficiently load single # assets when querying the data back out of the table. first_row[asset_key] = total_rows last_row[asset_key] = total_rows + nrows - 1 total_rows += nrows table_day_to_session = compose( self._calendar.minute_to_session_label, partial(Timestamp, unit="s", tz="UTC"), ) asset_first_day = table_day_to_session(table["day"][0]) asset_last_day = table_day_to_session(table["day"][-1]) asset_sessions = sessions[sessions.slice_indexer( asset_first_day, asset_last_day)] assert len(table) == len(asset_sessions), ( "Got {} rows for daily bars table with first day={}, last " "day={}, expected {} rows.\n" "Missing sessions: {}\n" "Extra sessions: {}".format( len(table), asset_first_day.date(), asset_last_day.date(), len(asset_sessions), asset_sessions.difference( to_datetime(np.array(table["day"]), unit="s", utc=True)).tolist(), to_datetime(np.array(table["day"]), unit="s", utc=True).difference(asset_sessions).tolist(), )) # Calculate the number of trading days between the first date # in the stored data and the first date of **this** asset. This # offset used for output alignment by the reader. calendar_offset[asset_key] = sessions.get_loc(asset_first_day) logger.info("Writing complete table to disk") # This writes the table to disk. full_table = ctable( columns=[ columns[colname] for colname in OPTION_PRICING_BCOLZ_COLUMNS ], names=OPTION_PRICING_BCOLZ_COLUMNS, rootdir=self._filename, mode="w", ) full_table.attrs["first_trading_day"] = (earliest_date if earliest_date is not None else iNaT) full_table.attrs["first_row"] = first_row full_table.attrs["last_row"] = last_row full_table.attrs["calendar_offset"] = calendar_offset full_table.attrs["calendar_name"] = self._calendar.name full_table.attrs["start_session_ns"] = self._start_session.value full_table.attrs["end_session_ns"] = self._end_session.value full_table.flush() return full_table
def save_array(f, arr):
    try_mkdir(os.path.dirname(f))
    c = bcolz.carray(arr, rootdir=f, mode='w')
    c.flush()
def get_val_pair(path, name):
    carray = bcolz.carray(rootdir=path / name, mode="r")
    issame = np.load(path / "{}_list.npy".format(name))
    return carray, issame
def save_array(fname, arr):
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
def convert_numpy_to_bcolz_carray(x, **kwargs):
    return carray(x, **keyfilter(keywords.__contains__, kwargs))
def save_array(fname, arr):
    bcolz.carray(arr, rootdir=fname, mode='w')
import bcolz
import os
import torch
import numpy as np
from PIL import Image

path = "/ssd-data/lmd/eval_dbs"
names = ["agedb_30"]

for name in names:
    carray = bcolz.carray(rootdir=os.path.join(path, name), mode="r")
    print(carray.shape)
    print(carray[-1].transpose(1, 2, 0).shape)
    print((carray[-1].transpose(1, 2, 0))[55:65, 55:65])
    img = Image.fromarray((carray[-1].transpose(1, 2, 0).astype(np.float32) * 255).astype(np.uint8))
    img.save("/data2/lmd_jdq/cfp-fp/%d.jpg" % 0)
    for i in range(1, 20):
        print(np.sum(carray[-i] - carray[-1]))
#Data loader
import cython
import numpy as np
import bcolz
import pickle
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.autograd import Variable

#Dataloader
words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1),
                       rootdir=f'/Users/nilslager/Desktop/gitit.50.dat',
                       mode='w')

#Open up GloVe embeddings and create vectors
with open(f'/Users/nilslager/Desktop/wv_50d_gitit.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)

#Construct pickle files
vectors = bcolz.carray(vectors[1:].reshape((400000, 50)),
                       rootdir=f'/Users/nilslager/Desktop/gitit.50.dat',
                       mode='w')
vectors.flush()
def save_array(data, fname):
    print("Saving image dataset at the location " + str(fname) + ".")
    c = bcolz.carray(data, rootdir=fname, mode='w')
    c.flush()
def save_array(fname, arr):
    c = bz.carray(arr, rootdir=fname, mode='w')
    c.flush()
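# Hedged companion sketch (not in the original sources): the usual counterpart
# to save_array(); assumes `bz` is the bcolz module (e.g. `import bcolz as bz`).
def load_array(fname):
    return bz.open(rootdir=fname, mode='r')[:]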