Example #1
 def __setstate__(self, d):
     self.rootdir = d['rootdir']
     self.columns = d['columns']
     self.blocks = dict((col, bcolz.carray(rootdir=os.path.join(self.rootdir, '%s.bcolz' % col)))
             for col in self.columns)
     self.index = bcolz.carray(rootdir=os.path.join(self.rootdir, 'index.bcolz'))
     self._explicitly_given_path = True
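A minimal matching __getstate__ sketch (hypothetical; it only records what the __setstate__ above reads back):
 def __getstate__(self):
     # the carrays themselves live on disk, so pickling the paths suffices
     return {'rootdir': self.rootdir, 'columns': self.columns}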
Example #2
    def factorize_groupby_cols(self, groupby_cols):
        """

        :type self: ctable
        """
        # first check if the factorized arrays already exist
        # unless we need to refresh the cache
        factor_list = []
        values_list = []

        # factorize the groupby columns
        for col in groupby_cols:

            if self.cache_valid(col):
                col_rootdir = self[col].rootdir
                col_factor_rootdir = col_rootdir + '.factor'
                col_values_rootdir = col_rootdir + '.values'
                col_factor_carray = \
                    bcolz.carray(rootdir=col_factor_rootdir, mode='r')
                col_values_carray = \
                    bcolz.carray(rootdir=col_values_rootdir, mode='r')
            else:
                col_factor_carray, values = ctable_ext.factorize(self[col])
                col_values_carray = \
                    bcolz.carray(values.values(), dtype=self[col].dtype)

            factor_list.append(col_factor_carray)
            values_list.append(col_values_carray)

        return factor_list, values_list
Example #3
def resource_bcolz(uri, dshape=None, expected_dshape=None, **kwargs):
    if os.path.exists(uri):
        try:
            return ctable(rootdir=uri)
        except IOError:  # __rootdirs__ doesn't exist because we aren't a ctable
            return carray(rootdir=uri)
    else:
        if not dshape:
            raise ValueError("Must specify either existing bcolz directory or"
                             " valid datashape")
        dshape = datashape.dshape(dshape)

        dt = datashape.to_numpy_dtype(dshape)
        shape_tail = tuple(map(int, dshape.shape[1:]))  # tail of shape
        if dshape.shape[0] == datashape.var:
            shape = (0,) + shape_tail
        else:
            shape = (int(dshape.shape[0]),) + shape_tail

        x = np.empty(shape=shape, dtype=dt)

        kwargs = keyfilter(keywords.__contains__, kwargs)
        expectedlen = kwargs.pop('expectedlen',
                                 int(expected_dshape[0])
                                 if expected_dshape is not None and
                                 isinstance(expected_dshape[0], datashape.Fixed)
                                 else None)

        if datashape.predicates.isrecord(dshape.measure):
            return ctable(x, rootdir=uri, expectedlen=expectedlen, **kwargs)
        else:
            return carray(x, rootdir=uri, expectedlen=expectedlen, **kwargs)
Example #4
def _from_carray(path, format_categories=None, format_codes=None, format_values=None):
    meta = json.load(open(os.path.join(path, 'meta'), 'r'))

    if meta['type'] == 'category':
        if format_categories in ['npz', 'npy']:
            filename = os.path.join(path, 'categories.%s' % format_categories)
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                categories_values = numpy.load(filename, mmap_mode='r+') # TODO npz not memmap?
                if format_categories == 'npz':
                    categories_values = categories_values['arr_0']
        elif format_categories == 'pickle':
            filename = os.path.join(path, 'categories.pickle')
            with log.timedlogger("reading [%s] %s" % (meta['name'], filename)):
                categories_values = pickle.load(open(filename, 'rb'))
        elif format_categories == 'bcolz':
            rootdir = os.path.join(path, 'categories.bcolz')
            with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)):
                categories_values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r')
                # categories_values = bcolz.carray(rootdir=rootdir, mode='r')[:]
        else:
            raise NotImplementedError("unknown format_categories %s" % (format_categories,))

        if format_codes == 'bcolz':
            rootdir = os.path.join(path, 'codes.bcolz')
            with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)):
                codes_values = bcolz.carray(rootdir=rootdir, mode='r')[:] # , categories=categories_values)
                # codes_values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r') # , categories=categories_values)
        elif format_codes == 'npy':
            filename = os.path.join(path, 'codes.npy')
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                codes_values = numpy.load(filename, mmap_mode='r+')
        else:
            raise Exception("unknown format_codes type %s" % (format_codes,))

        with log.timedlogger("FastCat construction"):
            s = FastCat(codes_values, categories_values)
    else:
        if format_values == 'bcolz':
            rootdir = os.path.join(path, 'values.bcolz')
            with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)):
                # values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r')
                s = bcolz.carray(rootdir=rootdir, mode='r')[:]
        elif format_values == 'npy':
            filename = os.path.join(path, 'values.npy')
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                s = numpy.load(filename, mmap_mode='r+')
        elif format_values == 'pickle':
            filename = os.path.join(path, 'values.pickle')
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                s = pickle.load(open(filename, 'rb'))
        # with log.timedlogger("FastSeries construction"):
        #     index = pandas.Index(numpy.arange(len(values)), copy=False)
        #     values = SingleBlockManager(values, index, fastpath=True)
        #     s = pandas.Series(data=values, fastpath=True, copy=False, dtype=meta['type'])
        # s = values # [:]
    # logging.warning('Constructing categorical for %s' % meta['name'])
    # s = pandas.Categorical.from_codes(codes_values, categories_values, name=meta['name'])
    return meta, s # codes_values, categories_values
Example #5
def build_carray(array, rootdir):
    """ Used in ctable.__reduce__

    Pickling functions can't be in pyx files.  Putting this tiny helper
    function here instead.
    """
    from bcolz import carray
    if rootdir:
        return carray(rootdir=rootdir)
    else:
        return carray(array)
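A hedged sketch of the __reduce__ mentioned in the docstring (hypothetical, shown only to illustrate how build_carray gets called; the real bcolz code may differ):
def __reduce__(self):
    # on-disk carrays are rebuilt from their rootdir, in-memory ones from their data
    if self.rootdir:
        return (build_carray, (None, self.rootdir))
    else:
        return (build_carray, (self[:], None))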
Example #6
def to_dict_of_blocks(d, rootdir):
    """ Deprecated. For pure numpy things like {'X_train': X_train, 'X_test': X_test} """
    if os.path.exists(rootdir):
        _move_and_remove_nonblocking(rootdir)
    _mkdir(rootdir)
    meta = {'keys': list(d.keys())}
    json.dump(meta, open(os.path.join(rootdir, 'meta'), 'w'))
    for i, k in enumerate(meta['keys']):
        filename = os.path.join(rootdir, str(i))
        with log.timedlogger('writing {} ({}.shape = {})'.format(filename, k, d[k].shape)):
            bcolz.carray(d[k], rootdir=filename)
Example #7
    def handle_frame(self, i, frame):
        if i == 0:
            self.frames = bcolz.carray(np.zeros((0,) + frame.coords.shape, dtype="float32"),
                                 rootdir=os.path.join(self.rootdir, "coords"),
                                 mode='w')
            self.frames.attrs['timestamp'] = self.timestamp
            self.boxes = bcolz.carray(np.zeros((0,) + frame.box.shape, dtype="float32"),
                                 rootdir=os.path.join(self.rootdir, "boxes"),
                                 mode='w')
            self.times = []

        self.frames.append(frame.coords)
        self.boxes.append(frame.box)
        self.times.append(frame.time)
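A hedged companion sketch for flushing the accumulated data once all frames have been handled (the method name and the times carray are assumptions, inferred from handle_frame above):
    def finish(self):
        self.frames.flush()
        self.boxes.flush()
        # self.times was kept as a plain list; persist it next to the other carrays
        bcolz.carray(np.asarray(self.times, dtype="float64"),
                     rootdir=os.path.join(self.rootdir, "times"),
                     mode='w').flush()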
Example #8
    def unique(self, col_or_col_list):
        """
        Return the unique values of a column, or a list of unique-value lists when given a list of columns

        :param col_or_col_list: a column or a list of columns
        :return:
        """

        if isinstance(col_or_col_list, list):
            col_is_list = True
            col_list = col_or_col_list
        else:
            col_is_list = False
            col_list = [col_or_col_list]

        output = []

        for col in col_list:

            if self.cache_valid(col):
                # retrieve values from existing disk-based factorization
                col_values_rootdir = self[col].rootdir + '.values'
                carray_values = bcolz.carray(rootdir=col_values_rootdir, mode='r')
                values = list(carray_values)
            else:
                # factorize on-the-fly
                _, values = ctable_ext.factorize(self[col])
                values = values.values()

            output.append(values)

        if not col_is_list:
            output = output[0]

        return output
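Hypothetical usage sketch, assuming ct is an instance of the ctable subclass these examples come from and 'country'/'year' are existing columns:
countries = ct.unique('country')                    # one list of unique values
countries, years = ct.unique(['country', 'year'])   # one list per column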
Example #9
def open(rootdir, mode='a'):
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object or None (if no objects are found)

    """
    # First try with a carray
    obj = None
    try:
        obj = bcolz.carray(rootdir=rootdir, mode=mode)
    except IOError:
        # Not a carray.  Now with a ctable
        try:
            obj = bcolz.ctable(rootdir=rootdir, mode=mode)
        except IOError:
            # Not a ctable
            pass
    return obj
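A short usage sketch (the path is illustrative):
import numpy as np
import bcolz

bcolz.carray(np.arange(10), rootdir='/tmp/demo.bcolz', mode='w').flush()
obj = bcolz.open('/tmp/demo.bcolz', mode='r')  # a carray here; None if neither type matches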
Example #10
    def setChannelData(self, channelName, data, compression=False):
        """Modifies data of channel

        Parameters
        ----------------
        channelName : str
            channel name
        data : numpy array
            channel data
        compression : bool or str
            trigger for data compression
        """
        if compression and CompressionPossible:
            if not isinstance(compression, str):
                if isinstance(compression, int):
                    comp = compression
                else:
                    comp = self._compression_level
                temp = carray(data,
                              cparams=cparams(clevel=comp),
                              expectedlen=int(getsizeof(data) / 10))
            else:
                temp = compressed_data()
                temp.compression(data)
            self._setChannel(channelName, temp, field=dataField)
        else:
            self._setChannel(channelName, data, field=dataField)
Example #11
 def test_create_unsafe_carray_with_unsafe_data(self):
     """ We introduce a safe keyword arg which removes dtype checking.
     We don't want this to interfere with creation.
     """
     b = bcolz.carray([1, 2, 3], dtype='i4', safe=False)
     self.assertEqual(b.safe, False)
     self.assertEqual(b[0], 1)
Example #12
def open(rootdir, mode='a'):
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object (an IOError is raised if no object is found)

    """
    # First try with a carray
    rootsfile = os.path.join(rootdir, ROOTDIRS)
    if os.path.exists(rootsfile):
        return bcolz.ctable(rootdir=rootdir, mode=mode)
    else:
        return bcolz.carray(rootdir=rootdir, mode=mode)
Example #13
def fill(shape, dflt=None, dtype=float, **kwargs):
    """
    fill(shape, dtype=float, dflt=None, **kwargs)

    Return a new carray object of given shape and type, filled with `dflt`.

    Parameters
    ----------
    shape : int
        Shape of the new array, e.g., ``(2,3)``.
    dflt : Python or NumPy scalar
        The value to be used during the filling process.  If None, values are
        filled with zeros.  Also, the resulting carray will have this value as
        its `dflt` value.
    dtype : data-type, optional
        The desired data-type for the array, e.g., `numpy.int8`.  Default is
        `numpy.float64`.
    kwargs : list of parameters or dictionary
        Any parameter supported by the carray constructor.

    Returns
    -------
    out : carray
        Array filled with `dflt` values with the given shape and dtype.

    See Also
    --------
    ones, zeros

    """

    dtype = np.dtype(dtype)
    if type(shape) in _inttypes + (float,):
        shape = (int(shape),)
    else:
        shape = tuple(shape)
        if len(shape) > 1:
            # Multidimensional shape.
            # The atom will have shape[1:] dims (+ the dtype dims).
            dtype = np.dtype((dtype.base, shape[1:]+dtype.shape))
    length = shape[0]

    # Create the container
    expectedlen = kwargs.pop("expectedlen", length)
    if dtype.kind == "V" and dtype.shape == ():
        raise ValueError("fill does not support ctables objects")
    obj = bcolz.carray([], dtype=dtype, dflt=dflt, expectedlen=expectedlen,
                       **kwargs)
    chunklen = obj.chunklen

    # Then fill it
    # We need an array for the default so as to keep the atom info
    dflt = np.array(obj.dflt, dtype=dtype)
    # Making strides=(0,) below is a trick to create the array fast and
    # without memory consumption
    chunk = np.ndarray(length, dtype=dtype, buffer=dflt, strides=(0,))
    obj.append(chunk)
    obj.flush()
    return obj
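A quick usage sketch of fill:
ca = bcolz.fill((3, 4), dflt=1, dtype='i4')
# ca[:] is a 3x4 int32 array of ones, and ca.dflt == 1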
Example #14
 def test00(self):
     """Testing unicode types (creation)"""
     a = np.array([[u"aŀle", u"eñe"], [u"açò", u"áèâë"]], dtype="U4")
     b = bcolz.carray(a)
     # print "b.dtype-->", b.dtype
     # print "b->", `b`
     self.assertTrue(a.dtype == b.dtype.base)
     assert_array_equal(a, b[:], "Arrays are not equal")
Example #15
 def set_type(self, dtype):
     if self.dtype != dtype:
         self.fill_nan(self.nan_value(dtype=dtype))
         carray = bcolz.carray(self.carray, dtype=np.dtype(dtype).name)  # TODO do it chunk by chunk + check data
         ct = self._table._ctable
         col_pos = self.position
         ct.delcol(self._name)
         ct.addcol(carray, name=self._name, pos=col_pos)
Example #16
def into(a, b, **kwargs):
    if isinstance(a, type):
        kwargs = keyfilter(carray_keywords.__contains__, kwargs)
        return carray(b, **kwargs)
    else:
        a.append(b)
        a.flush()
        return a
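Hypothetical usage of this into() dispatch pair (assumes it has been registered for the carray type, e.g. via the into/odo @dispatch decorators, which the snippet does not show):
ca = into(carray, [1, 2, 3], expectedlen=100)  # first branch: build a new carray
ca = into(ca, [4, 5, 6])                       # second branch: append and flush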
Example #17
 def test00(self):
     """Testing string types (creation)"""
     a = np.array([["ale", "ene"], ["aco", "ieie"]], dtype="S4")
     b = bcolz.carray(a)
     # print "b.dtype-->", b.dtype
     # print "b->", `b`
     self.assertTrue(a.dtype == b.dtype.base)
     assert_array_equal(a, b[:], "Arrays are not equal")
Example #18
    def __init__(self, val):
        super().__init__()

        if isinstance(val, bcolz.carray):
            self._carray = val
        elif isinstance(val, list) or isinstance(val, np.ndarray):
            self._carray = bcolz.carray(val, expectedlen=Column.DEFAULT_BLOCK_LEN)
        else:
            raise DazzleError("Invalid argument in ResultColumn.%s()" % method_name())
Example #19
    def _write_internal(self, filename, calendar, iterator):
        """
        Internal implementation of write.

        `iterator` should be an iterator yielding pairs of (asset, ctable).
        """
        total_rows = 0
        first_row = {}
        last_row = {}
        calendar_offset = {}

        # Maps column name -> output carray.
        columns = {k: carray(array([], dtype=uint32)) for k in US_EQUITY_PRICING_BCOLZ_COLUMNS}

        for asset_id, table in iterator:
            nrows = len(table)
            for column_name in columns:
                if column_name == "id":
                    # We know what the content of this column is, so don't
                    # bother reading it.
                    columns["id"].append(full((nrows,), asset_id, uint32))
                    continue
                columns[column_name].append(self.to_uint32(table[column_name][:], column_name))

            # Bcolz doesn't support ints as keys in `attrs`, so convert
            # assets to strings for use as attr keys.
            asset_key = str(asset_id)

            # Calculate the index into the array of the first and last row
            # for this asset. This allows us to efficiently load single
            # assets when querying the data back out of the table.
            first_row[asset_key] = total_rows
            last_row[asset_key] = total_rows + nrows - 1
            total_rows += nrows

            # Calculate the number of trading days between the first date
            # in the stored data and the first date of **this** asset. This
            # offset used for output alignment by the reader.

            # HACK: Index with a list so that we get back an array we can pass
            # to self.to_uint32.  We could try to extract this in the loop
            # above, but that makes the logic a lot messier.
            asset_first_day = self.to_uint32(table["day"][[0]], "day")[0]
            calendar_offset[asset_key] = calendar.get_loc(Timestamp(asset_first_day, unit="s", tz="UTC"))

        # This writes the table to disk.
        full_table = ctable(
            columns=[columns[colname] for colname in US_EQUITY_PRICING_BCOLZ_COLUMNS],
            names=US_EQUITY_PRICING_BCOLZ_COLUMNS,
            rootdir=filename,
            mode="w",
        )
        full_table.attrs["first_row"] = first_row
        full_table.attrs["last_row"] = last_row
        full_table.attrs["calendar_offset"] = calendar_offset
        full_table.attrs["calendar"] = calendar.asi8.tolist()
        return full_table
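A hedged read-side sketch showing how the attrs written above can be used to slice one asset back out (hypothetical; the 'close' column name is an assumption):
t = ctable(rootdir=filename, mode='r')
key = str(asset_id)
start, stop = t.attrs['first_row'][key], t.attrs['last_row'][key] + 1
closes = t['close'][start:stop]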
Example #20
    def _open_minute_file(self, field, sid):
        sid = int(sid)

        try:
            carray = self._carrays[field][sid]
        except KeyError:
            carray = self._carrays[field][sid] = bcolz.carray(rootdir=self._get_carray_path(sid, field), mode="r")

        return carray
Example #21
 def test01(self):
     """Testing unicode types (append)"""
     a = np.ones((300, 4), dtype="U4")
     b = bcolz.carray([], dtype="U4").reshape((0, 4))
     b.append(a)
     # print "b.dtype-->", b.dtype
     # print "b->", `b`
     self.assertTrue(a.dtype == b.dtype.base)
     assert_array_equal(a, b[:], "Arrays are not equal")
Example #22
 def test_carray_record_as_object(self):
     src_data = np.empty((10,), dtype=np.dtype('u1,O'))
     src_data[:] = [(i, 's'*i) for i in range(10)]
     carr = bcolz.carray(src_data, dtype=np.dtype('O'))
     self.assertEqual(len(carr.shape), 1)
     self.assertEqual(len(src_data), carr.shape[0])
     for i in range(len(carr)):
         self.assertEqual(carr[i][0], src_data[i][0])
         self.assertEqual(carr[i][1], src_data[i][1])
Example #23
 def test03c(self):
     """Testing `__getitem__()` method with several slices (III)"""
     a = np.arange(120 * 1000).reshape((5 * 1000, 4, 3, 2))
     b = bcolz.carray(a, rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     sl = (slice(None, None, 3), slice(1, 3, 2), slice(1, 4, 2))
     # print "b[sl]->", `b[sl]`
     self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example #24
 def test05c(self):
     """Testing `__getitem__()` method with fancy indexing (III)"""
     a = np.arange(2000).reshape((50, 40))
     b = bcolz.carray(a, rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     sl = (slice(None), [0, 2])
     # print "b[sl]->", `b[sl]`
     self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example #25
    def test_carray_1d_source(self):
        """Testing carray of objects, 1d source"""
        src_data = ['s'*i for i in range(10)]
        carr = bcolz.carray(src_data, dtype=np.dtype('O'))

        self.assertEqual(len(carr.shape), 1)
        self.assertEqual(len(src_data), carr.shape[0])
        for i in range(len(carr)):
            self.assertEqual(carr[i], src_data[i])
            self.assertEqual(carr[i], src_data[i])
Example #26
 def test04c(self):
     """Testing `__getitem__()` method with shape reduction (III)"""
     a = np.arange(6000).reshape((50, 40, 3))
     b = bcolz.carray(a, rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     sl = (1, slice(1, 4, 2), 2)
     # print "b[sl]->", `b[sl]`
     self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example #27
 def test00(self):
     """Testing sum()."""
     a = np.arange(1e5).reshape(10, 10000)
     sa = a.sum()
     ac = bcolz.carray(a)
     sac = ac.sum()
     #print "numpy sum-->", sa
     #print "carray sum-->", sac
     self.assertTrue(sa.dtype == sac.dtype, "sum() is not working correctly.")
     self.assertTrue(sa == sac, "sum() is not working correctly.")
Example #28
 def _open_write(self, data=None):
     if self._bcolz is None:
         try:  # append
             self._bcolz = \
                 bcolz.carray(None,
                              rootdir=self._bcolz_dir(),
                              mode='a',
                              # bcolz conf in case mode='a' semantics change to create, otherwise innocuous
                              chunklen=self.chunklen,
                              expectedlen=self.expectedlen,
                              cparams=self.cparams)
         except Exception:  # create
             self._bcolz = \
                 bcolz.carray(data[0:0],
                              rootdir=self._bcolz_dir(),
                              mode='w',
                              chunklen=self.chunklen,
                              expectedlen=self.expectedlen,
                              cparams=self.cparams)
Example #29
 def read_meta_and_open(self):
     """Read the meta-information and initialize structures."""
     # Get the directories of the columns
     rootsfile = os.path.join(self.rootdir, ROOTDIRS)
     with open(rootsfile, "rb") as rfile:
         data = json.loads(rfile.read())
     # JSON returns unicode (?)
     self.names = [str(name) for name in data["names"]]
     # Initialize the cols by instantiating the carrays
     for name, dir_ in data["dirs"].items():
         self._cols[str(name)] = bcolz.carray(rootdir=dir_, mode=self.mode)
Example #30
    def cache_factor(self, col_list, refresh=False):
        """
        TODO: these should be hidden helper carrays, i.e. not normal columns
        that you would see as a user.

        The factor (label index) carray is as long as the original carray
        (and therefore the rest of the table), but the (unique) values carray
        only holds as many entries as there are unique values.

        :param col_list:
        :param refresh:
        :return:
        """

        if not self.rootdir:
            raise TypeError('Only out-of-core ctables can have '
                            'factorization caching at the moment')

        if not isinstance(col_list, list):
            col_list = [col_list]

        for col in col_list:

            # create cache if needed
            if refresh or not self.cache_valid(col):
                col_rootdir = self[col].rootdir
                col_factor_rootdir = col_rootdir + '.factor'
                col_values_rootdir = col_rootdir + '.values'

                carray_factor = \
                    bcolz.carray([], dtype='int64', expectedlen=self.size,
                                   rootdir=col_factor_rootdir, mode='w')
                _, values = \
                    ctable_ext.factorize(self[col], labels=carray_factor)
                carray_factor.flush()

                carray_values = \
                    bcolz.carray(values.values(), dtype=self[col].dtype,
                                 rootdir=col_values_rootdir, mode='w')
                carray_values.flush()
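Hypothetical usage sketch, assuming ct is an on-disk instance of the ctable subclass from these examples:
ct.cache_factor(['country'])  # writes country.factor / country.values next to the column
factors, values = ct.factorize_groupby_cols(['country'])  # now served from the cache (Example #2)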
Example #31
from time import time

import numpy as np

import bcolz

N = 1e8  # the number of elements in x
clevel = 5  # the compression level
sexpr = "(x-1) < 10."  # the expression to compute
# sexpr = "((x-1) % 1000) == 0."  # the expression to compute
#sexpr = "(2*x**3+.3*y**2+z+1)<0"  # the expression to compute

cparams = bcolz.cparams(clevel)

print("Creating inputs...")

x = np.arange(N)
cx = bcolz.carray(x, cparams=cparams)
if 'y' not in sexpr:
    ct = bcolz.ctable((cx, ), names=['x'])
else:
    y = np.arange(N)
    z = np.arange(N)
    cy = bcolz.carray(y, cparams=cparams)
    cz = bcolz.carray(z, cparams=cparams)
    ct = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z'])

print("Evaluating...", sexpr)
t0 = time()
cbout = ct.eval(sexpr)
print("Time for evaluation--> %.3f" % (time() - t0, ))
print("Converting to numpy arrays")
bout = cbout[:]
Example #32
    # then get the coordinates
    sx = 3 * i
    ex = 3 * (i + 1)
    xi = np.array(xyz[sx:ex])
    xyzi = np.stack([c for c in xi],
                    axis=1) / 100  # have to scale by 100 to match PDB

    # lastly convert the mask to indices
    msk_idx = np.where(np.array(list(masks[i])) == '+')[0]

    # bracket id or get "setting an array element with a sequence"
    zt = np.array([[id], seq, pssmi, xyzi, msk_idx])

    if i == 0:
        bc = bcolz.carray([zt],
                          rootdir=data_path + 'testing.bc',
                          mode='w',
                          expectedlen=len(ids))
        bc.flush()
    else:
        bc = bcolz.carray(rootdir=data_path + 'testing.bc', mode='a')  # 'a' so appends accumulate instead of overwriting
        bc.append([zt])
        bc.flush()

# %%
from pathlib import Path

home = str(Path.home())
pn_path = home + '/Downloads/casp7/casp7/testing'
# pn_path = os.curdir + '/../rgn_pytorch/data/text_sample'
dataset = ProteinNetDataset(pn_path)
trn_data = DataLoader(dataset, batch_size=32, shuffle=True)
Example #33
def save_array(data_folder, fname, arr):
    fname = os.path.join(data_folder, fname)
    print("Saving to {0} ...".format(fname))
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
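A hedged counterpart for reading the array back (load_array is a hypothetical name; bcolz.open is the documented opener, see Examples #9 and #12):
def load_array(data_folder, fname):
    fname = os.path.join(data_folder, fname)
    return bcolz.open(rootdir=fname, mode='r')[:]  # materialize as a numpy array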
Example #34
def load_dataset_face(args,
                      INPUT_SIZE=[112, 112],
                      RGB_MEAN=[0.5, 0.5, 0.5],
                      RGB_STD=[0.5, 0.5, 0.5],
                      val_datasets=[
                          'lfw', 'cfp_ff', 'cfp_fp', 'agedb_30', 'calfw',
                          'cplfw', 'vgg2_fp'
                      ]):
    train_transform = transforms.Compose([
        transforms.Resize(
            [int(128 * INPUT_SIZE[0] / 112),
             int(128 * INPUT_SIZE[0] / 112)]),  # smaller side resized
        transforms.RandomCrop([INPUT_SIZE[0], INPUT_SIZE[1]]),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD),
    ])
    train_data = dset.ImageFolder(
        os.path.join(args.data_path, 'CASIA-maxpy-align'), train_transform)
    weights = torch.DoubleTensor(
        make_weights_for_balanced_classes(train_data.imgs,
                                          len(train_data.classes)))
    if args.distributed:
        from catalyst.data.sampler import DistributedSamplerWrapper
        train_sampler = DistributedSamplerWrapper(
            torch.utils.data.sampler.WeightedRandomSampler(
                weights, len(weights)))
    else:
        train_sampler = torch.utils.data.sampler.WeightedRandomSampler(
            weights, len(weights))
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(
            [int(128 * INPUT_SIZE[0] / 112),
             int(128 * INPUT_SIZE[0] / 112)]),
        transforms.CenterCrop([INPUT_SIZE[0], INPUT_SIZE[1]]),
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD)
    ])
    val_loaders = []
    for name in val_datasets:
        carray = bcolz.carray(rootdir=os.path.join(args.data_path, name),
                              mode='r')
        val_data_tensor = torch.tensor(carray[:, [2, 1, 0], :, :]) * 0.5 + 0.5
        val_data = TensorsDataset(val_data_tensor, val_transform)
        val_loader = torch.utils.data.DataLoader(val_data,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 num_workers=args.workers,
                                                 pin_memory=True,
                                                 sampler=None)
        issame = np.load('{}/{}_list.npy'.format(args.data_path, name))
        val_loaders.append((name, val_loader, issame))

    return train_loader, val_loaders
Example #35
def fill(shape, dflt=None, dtype=float, **kwargs):
    """fill(shape, dtype=float, dflt=None, **kwargs)

    Return a new carray or ctable object of given shape and type, filled with
    `dflt`.

    Parameters
    ----------
    shape : int
        Shape of the new array, e.g., ``(2,3)``.
    dflt : Python or NumPy scalar
        The value to be used during the filling process.  If None, values are
        filled with zeros.  Also, the resulting carray will have this value as
        its `dflt` value.
    dtype : data-type, optional
        The desired data-type for the array, e.g., `numpy.int8`.  Default is
        `numpy.float64`.
    kwargs : list of parameters or dictionary
        Any parameter supported by the carray constructor.

    Returns
    -------
    out : carray or ctable
        Bcolz object filled with `dflt` values with the given shape and dtype.

    See Also
    --------
    ones, zeros

    """
    def fill_helper(obj, dtype=None, length=None):
        """Helper function to fill a carray with default values"""
        assert isinstance(obj, bcolz.carray)
        assert dtype is not None
        assert length is not None
        if type(length) is float:
            length = int(length)

        # Then fill it
        # We need an array for the default so as to keep the atom info
        dflt = np.array(obj.dflt, dtype=dtype.base)
        # Fill chunk with defaults
        chunk = np.empty(length, dtype=dtype)
        chunk[:] = dflt
        obj.append(chunk)
        obj.flush()

    dtype = np.dtype(dtype)
    if type(shape) in _inttypes + (float, ):
        shape = (int(shape), )
    else:
        shape = tuple(shape)
        if len(shape) > 1:
            # Multidimensional shape.
            # The atom will have shape[1:] dims (+ the dtype dims).
            dtype = np.dtype((dtype.base, shape[1:] + dtype.shape))
    length = shape[0]

    # Create the container
    expectedlen = kwargs.pop("expectedlen", length)
    if dtype.kind == "V" and dtype.shape == ():
        list_ca = []
        # force carrays to live in memory
        base_rootdir = kwargs.pop('rootdir', None)
        for name, col_dtype in dtype.descr:
            dflt = np.zeros((), dtype=col_dtype)
            ca = bcolz.carray([],
                              dtype=col_dtype,
                              dflt=dflt,
                              expectedlen=expectedlen,
                              **kwargs)
            fill_helper(ca, dtype=ca.dtype, length=length)
            list_ca.append(ca)
        # bring rootdir back, ctable should live either on-disk or in-memory
        kwargs['rootdir'] = base_rootdir
        obj = bcolz.ctable(list_ca, names=dtype.names, **kwargs)
    else:
        obj = bcolz.carray([],
                           dtype=dtype,
                           dflt=dflt,
                           expectedlen=expectedlen,
                           **kwargs)
        fill_helper(obj, dtype=dtype, length=length)

    return obj
Example #36
    for ii in img_range:
        print('%d / %d' % (ii, img_range.shape[0]))
        llh_rpy = pva_interp(img_times[ii])
        lon_lat_h = llh_rpy[0:3]
        c_n_v = nu.rpy_to_cnb(*llh_rpy[3:])
        feat_df, desc = aie.extract_features(images[ii], lon_lat_h, c_n_v)
        center_wgs = tf.project_center(lon_lat_h, c_n_v).flatten()
        df_path = 'feat/df/feat_%d.hdf' % ii
        desc_path = 'feat/desc/desc_%d' % ii

        if feat_df is None:
            feat_meta.loc[ii] = [
                0, center_wgs[0], center_wgs[1], df_path, desc_path
            ]
        else:
            print("%d :: %d Feat" % (ii, desc.shape[0]))
            feat_meta.loc[ii] = [
                desc.shape[0], center_wgs[0], center_wgs[1], df_path, desc_path
            ]
            feat_df.to_hdf(os.path.join(out_path, df_path),
                           'feat_df',
                           mode='w',
                           format='table',
                           complib='zlib',
                           complevel=7)
            bcolz.carray(desc.astype(np.float32),
                         rootdir=os.path.join(out_path, desc_path),
                         mode='w').flush()

    feat_meta.to_hdf(os.path.join(out_path, 'feat_meta.hdf'), key='feat_meta')
    flight.close()
'''

Example #37
import sys
import bcolz
import pickle
import numpy as np

glove_path = sys.argv[1]  # 'C:\\Users\\Jakob\\Downloads\\glove.840B.300d'
size = int(sys.argv[2])  # 840
dim = int(sys.argv[3])  # 300

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1),
                       rootdir=f'{glove_path}/{size}B.{dim}.dat',
                       mode='w')

with open(f'{glove_path}/glove.{size}B.{dim}d.txt', 'rb') as f:
    for l in f:
        line = l.split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(float)
        vectors.append(vect)

vectors = bcolz.carray(vectors[1:].reshape((-1, dim)),
                       rootdir=f'{glove_path}/{size}B.{dim}.dat',
                       mode='w')
Example #38
from time import time

import numpy as np

import bcolz

N = int(1e8)
# a = np.arange(N, dtype='f8')
a = np.random.randint(0, 10, N).astype('bool')

t0 = time()
sa = a.sum()
print("Time sum() numpy --> %.3f" % (time() - t0))

t0 = time()
ac = bcolz.carray(a, cparams=bcolz.cparams(9))
print("Time carray conv --> %.3f" % (time() - t0))
print("ac-->", repr(ac))

t0 = time()
sac = ac.sum()
#sac = ac.sum(dtype=np.dtype('i8'))
print("Time sum() carray --> %.3f" % (time() - t0))

# t0 = time()
# sac = sum(i for i in ac)
# print "Time sum() carray (iter) --> %.3f" % (time()-t0)

print("sa, sac-->", sa, sac, type(sa), type(sac))
assert (sa == sac)
Example #39
def _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor,
                 **kwargs):
    """Perform the evaluation in blocks."""

    # Compute the optimal block size (in elements)
    # The next is based on experiments with bench/ctable-query.py
    # and the 'movielens-bench' repository
    if vm == "numexpr":
        bsize = 2**24
    else:
        bsize = 2**22
    bsize //= typesize
    # Evaluation seems more efficient if block size is a power of 2
    bsize = 2 ** (int(math.log(bsize, 2)))
    if vlen < 100*1000:
        bsize //= 8
    elif vlen < 1000*1000:
        bsize //= 4
    elif vlen < 10*1000*1000:
        bsize //= 2
    # Protection against too large atomsizes
    if bsize == 0:
        bsize = 1

    vars_ = {}
    # Get temporaries for vars
    maxndims = 0
    for name in vars:
        var = vars[name]
        if hasattr(var, "__len__"):
            ndims = len(var.shape) + len(var.dtype.shape)
            if ndims > maxndims:
                maxndims = ndims
            if len(var) > bsize and hasattr(var, "_getrange"):
                vars_[name] = np.empty(bsize, dtype=var.dtype)

    for i in xrange(0, vlen, bsize):
        # Get buffers for vars
        for name in vars:
            var = vars[name]
            if hasattr(var, "__len__") and len(var) > bsize:
                if hasattr(var, "_getrange"):
                    if i+bsize < vlen:
                        var._getrange(i, bsize, vars_[name])
                    else:
                        vars_[name] = var[i:]
                else:
                    vars_[name] = var[i:i+bsize]
            else:
                if hasattr(var, "__getitem__"):
                    vars_[name] = var[:]
                else:
                    vars_[name] = var

        # Perform the evaluation for this block
        if vm == "python":
            res_block = _eval(expression, vars_)
        else:
            try:
                res_block = bcolz.numexpr.evaluate(expression,
                                                   local_dict=vars_)
            except ValueError:
                # numexpr cannot handle this. Fall back to a pure "python" VM.
                return _eval_blocks(
                    expression, vars, vlen, typesize, "python",
                    out_flavor, **kwargs)

        if i == 0:
            # Detection of reduction operations
            scalar = False
            dim_reduction = False
            if len(res_block.shape) == 0:
                scalar = True
                result = res_block
                continue
            elif len(res_block.shape) < maxndims:
                dim_reduction = True
                result = res_block
                continue
            # Get a decent default for expectedlen
            if out_flavor == "carray":
                nrows = kwargs.pop('expectedlen', vlen)
                result = bcolz.carray(res_block, expectedlen=nrows, **kwargs)
            else:
                out_shape = list(res_block.shape)
                out_shape[0] = vlen
                result = np.empty(out_shape, dtype=res_block.dtype)
                result[:bsize] = res_block
        else:
            if scalar or dim_reduction:
                result += res_block
            elif out_flavor == "carray":
                result.append(res_block)
            else:
                result[i:i+bsize] = res_block

    if isinstance(result, bcolz.carray):
        result.flush()
    if scalar:
        return result[()]
    return result
Example #40
def test_load_array(tempdir):
    rootdir = tempdir.path
    bcolz.carray(np.arange(0, 5), mode='w', rootdir=rootdir)

    array = core.load_array(rootdir)
    np.testing.assert_equal(array, [0, 1, 2, 3, 4])
Example #41
def get_val_pair(path, name):
    rootdir = os.path.join(path, name)
    carray = bcolz.carray(rootdir=rootdir, mode='r')
    np_path = os.path.join(path, '{}_list.npy'.format(name))
    issame = np.load(np_path)
    return carray, issame
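Hypothetical usage (path and dataset name are illustrative; matches the lfw-style validation layout used in Example #34):
carray, issame = get_val_pair('/data/faces', 'lfw')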
Example #42
''' This script processes and generates GloVe embeddings '''
# coding: utf-8

import pickle
from preprocess import Vocabulary
import numpy as np
import json
from scipy import misc
import bcolz

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir='glove.6B/6B.300.dat', mode='w')

with open('glove.6B/glove.6B.300d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(float)
        vectors.append(vect)

vectors = bcolz.carray(vectors[1:].reshape((400000, 300)),
                       rootdir='glove.6B/6B.300.dat',
                       mode='w')
vectors.flush()
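A hedged sketch of reading the embeddings back into a dict (assumes companion pickles for words/word2idx were saved, as the later GloVe example does):
vectors = bcolz.open('glove.6B/6B.300.dat')[:]
words = pickle.load(open('glove.6B/6B.300_words.pkl', 'rb'))
word2idx = pickle.load(open('glove.6B/6B.300_idx.pkl', 'rb'))
glove = {w: vectors[word2idx[w]] for w in words}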
Example #43
def fromiter(iterable, dtype, count, **kwargs):
    """
    fromiter(iterable, dtype, count, **kwargs)

    Create a carray/ctable from an `iterable` object.

    Parameters
    ----------
    iterable : iterable object
        An iterable object providing data for the carray.
    dtype : numpy.dtype instance
        Specifies the type of the outcome object.
    count : int
        The number of items to read from iterable. If set to -1, means that
        the iterable will be used until exhaustion (not recommended, see note
        below).
    kwargs : list of parameters or dictionary
        Any parameter supported by the carray/ctable constructors.

    Returns
    -------
    out : a carray/ctable object

    Notes
    -----
    Please specify `count` to both improve performance and save memory.  It
    allows `fromiter` to avoid looping over the iterable twice (which is
    slooow), and it also prevents memory leaks (which can be important for
    large iterables).

    """
    # Check for a true iterable
    if not hasattr(iterable, "next"):
        iterable = iter(iterable)

    # Try to guess the final length
    expected = count
    if count == -1:
        # Try to guess the length of the iterable
        if hasattr(iterable, "__length_hint__"):
            count = iterable.__length_hint__()
            expected = count

    # First, create the container
    expectedlen = kwargs.pop("expectedlen", expected)
    dtype = np.dtype(dtype)
    if dtype.kind == "V":
        # A ctable
        obj = bcolz.ctable(np.array([], dtype=dtype),
                           expectedlen=expectedlen,
                           **kwargs)
        chunklen = sum(obj.cols[name].chunklen
                       for name in obj.names) // len(obj.names)
    else:
        # A carray
        obj = bcolz.carray(np.array([], dtype=dtype),
                           expectedlen=expectedlen,
                           **kwargs)
        chunklen = obj.chunklen

    # Then fill it
    while True:
        chunk = np.fromiter(it.islice(iterable, chunklen), dtype=dtype)
        if len(chunk) == 0:
            # Iterable has been exhausted
            break
        obj.append(chunk)
    obj.flush()
    return obj
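A quick usage sketch of fromiter:
ca = bcolz.fromiter((x * x for x in range(100)), dtype='i8', count=100)           # carray
t = bcolz.fromiter(((i, i * 2.0) for i in range(100)), dtype='i4,f8', count=100)  # ctable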
Example #44
import logging

import bcolz

log = logging.getLogger(__name__)

log.debug("fine tune all layers")
log.debug("using all_model_weight_path :" + model_path)
log.debug("using test_result :" + test_result)
log.debug("using loss_history_csv_name :" + loss_history_csv_name)

train_name = basedir + '/pp_train_data'
valid_name = basedir + '/pp_valid_data'
test_name = basedir + '/pp_test_data'

temp_dir = "/tmp/"

## load original bcolz data from disk
# read from disk and check size
valid_data = bcolz.carray(rootdir=valid_name + '_data.bclz', mode='r')
test_data = bcolz.carray(rootdir=test_name + '_data.bclz', mode='r')
train_data = bcolz.carray(rootdir=train_name + '_data.bclz', mode='r')

valid_labels = bcolz.carray(rootdir=valid_name + '_labels.bclz', mode='r')
test_labels = bcolz.carray(rootdir=test_name + '_labels.bclz', mode='r')
train_labels = bcolz.carray(rootdir=train_name + '_labels.bclz', mode='r')

log.debug("loading original data from disk")
log.debug(valid_data.shape)
log.debug(test_data.shape)
log.debug(train_data.shape)

log.debug(valid_labels.shape)
log.debug(test_labels.shape)
log.debug(train_labels.shape)
Example #45
def arange(start=None, stop=None, step=None, dtype=None, **kwargs):
    """
    arange([start,] stop[, step,], dtype=None, **kwargs)

    Return evenly spaced values within a given interval.

    Values are generated within the half-open interval ``[start, stop)``
    (in other words, the interval including `start` but excluding `stop`).
    For integer arguments the function is equivalent to the Python built-in
    `range <http://docs.python.org/lib/built-in-funcs.html>`_ function,
    but returns a carray rather than a list.

    Parameters
    ----------
    start : number, optional
        Start of interval.  The interval includes this value.  The default
        start value is 0.
    stop : number
        End of interval.  The interval does not include this value.
    step : number, optional
        Spacing between values.  For any output `out`, this is the distance
        between two adjacent values, ``out[i+1] - out[i]``.  The default
        step size is 1.  If `step` is specified, `start` must also be given.
    dtype : dtype
        The type of the output array.  If `dtype` is not given, infer the data
        type from the other input arguments.
    kwargs : list of parameters or dictionary
        Any parameter supported by the carray constructor.

    Returns
    -------
    out : carray
        Bcolz object made of evenly spaced values.

        For floating point arguments, the length of the result is
        ``ceil((stop - start)/step)``.  Because of floating point overflow,
        this rule may result in the last element of `out` being greater
        than `stop`.

    """

    # Check start, stop, step values
    if (start, stop) == (None, None):
        raise ValueError("You must pass a `stop` value at least.")
    elif stop is None:
        start, stop = 0, start
    elif start is None:
        start, stop = 0, stop
    if step is None:
        step = 1

    # Guess the dtype
    if dtype is None:
        if type(stop) in _inttypes:
            dtype = np.dtype(np.int_)
    dtype = np.dtype(dtype)
    stop = int(stop)

    # Create the container
    expectedlen = kwargs.pop("expectedlen", stop)
    if dtype.kind == "V":
        raise ValueError("arange does not support ctables yet.")
    else:
        obj = bcolz.carray(np.array([], dtype=dtype),
                           expectedlen=expectedlen,
                           **kwargs)
        chunklen = obj.chunklen

    # Then fill it
    incr = chunklen * step  # the increment for each chunk
    incr += step - (incr % step)  # make it match step boundary
    bstart, bstop = start, start + incr
    while bstart < stop:
        if bstop > stop:
            bstop = stop
        chunk = np.arange(bstart, bstop, step, dtype=dtype)
        obj.append(chunk)
        bstart = bstop
        bstop += incr
    obj.flush()
    return obj
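A quick usage sketch of arange:
ca = bcolz.arange(10)                    # carray of 0..9
cb = bcolz.arange(2, 20, 3, dtype='f8')  # start/stop/step, float64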
Example #46
def test_generator():
    """Simple function to test return behavior of generator code above.

    This runs with and without merged model version.

df_train:
      object_id                                imgpath  target                          orig label
7          1518  /tmp/path/to/imgs/518/01/dog_1518.jpg       1  data/train/dogs/dog.1518.jpg   dog
1113       1662  /tmp/path/to/imgs/662/01/cat_1662.jpg       0  data/train/cats/cat.1662.jpg   cat
980        1409  /tmp/path/to/imgs/409/01/dog_1409.jpg       1  data/train/dogs/dog.1409.jpg   dog
1615       1813  /tmp/path/to/imgs/813/01/cat_1813.jpg       0  data/train/cats/cat.1813.jpg   cat
1029       1760  /tmp/path/to/imgs/760/01/cat_1760.jpg       0  data/train/cats/cat.1760.jpg   cat
df_valid:

     object_id                                imgpath  target                               orig label
787       7747  /tmp/path/to/imgs/747/07/cat_7747.jpg       0  data/validation/cats/cat.7747.jpg   cat
165       7563  /tmp/path/to/imgs/563/07/dog_7563.jpg       1  data/validation/dogs/dog.7563.jpg   dog
749       7517  /tmp/path/to/imgs/517/07/cat_7517.jpg       0  data/validation/cats/cat.7517.jpg   cat
458       7742  /tmp/path/to/imgs/742/07/cat_7742.jpg       0  data/validation/cats/cat.7742.jpg   cat
225       7479  /tmp/path/to/imgs/479/07/dog_7479.jpg       1  data/validation/dogs/dog.7479.jpg   dog

    """

    np.set_printoptions(linewidth=150)

    df_train, df_valid = get_demo_data()

    img_width, img_height = 150, 150
    batch_size = 64
    target_size = (img_width, img_height)

    print("\nTest basic generator.\n")
    for df in (df_train, df_valid):
        i = 0
        for X, Y in generator_from_df(df,
                                      batch_size,
                                      target_size,
                                      features=None):
            print(X[:3, :3, 0])
            print(Y[:3])
            i += 1
            if i > 1:
                break

    # Create random array for bcolz test.
    #
    # In the end, this test does not use bcolz.
    # But, if it did, here are some hints to get you there.
    print("\nTest merged generator.\n")

    nfeatures = 74

    # features_train = pd.np.random.randn(df_train.shape[0], nfeatures)
    # features_valid = pd.np.random.randn(df_valid.shape[0], nfeatures)

    # Make a 2D array, where each row is filled with the values of its
    # index, which will be very convenient for testing the merged
    # model generator.
    # [[0, 0, 0, ...],
    #  [1, 1, 1, ...],
    #  [2, 2, 2, ...],
    #  ...
    # ]
    features_train = np.repeat(np.arange(df_train.shape[0],
                                         dtype=float).reshape((-1, 1)),
                               nfeatures,
                               axis=1)
    features_valid = np.repeat(np.arange(df_valid.shape[0],
                                         dtype=float).reshape((-1, 1)),
                               nfeatures,
                               axis=1)

    # Add a little noise in [0, 1] just to pretend we have "real" data.
    features_train += np.random.rand(*features_train.shape)
    features_valid += np.random.rand(*features_valid.shape)

    fname_train = "mm_features_train_bc"
    if not os.path.exists(fname_train):
        c = bcolz.carray(features_train, rootdir=fname_train, mode='w')
        c.flush()

    fname_valid = "mm_features_valid_bc"
    if not os.path.exists(fname_valid):
        c = bcolz.carray(features_valid, rootdir=fname_valid, mode='w')
        c.flush()

    # Big assumption here: each row of a features matrix corresponds
    # exactly with the image represented by the row of the associated
    # train or valid df.  *YOU* will have to ensure this in your own
    # code.  This is only demo code!

    for df, fname in ((df_train, fname_train), (df_valid, fname_valid)):

        nbatches = df.shape[0] / float(batch_size)

        for i, ((X, features), Y) in enumerate(
                generator_from_df(df,
                                  batch_size,
                                  target_size,
                                  features=fname,
                                  debug_merged=True)):

            if i == 0:
                print(X[:3, :3, 0])
                print(features[:3, :5])
                print(Y[:3])
            else:
                if (i + 1) % 20 == 0:
                    print("%d / %d" % (i + 1, nbatches), end=', ')
                    sys.stdout.flush()

            # Keras automatically breaks out of the infinite "while 1"
            # loop in the generator_from_df().  For this test, we need
            # to break manually.
            if i >= nbatches:
                break

    print("\nSuccessful (I think...) test of multithreaded read of bcolz!")

    print("Note that for this test, all of the above X2 rows should "
          "have the same int() values within a row.")
Example #47
import bcolz
import numpy as np
import pickle
import pandas as pd
import json

''' This file creates the matrix that is needed to convert the words in the data set to word embedding vectors. The 
    word embedder used is GloVe.
'''

glove_path = "glove_6B"

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir=f'{glove_path}/6B.50.dat', mode='w')

with open(f'{glove_path}/glove.6B.50d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(float)
        vectors.append(vect)
    
vectors = bcolz.carray(vectors[1:].reshape((400001, 50)), rootdir=f'{glove_path}/6B.50.dat', mode='w')
vectors.flush()
pickle.dump(words, open(f'{glove_path}/6B.50_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'{glove_path}/6B.50_idx.pkl', 'wb'))
Example #48
def get_val_pair(path, name):
    carray = bcolz.carray(rootdir=os.path.join(path, name), mode='r')
    issame = np.load('{}/{}_list.npy'.format(path, name))

    return carray, issame
Example #49
def save_array(fname, arr):
    " save np matrix or array"
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
Example #50
    def _write_internal(self, iterator, assets):
        """
        Internal implementation of write.

        `iterator` should be an iterator yielding pairs of (asset, ctable).
        """
        total_rows = 0
        first_row = {}
        last_row = {}
        calendar_offset = {}

        # Maps column name -> output carray.
        columns = {
            k: carray(array([], dtype=uint32))
            for k in US_EQUITY_PRICING_BCOLZ_COLUMNS
        }

        earliest_date = None
        sessions = self._calendar.sessions_in_range(self._start_session,
                                                    self._end_session)

        if assets is not None:

            @apply
            def iterator(iterator=iterator, assets=set(assets)):
                for asset_id, table in iterator:
                    if asset_id not in assets:
                        raise ValueError('unknown asset id %r' % asset_id)
                    yield asset_id, table

        count = 0
        for asset_id, table in iterator:
            nrows = len(table)
            for column_name in columns:
                if column_name == 'id':
                    # We know what the content of this column is, so don't
                    # bother reading it.
                    columns['id'].append(
                        full((nrows, ), asset_id, dtype='uint32'), )
                    continue

                columns[column_name].append(table[column_name])

            if earliest_date is None:
                earliest_date = table["day"][0]
            else:
                earliest_date = min(earliest_date, table["day"][0])

            # Bcolz doesn't support ints as keys in `attrs`, so convert
            # assets to strings for use as attr keys.
            asset_key = str(asset_id)

            # Calculate the index into the array of the first and last row
            # for this asset. This allows us to efficiently load single
            # assets when querying the data back out of the table.
            first_row[asset_key] = total_rows
            last_row[asset_key] = total_rows + nrows - 1
            total_rows += nrows

            table_day_to_session = compose(
                self._calendar.minute_to_session_label,
                partial(Timestamp, unit='s', tz='UTC'),
            )
            asset_first_day = table_day_to_session(table['day'][0])
            asset_last_day = table_day_to_session(table['day'][-1])

            asset_sessions = sessions[sessions.slice_indexer(
                asset_first_day, asset_last_day)]
            assert len(table) == len(asset_sessions), (
                'Got {} rows for daily bars table with first day={}, last '
                'day={}, expected {} rows.\n'
                'Missing sessions: {}\n'
                'Extra sessions: {}'.format(
                    len(table),
                    asset_first_day.date(),
                    asset_last_day.date(),
                    len(asset_sessions),
                    asset_sessions.difference(
                        to_datetime(
                            np.array(table['day']),
                            unit='s',
                            utc=True,
                        )).tolist(),
                    to_datetime(
                        np.array(table['day']),
                        unit='s',
                        utc=True,
                    ).difference(asset_sessions).tolist(),
                ))

            # Calculate the number of trading days between the first date
            # in the stored data and the first date of **this** asset. This
            # offset used for output alignment by the reader.
            calendar_offset[asset_key] = sessions.get_loc(asset_first_day)
            count = count + 1

        if count == 0:
            return

        # This writes the table to disk.
        full_table = ctable(
            columns=[
                columns[colname] for colname in US_EQUITY_PRICING_BCOLZ_COLUMNS
            ],
            names=US_EQUITY_PRICING_BCOLZ_COLUMNS,
            rootdir=self._filename,
            mode='w',
        )

        full_table.attrs['first_trading_day'] = (earliest_date if earliest_date
                                                 is not None else iNaT)

        full_table.attrs['first_row'] = first_row
        full_table.attrs['last_row'] = last_row
        full_table.attrs['calendar_offset'] = calendar_offset
        full_table.attrs['calendar_name'] = self._calendar.name
        full_table.attrs['start_session_ns'] = self._start_session.value
        full_table.attrs['end_session_ns'] = self._end_session.value
        full_table.flush()
        return full_table
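The `first_row`/`last_row`/`calendar_offset` attrs written above are what make single-asset reads cheap. A minimal sketch of the read side, assuming a table written by the code above (the rootdir and asset id are hypothetical):

import bcolz

# Hypothetical rootdir and asset id.
table = bcolz.ctable(rootdir='daily_equities.bcolz', mode='r')
asset_key = '24'  # attr keys are strings, matching the writer above

start = table.attrs['first_row'][asset_key]
stop = table.attrs['last_row'][asset_key] + 1  # last_row is inclusive
asset_rows = table[start:stop]  # one contiguous block per asset

# calendar_offset locates this asset's first session in the shared
# calendar, so per-asset output can be aligned against it.
offset = table.attrs['calendar_offset'][asset_key]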
Exemple #51
0
    def _write_internal(self, iterator, assets):
        """
        Internal implementation of write.
        `iterator` should be an iterator yielding pairs of (asset, ctable).
        """
        total_rows = 0
        first_row = {}
        last_row = {}
        calendar_offset = {}

        # Maps column name -> output carray; greek columns are stored as
        # int64, all other columns as uint32.
        columns = {
            k: carray(
                array(
                    [],
                    dtype=(uint32_dtype if k not in GREEKS else int64_dtype)))
            for k in OPTION_PRICING_BCOLZ_COLUMNS
        }

        earliest_date = None
        sessions = self._calendar.sessions_in_range(self._start_session,
                                                    self._end_session)

        if assets is not None:

            @apply
            def iterator(iterator=iterator, assets=set(assets)):
                for asset_id, table in iterator:
                    if asset_id not in assets:
                        raise ValueError("unknown asset id %r" % asset_id)
                    yield asset_id, table

        for asset_id, table in iterator:

            logger.info(f"Writing asset id {asset_id} to disk")

            nrows = len(table)
            for column_name in columns:
                if column_name == "id":
                    # We know what the content of this column is, so don't
                    # bother reading it.
                    columns["id"].append(
                        full((nrows, ), asset_id, dtype="uint32"))
                    continue

                columns[column_name].append(table[column_name])

            if earliest_date is None:
                earliest_date = table["day"][0]
            else:
                earliest_date = min(earliest_date, table["day"][0])

            # Bcolz doesn't support ints as keys in `attrs`, so convert
            # assets to strings for use as attr keys.
            asset_key = str(asset_id)

            # Calculate the index into the array of the first and last row
            # for this asset. This allows us to efficiently load single
            # assets when querying the data back out of the table.
            first_row[asset_key] = total_rows
            last_row[asset_key] = total_rows + nrows - 1
            total_rows += nrows

            table_day_to_session = compose(
                self._calendar.minute_to_session_label,
                partial(Timestamp, unit="s", tz="UTC"),
            )
            asset_first_day = table_day_to_session(table["day"][0])
            asset_last_day = table_day_to_session(table["day"][-1])

            asset_sessions = sessions[sessions.slice_indexer(
                asset_first_day, asset_last_day)]
            assert len(table) == len(asset_sessions), (
                "Got {} rows for options pricing table with first day={}, "
                "last day={}, expected {} rows.\n"
                "Missing sessions: {}\n"
                "Extra sessions: {}".format(
                    len(table),
                    asset_first_day.date(),
                    asset_last_day.date(),
                    len(asset_sessions),
                    asset_sessions.difference(
                        to_datetime(np.array(table["day"]), unit="s",
                                    utc=True)).tolist(),
                    to_datetime(np.array(table["day"]), unit="s",
                                utc=True).difference(asset_sessions).tolist(),
                ))

            # Calculate the number of trading days between the first date
            # in the stored data and the first date of **this** asset. This
            # offset is used by the reader for output alignment.
            calendar_offset[asset_key] = sessions.get_loc(asset_first_day)

        logger.info("Writing complete table to disk")
        # This writes the table to disk.
        full_table = ctable(
            columns=[
                columns[colname] for colname in OPTION_PRICING_BCOLZ_COLUMNS
            ],
            names=OPTION_PRICING_BCOLZ_COLUMNS,
            rootdir=self._filename,
            mode="w",
        )

        full_table.attrs["first_trading_day"] = (earliest_date if earliest_date
                                                 is not None else iNaT)

        full_table.attrs["first_row"] = first_row
        full_table.attrs["last_row"] = last_row
        full_table.attrs["calendar_offset"] = calendar_offset
        full_table.attrs["calendar_name"] = self._calendar.name
        full_table.attrs["start_session_ns"] = self._start_session.value
        full_table.attrs["end_session_ns"] = self._end_session.value
        full_table.flush()
        return full_table
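As the comments note, bcolz `attrs` are persisted as JSON, where object keys must be strings; a quick illustration of why the asset ids are stringified (an in-memory carray, purely for demonstration):

import bcolz

c = bcolz.carray([1, 2, 3])
c.attrs['24'] = 0
# An int key would not survive the JSON round-trip intact
# (json.dumps({24: 0}) == '{"24": 0}'), hence str(asset_id) above.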
Exemple #52
0
def save_array(f, arr):
    try_mkdir(os.path.dirname(f))
    c = bcolz.carray(arr, rootdir=f, mode='w')
    c.flush()
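This helper is usually paired with a loader; a minimal sketch of the inverse, assuming the array was saved by the function above:

import bcolz

def load_array(fname):
    # Open the on-disk carray read-only and materialize it as a numpy array.
    return bcolz.open(fname, mode='r')[:]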
Exemple #53
0
def get_val_pair(path, name):
    # `path` is a pathlib.Path; the carray holds the validation images and
    # the .npy file the same-identity label for each image pair.
    carray = bcolz.carray(rootdir=str(path / name), mode="r")
    issame = np.load(path / "{}_list.npy".format(name))
    return carray, issame
Exemple #54
0
def save_array(fname, arr):
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
Exemple #55
0
def convert_numpy_to_bcolz_carray(x, **kwargs):
    return carray(x, **keyfilter(keywords.__contains__, kwargs))
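`keywords` and `keyfilter` come from outside this snippet; a plausible reconstruction (the exact contents of `keywords` are an assumption) together with a usage line:

import numpy as np
from bcolz import carray
from toolz import keyfilter

# Assumed: the set of keyword arguments bcolz.carray accepts; keyfilter
# silently drops anything else before the call.
keywords = {'dtype', 'cparams', 'dflt', 'expectedlen', 'chunklen',
            'rootdir', 'mode'}

c = convert_numpy_to_bcolz_carray(np.arange(10), expectedlen=10,
                                  not_a_carray_kwarg=True)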
Exemple #56
0
def save_array(fname, arr):
    # Flush explicitly so the data is persisted to the rootdir rather than
    # left buffered in memory.
    bcolz.carray(arr, rootdir=fname, mode='w').flush()
Exemple #57
0
import bcolz
import os
import torch
import numpy as np
from PIL import Image

path = "/ssd-data/lmd/eval_dbs"

names = ["agedb_30"]
for name in names:
    # Open the evaluation database read-only; images are stored CHW.
    carray = bcolz.carray(rootdir=os.path.join(path, name), mode="r")
    print(carray.shape)
    print(carray[-1].transpose(1, 2, 0).shape)
    print((carray[-1].transpose(1, 2, 0))[55:65, 55:65])
    # Convert the last image to HWC uint8 and save it for visual inspection.
    img = Image.fromarray((carray[-1].transpose(1, 2, 0).astype(np.float32) *
                           255).astype(np.uint8))
    img.save("/data2/lmd_jdq/cfp-fp/%d.jpg" % 0)
    # Sanity check: the trailing images should differ from the last one.
    for i in range(1, 20):
        print(np.sum(carray[-i] - carray[-1]))
Exemple #58
0
#Data loader
import cython
import numpy as np
import bcolz
import pickle
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.autograd import Variable

#Dataloader
words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1),
                       rootdir=f'/Users/nilslager/Desktop/gitit.50.dat',
                       mode='w')

#Open up GloVe embeddings and create vectors
with open(f'/Users/nilslager/Desktop/wv_50d_gitit.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(float)  # np.float was removed from NumPy
        vectors.append(vect)

#Construct pickle files
# The call is cut off in the source; completing it by rewriting the same
# rootdir as above with the reshaped (400000 words x 50 dims) matrix.
vectors = bcolz.carray(vectors[1:].reshape((400000, 50)),
                       rootdir=f'/Users/nilslager/Desktop/gitit.50.dat',
                       mode='w')
vectors.flush()
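The snippet ends mid-recipe in the source; the usual continuation pickles the vocabulary and index so the embeddings can be reloaded without re-parsing the text file (the pickle file names are assumptions):

pickle.dump(words, open(f'/Users/nilslager/Desktop/gitit_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'/Users/nilslager/Desktop/gitit_idx.pkl', 'wb'))

# Rebuild a word -> 50-d vector lookup from the saved artifacts:
glove = {w: vectors[word2idx[w]] for w in words}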
Exemple #59
0
def save_array(data, fname):
    print("Saving image dataset at the location " + str(fname) + ".")
    c = bcolz.carray(data, rootdir=fname, mode='w')
    c.flush()
Exemple #60
0
import bcolz as bz

def save_array(fname, arr):
    c = bz.carray(arr, rootdir=fname, mode='w')
    c.flush()