Example #1
 def __setstate__(self, d):
     self.rootdir = d['rootdir']
     self.columns = d['columns']
     self.blocks = dict((col, bcolz.carray(rootdir=os.path.join(self.rootdir, '%s.bcolz' % col)))
             for col in self.columns)
     self.index = bcolz.carray(rootdir=os.path.join(self.rootdir, 'index.bcolz'))
     self._explicitly_given_path = True
Example #2
    def factorize_groupby_cols(self, groupby_cols):
        """

        :type self: ctable
        """
        # first check if the factorized arrays already exist
        # unless we need to refresh the cache
        factor_list = []
        values_list = []

        # factorize the groupby columns
        for col in groupby_cols:

            if self.cache_valid(col):
                col_rootdir = self[col].rootdir
                col_factor_rootdir = col_rootdir + '.factor'
                col_values_rootdir = col_rootdir + '.values'
                col_factor_carray = \
                    bcolz.carray(rootdir=col_factor_rootdir, mode='r')
                col_values_carray = \
                    bcolz.carray(rootdir=col_values_rootdir, mode='r')
            else:
                col_factor_carray, values = ctable_ext.factorize(self[col])
                col_values_carray = \
                    bcolz.carray(values.values(), dtype=self[col].dtype)

            factor_list.append(col_factor_carray)
            values_list.append(col_values_carray)

        return factor_list, values_list
Example #3
def resource_bcolz(uri, dshape=None, expected_dshape=None, **kwargs):
    if os.path.exists(uri):
        try:
            return ctable(rootdir=uri)
        except IOError:  # __rootdirs__ doesn't exist because we aren't a ctable
            return carray(rootdir=uri)
    else:
        if not dshape:
            raise ValueError("Must specify either existing bcolz directory or"
                             " valid datashape")
        dshape = datashape.dshape(dshape)

        dt = datashape.to_numpy_dtype(dshape)
        shape_tail = tuple(map(int, dshape.shape[1:]))  # tail of shape
        if dshape.shape[0] == datashape.var:
            shape = (0,) + shape_tail
        else:
            shape = (int(dshape.shape[0]),) + shape_tail

        x = np.empty(shape=shape, dtype=dt)

        kwargs = keyfilter(keywords.__contains__, kwargs)
        expectedlen = kwargs.pop('expectedlen',
                                 int(expected_dshape[0])
                                 if expected_dshape is not None and
                                 isinstance(expected_dshape[0], datashape.Fixed)
                                 else None)

        if datashape.predicates.isrecord(dshape.measure):
            return ctable(x, rootdir=uri, expectedlen=expectedlen, **kwargs)
        else:
            return carray(x, rootdir=uri, expectedlen=expectedlen, **kwargs)
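A hedged usage sketch (the target path and datashape string are hypothetical):

t = resource_bcolz('/tmp/prices.bcolz',
                   dshape='var * {price: float64, volume: int32}')
print(len(t))  # 0 rows: an empty on-disk ctable with the requested schema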
Example #4
def _from_carray(path, format_categories=None, format_codes=None, format_values=None):
    meta = json.load(open(os.path.join(path, 'meta'), 'r'))

    if meta['type'] == 'category':
        if format_categories in ['npz', 'npy']:
            filename = os.path.join(path, 'categories.%s' % format_categories)
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                categories_values = numpy.load(filename, mmap_mode='r+') # TODO npz not memmap?
                if format_categories == 'npz':
                    categories_values = categories_values['arr_0']
        elif format_categories == 'pickle':
            filename = os.path.join(path, 'categories.pickle')
            with log.timedlogger("reading [%s] %s" % (meta['name'], filename)):
                categories_values = pickle.load(open(filename, 'rb'))
        elif format_categories == 'bcolz':
            rootdir = os.path.join(path, 'categories.bcolz')
            with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)):
                categories_values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r')
                # categories_values = bcolz.carray(rootdir=rootdir, mode='r')[:]
        else:
            raise NotImplementedError("uh oh %s" % (meta['type'],))

        if format_codes == 'bcolz':
            rootdir = os.path.join(path, 'codes.bcolz')
            with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)):
                codes_values = bcolz.carray(rootdir=rootdir, mode='r')[:] # , categories=categories_values)
                # codes_values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r') # , categories=categories_values)
        elif format_codes == 'npy':
            filename = os.path.join(path, 'codes.npy')
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                codes_values = numpy.load(filename, mmap_mode='r+')
        else:
            raise Exception("unknown format_codes type %s" % (format_codes,))

        with log.timedlogger("FastCat construction"):
            s = FastCat(codes_values, categories_values)
    else:
        if format_values == 'bcolz':
            rootdir = os.path.join(path, 'values.bcolz')
            with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)):
                # values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r')
                s = bcolz.carray(rootdir=rootdir, mode='r')[:]
        elif format_values == 'npy':
            filename = os.path.join(path, 'values.npy')
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                s = numpy.load(filename, mmap_mode='r+')
        elif format_values == 'pickle':
            filename = os.path.join(path, 'values.pickle')
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                s = pickle.load(open(filename, 'rb'))
        # with log.timedlogger("FastSeries construction"):
        #     index = pandas.Index(numpy.arange(len(values)), copy=False)
        #     values = SingleBlockManager(values, index, fastpath=True)
        #     s = pandas.Series(data=values, fastpath=True, copy=False, dtype=meta['type'])
        # s = values # [:]
    # logging.warning('Constructing categorical for %s' % meta['name'])
    # s = pandas.Categorical.from_codes(codes_values, categories_values, name=meta['name'])
    return meta, s # codes_values, categories_values
Example #5
File: utils.py Project: Blosc/bcolz
def build_carray(array, rootdir):
    """ Used in ctable.__reduce__

    Pickling functions can't be in pyx files.  Putting this tiny helper
    function here instead.
    """
    from bcolz import carray
    if rootdir:
        return carray(rootdir=rootdir)
    else:
        return carray(array)
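For context, a hedged sketch of the kind of __reduce__ that pairs with this
helper (the real upstream method may differ):

def __reduce__(self):
    # On-disk carrays are rebuilt from rootdir, in-memory ones from a copy
    return (build_carray, (self[:] if not self.rootdir else None, self.rootdir))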
Example #6
def to_dict_of_blocks(d, rootdir):
    """ deprecated. for pure numpy things like {'X_train': X_train, 'X_test': X_test} """
    if os.path.exists(rootdir):
        _move_and_remove_nonblocking(rootdir)
    _mkdir(rootdir)
    meta = {'keys': list(d.keys())}
    json.dump(meta, open(os.path.join(rootdir, 'meta'), 'w'))
    for i, k in enumerate(meta['keys']):
        filename = os.path.join(rootdir, str(i))
        with log.timedlogger('writing {} ({}.shape = {})'.format(filename, k, d[k].shape)):
            bcolz.carray(d[k], rootdir=filename)
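A hedged usage sketch (the rootdir path is hypothetical; the module-level
helpers used above are assumed to be available):

import numpy as np
to_dict_of_blocks({'X_train': np.zeros((100, 8)), 'X_test': np.ones((20, 8))},
                  '/tmp/blocks')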
Example #7
    def handle_frame(self, i, frame):
        if i == 0:
            self.frames = bcolz.carray(np.zeros((0,) + frame.coords.shape, dtype="float32"),
                                 rootdir=os.path.join(self.rootdir, "coords"),
                                 mode='w')
            self.frames.attrs['timestamp'] = self.timestamp
            self.boxes = bcolz.carray(np.zeros((0,) + frame.box.shape, dtype="float32"),
                                 rootdir=os.path.join(self.rootdir, "boxes"),
                                 mode='w')
            self.times = []

        self.frames.append(frame.coords)
        self.boxes.append(frame.box)
        self.times.append(frame.time)
Example #8
    def unique(self, col_or_col_list):
        """
        Return a list of unique values of a column or a list of lists of column list

        :param col_or_col_list: a column or a list of columns
        :return:
        """

        if isinstance(col_or_col_list, list):
            col_is_list = True
            col_list = col_or_col_list
        else:
            col_is_list = False
            col_list = [col_or_col_list]

        output = []

        for col in col_list:

            if self.cache_valid(col):
                # retrieve values from existing disk-based factorization
                col_values_rootdir = self[col].rootdir + '.values'
                carray_values = bcolz.carray(rootdir=col_values_rootdir, mode='r')
                values = list(carray_values)
            else:
                # factorize on-the-fly
                _, values = ctable_ext.factorize(self[col])
                values = values.values()

            output.append(values)

        if not col_is_list:
            output = output[0]

        return output
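A hedged usage sketch (ct is assumed to be a bquery ctable with these columns):

countries = ct.unique('country')            # unique values of one column
pairs = ct.unique(['country', 'currency'])  # a list of two unique-value lists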
Example #9
def open(rootdir, mode='a'):
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object, or None if no objects are found

    """
    # First try with a carray
    obj = None
    try:
        obj = bcolz.carray(rootdir=rootdir, mode=mode)
    except IOError:
        # Not a carray.  Now with a ctable
        try:
            obj = bcolz.ctable(rootdir=rootdir, mode=mode)
        except IOError:
            # Not a ctable
            pass
    return obj
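A minimal round-trip sketch (the rootdir path is hypothetical):

import numpy as np
import bcolz

ca = bcolz.carray(np.arange(10), rootdir='/tmp/demo.bcolz', mode='w')
ca.flush()
same = bcolz.open('/tmp/demo.bcolz', mode='r')  # reopened read-only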
Example #10
    def setChannelData(self, channelName, data, compression=False):
        """Modifies data of channel

        Parameters
        ----------------
        channelName : str
            channel name
        data : numpy array
            channel data
        compression : bool or str
            trigger for data compression
        """
        if compression and CompressionPossible:
            if not isinstance(compression, str):
                if isinstance(compression, int):
                    comp = compression
                else:
                    comp = self._compression_level
                temp = carray(data,
                              cparams=cparams(clevel=comp),
                              expectedlen=int(getsizeof(data) / 10))
            else:
                temp = compressed_data()
                temp.compression(data)
            self._setChannel(channelName, temp, field=dataField)
        else:
            self._setChannel(channelName, data, field=dataField)
Example #11
 def test_create_unsafe_carray_with_unsafe_data(self):
     """ We introduce a safe keyword arg which removes dtype checking.
     We don't want this to interfere with creation.
     """
     b = bcolz.carray([1, 2, 3], dtype='i4', safe=False)
     self.assertEqual(b.safe, False)
     self.assertEqual(b[0], 1)
Example #12
def open(rootdir, mode='a'):
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object (an IOError is raised if no objects are found)

    """
    # First try with a carray
    rootsfile = os.path.join(rootdir, ROOTDIRS)
    if os.path.exists(rootsfile):
        return bcolz.ctable(rootdir=rootdir, mode=mode)
    else:
        return bcolz.carray(rootdir=rootdir, mode=mode)
Example #13
def fill(shape, dflt=None, dtype=float, **kwargs):
    """
    fill(shape, dtype=float, dflt=None, **kwargs)

    Return a new carray object of given shape and type, filled with `dflt`.

    Parameters
    ----------
    shape : int
        Shape of the new array, e.g., ``(2,3)``.
    dflt : Python or NumPy scalar
        The value to be used during the filling process.  If None, values are
        filled with zeros.  Also, the resulting carray will have this value as
        its `dflt` value.
    dtype : data-type, optional
        The desired data-type for the array, e.g., `numpy.int8`.  Default is
        `numpy.float64`.
    kwargs : list of parameters or dictionary
        Any parameter supported by the carray constructor.

    Returns
    -------
    out : carray
        Array filled with `dflt` values with the given shape and dtype.

    See Also
    --------
    ones, zeros

    """

    dtype = np.dtype(dtype)
    if type(shape) in _inttypes + (float,):
        shape = (int(shape),)
    else:
        shape = tuple(shape)
        if len(shape) > 1:
            # Multidimensional shape.
            # The atom will have shape[1:] dims (+ the dtype dims).
            dtype = np.dtype((dtype.base, shape[1:]+dtype.shape))
    length = shape[0]

    # Create the container
    expectedlen = kwargs.pop("expectedlen", length)
    if dtype.kind == "V" and dtype.shape == ():
        raise ValueError("fill does not support ctables objects")
    obj = bcolz.carray([], dtype=dtype, dflt=dflt, expectedlen=expectedlen,
                       **kwargs)
    chunklen = obj.chunklen

    # Then fill it
    # We need an array for the default so as to keep the atom info
    dflt = np.array(obj.dflt, dtype=dtype)
    # Making strides=(0,) below is a trick to create the array fast and
    # without memory consumption
    chunk = np.ndarray(length, dtype=dtype, buffer=dflt, strides=(0,))
    obj.append(chunk)
    obj.flush()
    return obj
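A quick usage sketch:

ca = bcolz.fill((5,), dflt=2, dtype='i4')
print(ca[:])  # -> array([2, 2, 2, 2, 2], dtype=int32)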
Example #14
 def test00(self):
     """Testing unicode types (creation)"""
     a = np.array([[u"aŀle", u"eñe"], [u"açò", u"áèâë"]], dtype="U4")
     b = bcolz.carray(a)
     # print "b.dtype-->", b.dtype
     # print "b->", `b`
     self.assertTrue(a.dtype == b.dtype.base)
     assert_array_equal(a, b[:], "Arrays are not equal")
Example #15
 def set_type(self, dtype):
     if self.dtype != dtype:
         self.fill_nan(self.nan_value(dtype=dtype))
         carray = bcolz.carray(self.carray, dtype=np.dtype(dtype).name)  # TODO do it chunk by chunk + check data
         ct = self._table._ctable
         col_pos = self.position
         ct.delcol(self._name)
         ct.addcol(carray, name=self._name, pos=col_pos)
Example #16
def into(a, b, **kwargs):
    if isinstance(a, type):
        kwargs = keyfilter(carray_keywords.__contains__, kwargs)
        return carray(b, **kwargs)
    else:
        a.append(b)
        a.flush()
        return a
Example #17
 def test00(self):
     """Testing string types (creation)"""
     a = np.array([["ale", "ene"], ["aco", "ieie"]], dtype="S4")
     b = bcolz.carray(a)
     # print "b.dtype-->", b.dtype
     # print "b->", `b`
     self.assertTrue(a.dtype == b.dtype.base)
     assert_array_equal(a, b[:], "Arrays are not equal")
Example #18
    def __init__(self, val):
        super().__init__()

        if isinstance(val, bcolz.carray):
            self._carray = val
        elif isinstance(val, list) or isinstance(val, np.ndarray):
            self._carray = bcolz.carray(val, expectedlen=Column.DEFAULT_BLOCK_LEN)
        else:
            raise DazzleError("Invalid argument in ResultColumn.%s()" % method_name())
Example #19
    def _write_internal(self, filename, calendar, iterator):
        """
        Internal implementation of write.

        `iterator` should be an iterator yielding pairs of (asset, ctable).
        """
        total_rows = 0
        first_row = {}
        last_row = {}
        calendar_offset = {}

        # Maps column name -> output carray.
        columns = {k: carray(array([], dtype=uint32)) for k in US_EQUITY_PRICING_BCOLZ_COLUMNS}

        for asset_id, table in iterator:
            nrows = len(table)
            for column_name in columns:
                if column_name == "id":
                    # We know what the content of this column is, so don't
                    # bother reading it.
                    columns["id"].append(full((nrows,), asset_id, uint32))
                    continue
                columns[column_name].append(self.to_uint32(table[column_name][:], column_name))

            # Bcolz doesn't support ints as keys in `attrs`, so convert
            # assets to strings for use as attr keys.
            asset_key = str(asset_id)

            # Calculate the index into the array of the first and last row
            # for this asset. This allows us to efficiently load single
            # assets when querying the data back out of the table.
            first_row[asset_key] = total_rows
            last_row[asset_key] = total_rows + nrows - 1
            total_rows += nrows

            # Calculate the number of trading days between the first date
            # in the stored data and the first date of **this** asset. This
            # offset is used for output alignment by the reader.

            # HACK: Index with a list so that we get back an array we can pass
            # to self.to_uint32.  We could try to extract this in the loop
            # above, but that makes the logic a lot messier.
            asset_first_day = self.to_uint32(table["day"][[0]], "day")[0]
            calendar_offset[asset_key] = calendar.get_loc(Timestamp(asset_first_day, unit="s", tz="UTC"))

        # This writes the table to disk.
        full_table = ctable(
            columns=[columns[colname] for colname in US_EQUITY_PRICING_BCOLZ_COLUMNS],
            names=US_EQUITY_PRICING_BCOLZ_COLUMNS,
            rootdir=filename,
            mode="w",
        )
        full_table.attrs["first_row"] = first_row
        full_table.attrs["last_row"] = last_row
        full_table.attrs["calendar_offset"] = calendar_offset
        full_table.attrs["calendar"] = calendar.asi8.tolist()
        return full_table
Example #20
    def _open_minute_file(self, field, sid):
        sid = int(sid)

        try:
            carray = self._carrays[field][sid]
        except KeyError:
            carray = self._carrays[field][sid] = bcolz.carray(rootdir=self._get_carray_path(sid, field), mode="r")

        return carray
Example #21
 def test01(self):
     """Testing unicode types (append)"""
     a = np.ones((300, 4), dtype="U4")
     b = bcolz.carray([], dtype="U4").reshape((0, 4))
     b.append(a)
     # print "b.dtype-->", b.dtype
     # print "b->", `b`
     self.assertTrue(a.dtype == b.dtype.base)
     assert_array_equal(a, b[:], "Arrays are not equal")
Example #22
 def test_carray_record_as_object(self):
     src_data = np.empty((10,), dtype=np.dtype('u1,O'))
     src_data[:] = [(i, 's'*i) for i in range(10)]
     carr = bcolz.carray(src_data, dtype=np.dtype('O'))
     self.assertEqual(len(carr.shape), 1)
     self.assertEqual(len(src_data), carr.shape[0])
     for i in range(len(carr)):
         self.assertEqual(carr[i][0], src_data[i][0])
         self.assertEqual(carr[i][1], src_data[i][1])
Example #23
 def test03c(self):
     """Testing `__getitem()__` method with several slices (III)"""
     a = np.arange(120 * 1000).reshape((5 * 1000, 4, 3, 2))
     b = bcolz.carray(a, rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     sl = (slice(None, None, 3), slice(1, 3, 2), slice(1, 4, 2))
     # print "b[sl]->", `b[sl]`
     self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example #24
 def test05c(self):
     """Testing `__getitem()__` method with fancy indexing (III)"""
     a = np.arange(2000).reshape((50, 40))
     b = bcolz.carray(a, rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     sl = (slice(None), [0, 2])
     # print "b[sl]->", `b[sl]`
     self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example #25
    def test_carray_1d_source(self):
        """Testing carray of objects, 1d source"""
        src_data = ['s'*i for i in range(10)]
        carr = bcolz.carray(src_data, dtype=np.dtype('O'))

        self.assertEqual(len(carr.shape), 1)
        self.assertEqual(len(src_data), carr.shape[0])
        for i in range(len(carr)):
            self.assertEqual(carr[i], src_data[i])
            self.assertEqual(carr[i], src_data[i])
Example #26
 def test04c(self):
     """Testing `__getitem()__` method with shape reduction (III)"""
     a = np.arange(6000).reshape((50, 40, 3))
     b = bcolz.carray(a, rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     sl = (1, slice(1, 4, 2), 2)
     # print "b[sl]->", `b[sl]`
     self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example #27
 def test00(self):
     """Testing sum()."""
     a = np.arange(100000).reshape(10, 10000)
     sa = a.sum()
     ac = bcolz.carray(a)
     sac = ac.sum()
     #print "numpy sum-->", sa
     #print "carray sum-->", sac
     self.assertTrue(sa.dtype == sac.dtype, "sum() is not working correctly.")
     self.assertTrue(sa == sac, "sum() is not working correctly.")
Example #28
 def _open_write(self, data=None):
     if self._bcolz is None:
         try:  # append
             self._bcolz = \
                 bcolz.carray(None,
                              rootdir=self._bcolz_dir(),
                              mode='a',
                              # bcolz conf in case mode='a' semantics change to create, otherwise innocuous
                              chunklen=self.chunklen,
                              expectedlen=self.expectedlen,
                              cparams=self.cparams)
         except Exception:  # create
             self._bcolz = \
                 bcolz.carray(data[0:0],
                              rootdir=self._bcolz_dir(),
                              mode='w',
                              chunklen=self.chunklen,
                              expectedlen=self.expectedlen,
                              cparams=self.cparams)
Example #29
 def read_meta_and_open(self):
     """Read the meta-information and initialize structures."""
     # Get the directories of the columns
     rootsfile = os.path.join(self.rootdir, ROOTDIRS)
     with open(rootsfile, "rb") as rfile:
         data = json.loads(rfile.read())
     # JSON returns unicode (?)
     self.names = [str(name) for name in data["names"]]
     # Initialize the cols by instantiating the carrays
     for name, dir_ in data["dirs"].items():
         self._cols[str(name)] = bcolz.carray(rootdir=dir_, mode=self.mode)
Example #30
    def cache_factor(self, col_list, refresh=False):
        """
        Existing todos here are: these should be hidden helper carrays
        As in: not normal columns that you would normally see as a user

        The factor (label index) carray is as long as the original carray
        (and the rest of the table therefore)
        But the (unique) values carray is not as long (as long as the number
        of unique values)

        :param col_list:
        :param refresh:
        :return:
        """

        if not self.rootdir:
            raise TypeError('Only out-of-core ctables can have '
                            'factorization caching at the moment')

        if not isinstance(col_list, list):
            col_list = [col_list]

        for col in col_list:

            # create cache if needed
            if refresh or not self.cache_valid(col):
                col_rootdir = self[col].rootdir
                col_factor_rootdir = col_rootdir + '.factor'
                col_values_rootdir = col_rootdir + '.values'

                carray_factor = \
                    bcolz.carray([], dtype='int64', expectedlen=self.size,
                                   rootdir=col_factor_rootdir, mode='w')
                _, values = \
                    ctable_ext.factorize(self[col], labels=carray_factor)
                carray_factor.flush()

                carray_values = \
                    bcolz.carray(values.values(), dtype=self[col].dtype,
                                 rootdir=col_values_rootdir, mode='w')
                carray_values.flush()
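A hedged usage sketch (assumes ct is an on-disk bquery ctable with a
'country' column):

ct.cache_factor(['country'])                # writes <col_rootdir>.factor and .values
ct.cache_factor(['country'], refresh=True)  # force a rebuild of the cache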
Example #31
from time import time

import numpy as np

import bcolz

N = 1e8  # the number of elements in x
clevel = 5  # the compression level
sexpr = "(x-1) < 10."  # the expression to compute
# sexpr = "((x-1) % 1000) == 0."  # the expression to compute
#sexpr = "(2*x**3+.3*y**2+z+1)<0"  # the expression to compute

cparams = bcolz.cparams(clevel)

print("Creating inputs...")

x = np.arange(N)
cx = bcolz.carray(x, cparams=cparams)
if 'y' not in sexpr:
    ct = bcolz.ctable((cx, ), names=['x'])
else:
    y = np.arange(N)
    z = np.arange(N)
    cy = bcolz.carray(y, cparams=cparams)
    cz = bcolz.carray(z, cparams=cparams)
    ct = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z'])

print("Evaluating...", sexpr)
t0 = time()
cbout = ct.eval(sexpr)
print("Time for evaluation--> %.3f" % (time() - t0, ))
print("Converting to numy arrays")
bout = cbout[:]
Example #32
    # then get the coordinates
    sx = 3 * i
    ex = 3 * (i + 1)
    xi = np.array(xyz[sx:ex])
    xyzi = np.stack([c for c in xi],
                    axis=1) / 100  # have to scale by 100 to match PDB

    # lastly convert the mask to indices (np.where; np.map does not exist)
    msk_idx = np.where(np.array(list(masks[i])) == '+')[0]

    # bracket id or get "setting an array element with a sequence"
    zt = np.array([[id], seq, pssmi, xyzi, msk_idx])

    if i == 0:
        bc = bcolz.carray([zt],
                          rootdir=data_path + 'testing.bc',
                          mode='w',
                          expectedlen=len(ids))
        bc.flush()
    else:
        # reopen in append mode; mode='w' would overwrite the data written so far
        bc = bcolz.carray(rootdir=data_path + 'testing.bc', mode='a')
        bc.append([zt])
        bc.flush()

# %%
from pathlib import Path

home = str(Path.home())
pn_path = home + '/Downloads/casp7/casp7/testing'
# pn_path = os.curdir + '/../rgn_pytorch/data/text_sample'
dataset = ProteinNetDataset(pn_path)
trn_data = DataLoader(dataset, batch_size=32, shuffle=True)
Example #33
def save_array(data_folder, fname, arr):
    fname = os.path.join(data_folder, fname)
    print("Saving to {0} ...".format(fname))
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
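A hedged sketch of a matching loader (the load_array name is hypothetical;
bcolz.open re-attaches the on-disk carray):

def load_array(data_folder, fname):
    fname = os.path.join(data_folder, fname)
    return bcolz.open(rootdir=fname)[:]  # materialize back into a numpy array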
Example #34
def load_dataset_face(args,
                      INPUT_SIZE=[112, 112],
                      RGB_MEAN=[0.5, 0.5, 0.5],
                      RGB_STD=[0.5, 0.5, 0.5],
                      val_datasets=[
                          'lfw', 'cfp_ff', 'cfp_fp', 'agedb_30', 'calfw',
                          'cplfw', 'vgg2_fp'
                      ]):
    train_transform = transforms.Compose([
        transforms.Resize(
            [int(128 * INPUT_SIZE[0] / 112),
             int(128 * INPUT_SIZE[0] / 112)]),  # smaller side resized
        transforms.RandomCrop([INPUT_SIZE[0], INPUT_SIZE[1]]),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD),
    ])
    train_data = dset.ImageFolder(
        os.path.join(args.data_path, 'CASIA-maxpy-align'), train_transform)
    weights = torch.DoubleTensor(
        make_weights_for_balanced_classes(train_data.imgs,
                                          len(train_data.classes)))
    if args.distributed:
        from catalyst.data.sampler import DistributedSamplerWrapper
        train_sampler = DistributedSamplerWrapper(
            torch.utils.data.sampler.WeightedRandomSampler(
                weights, len(weights)))
    else:
        train_sampler = torch.utils.data.sampler.WeightedRandomSampler(
            weights, len(weights))
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(
            [int(128 * INPUT_SIZE[0] / 112),
             int(128 * INPUT_SIZE[0] / 112)]),
        transforms.CenterCrop([INPUT_SIZE[0], INPUT_SIZE[1]]),
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD)
    ])
    val_loaders = []
    for name in val_datasets:
        carray = bcolz.carray(rootdir=os.path.join(args.data_path, name),
                              mode='r')
        val_data_tensor = torch.tensor(carray[:, [2, 1, 0], :, :]) * 0.5 + 0.5
        val_data = TensorsDataset(val_data_tensor, val_transform)
        val_loader = torch.utils.data.DataLoader(val_data,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 num_workers=args.workers,
                                                 pin_memory=True,
                                                 sampler=None)
        issame = np.load('{}/{}_list.npy'.format(args.data_path, name))
        val_loaders.append((name, val_loader, issame))

    return train_loader, val_loaders
Example #35
def fill(shape, dflt=None, dtype=float, **kwargs):
    """fill(shape, dtype=float, dflt=None, **kwargs)

    Return a new carray or ctable object of given shape and type, filled with
    `dflt`.

    Parameters
    ----------
    shape : int
        Shape of the new array, e.g., ``(2,3)``.
    dflt : Python or NumPy scalar
        The value to be used during the filling process.  If None, values are
        filled with zeros.  Also, the resulting carray will have this value as
        its `dflt` value.
    dtype : data-type, optional
        The desired data-type for the array, e.g., `numpy.int8`.  Default is
        `numpy.float64`.
    kwargs : list of parameters or dictionary
        Any parameter supported by the carray constructor.

    Returns
    -------
    out : carray or ctable
        Bcolz object filled with `dflt` values with the given shape and dtype.

    See Also
    --------
    ones, zeros

    """
    def fill_helper(obj, dtype=None, length=None):
        """Helper function to fill a carray with default values"""
        assert isinstance(obj, bcolz.carray)
        assert dtype is not None
        assert length is not None
        if type(length) is float:
            length = int(length)

        # Then fill it
        # We need an array for the default so as to keep the atom info
        dflt = np.array(obj.dflt, dtype=dtype.base)
        # Fill chunk with defaults
        chunk = np.empty(length, dtype=dtype)
        chunk[:] = dflt
        obj.append(chunk)
        obj.flush()

    dtype = np.dtype(dtype)
    if type(shape) in _inttypes + (float, ):
        shape = (int(shape), )
    else:
        shape = tuple(shape)
        if len(shape) > 1:
            # Multidimensional shape.
            # The atom will have shape[1:] dims (+ the dtype dims).
            dtype = np.dtype((dtype.base, shape[1:] + dtype.shape))
    length = shape[0]

    # Create the container
    expectedlen = kwargs.pop("expectedlen", length)
    if dtype.kind == "V" and dtype.shape == ():
        list_ca = []
        # force carrays to live in memory
        base_rootdir = kwargs.pop('rootdir', None)
        for name, col_dtype in dtype.descr:
            dflt = np.zeros((), dtype=col_dtype)
            ca = bcolz.carray([],
                              dtype=col_dtype,
                              dflt=dflt,
                              expectedlen=expectedlen,
                              **kwargs)
            fill_helper(ca, dtype=ca.dtype, length=length)
            list_ca.append(ca)
        # bring rootdir back, ctable should live either on-disk or in-memory
        kwargs['rootdir'] = base_rootdir
        obj = bcolz.ctable(list_ca, names=dtype.names, **kwargs)
    else:
        obj = bcolz.carray([],
                           dtype=dtype,
                           dflt=dflt,
                           expectedlen=expectedlen,
                           **kwargs)
        fill_helper(obj, dtype=dtype, length=length)

    return obj
Example #36
'''
    for ii in img_range:
        print('%d / %d' % (ii, img_range.shape[0]))
        llh_rpy = pva_interp(img_times[ii])
        lon_lat_h = llh_rpy[0:3]
        c_n_v = nu.rpy_to_cnb(*llh_rpy[3:])
        feat_df, desc = aie.extract_features(images[ii], lon_lat_h, c_n_v)
        center_wgs = tf.project_center(lon_lat_h, c_n_v).flatten()
        df_path = 'feat/df/feat_%d.hdf' % ii
        desc_path = 'feat/desc/desc_%d' % ii

        if feat_df is None:
            feat_meta.loc[ii] = [
                0, center_wgs[0], center_wgs[1], df_path, desc_path
            ]
        else:
            print("%d :: %d Feat" % (ii, desc.shape[0]))
            feat_meta.loc[ii] = [
                desc.shape[0], center_wgs[0], center_wgs[1], df_path, desc_path
            ]
            feat_df.to_hdf(os.path.join(out_path, df_path),
                           'feat_df',
                           mode='w',
                           format='table',
                           complib='zlib',
                           complevel=7)
            bcolz.carray(desc.astype(np.float32),
                         rootdir=os.path.join(out_path, desc_path),
                         mode='w').flush()

    feat_meta.to_hdf(os.path.join(out_path, 'feat_meta.hdf'), key='feat_meta')
    flight.close()
'''

import sys
import bcolz
import pickle
import numpy as np

glove_path = sys.argv[1]  # 'C:\\Users\\Jakob\\Downloads\\glove.840B.300d'
size = int(sys.argv[2])  # 840
dim = int(sys.argv[3])  # 300

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1),
                       rootdir=f'{glove_path}/{size}B.{dim}.dat',
                       mode='w')

with open(f'{glove_path}/glove.{size}B.{dim}d.txt', 'rb') as f:
    for l in f:
        line = l.split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(float)
        vectors.append(vect)

vectors = bcolz.carray(vectors[1:].reshape((-1, dim)),
                       rootdir=f'{glove_path}/{size}B.{dim}.dat',
                       mode='w')
Example #38
from time import time

import numpy as np

import bcolz

N = int(1e8)
# a = np.arange(N, dtype='f8')
a = np.random.randint(0, 10, N).astype('bool')

t0 = time()
sa = a.sum()
print("Time sum() numpy --> %.3f" % (time() - t0))

t0 = time()
ac = bcolz.carray(a, cparams=bcolz.cparams(9))
print("Time carray conv --> %.3f" % (time() - t0))
print("ac-->", repr(ac))

t0 = time()
sac = ac.sum()
#sac = ac.sum(dtype=np.dtype('i8'))
print("Time sum() carray --> %.3f" % (time() - t0))

# t0 = time()
# sac = sum(i for i in ac)
# print "Time sum() carray (iter) --> %.3f" % (time()-t0)

print("sa, sac-->", sa, sac, type(sa), type(sac))
assert (sa == sac)
Example #39
def _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor,
                 **kwargs):
    """Perform the evaluation in blocks."""

    # Compute the optimal block size (in elements)
    # The next is based on experiments with bench/ctable-query.py
    # and the 'movielens-bench' repository
    if vm == "numexpr":
        bsize = 2**24
    else:
        bsize = 2**22
    bsize //= typesize
    # Evaluation seems more efficient if block size is a power of 2
    bsize = 2 ** (int(math.log(bsize, 2)))
    if vlen < 100*1000:
        bsize //= 8
    elif vlen < 1000*1000:
        bsize //= 4
    elif vlen < 10*1000*1000:
        bsize //= 2
    # Protection against too large atomsizes
    if bsize == 0:
        bsize = 1

    vars_ = {}
    # Get temporaries for vars
    maxndims = 0
    for name in vars:
        var = vars[name]
        if hasattr(var, "__len__"):
            ndims = len(var.shape) + len(var.dtype.shape)
            if ndims > maxndims:
                maxndims = ndims
            if len(var) > bsize and hasattr(var, "_getrange"):
                vars_[name] = np.empty(bsize, dtype=var.dtype)

    for i in xrange(0, vlen, bsize):
        # Get buffers for vars
        for name in vars:
            var = vars[name]
            if hasattr(var, "__len__") and len(var) > bsize:
                if hasattr(var, "_getrange"):
                    if i+bsize < vlen:
                        var._getrange(i, bsize, vars_[name])
                    else:
                        vars_[name] = var[i:]
                else:
                    vars_[name] = var[i:i+bsize]
            else:
                if hasattr(var, "__getitem__"):
                    vars_[name] = var[:]
                else:
                    vars_[name] = var

        # Perform the evaluation for this block
        if vm == "python":
            res_block = _eval(expression, vars_)
        else:
            try:
                res_block = bcolz.numexpr.evaluate(expression,
                                                   local_dict=vars_)
            except ValueError:
                # numexpr cannot handle this. Fall back to a pure "python" VM.
                return _eval_blocks(
                    expression, vars, vlen, typesize, "python",
                    out_flavor, **kwargs)

        if i == 0:
            # Detection of reduction operations
            scalar = False
            dim_reduction = False
            if len(res_block.shape) == 0:
                scalar = True
                result = res_block
                continue
            elif len(res_block.shape) < maxndims:
                dim_reduction = True
                result = res_block
                continue
            # Get a decent default for expectedlen
            if out_flavor == "carray":
                nrows = kwargs.pop('expectedlen', vlen)
                result = bcolz.carray(res_block, expectedlen=nrows, **kwargs)
            else:
                out_shape = list(res_block.shape)
                out_shape[0] = vlen
                result = np.empty(out_shape, dtype=res_block.dtype)
                result[:bsize] = res_block
        else:
            if scalar or dim_reduction:
                result += res_block
            elif out_flavor == "carray":
                result.append(res_block)
            else:
                result[i:i+bsize] = res_block

    if isinstance(result, bcolz.carray):
        result.flush()
    if scalar:
        return result[()]
    return result
Example #40
def test_load_array(tempdir):
    rootdir = tempdir.path
    bcolz.carray(np.arange(0, 5), mode='w', rootdir=rootdir)

    array = core.load_array(rootdir)
    np.testing.assert_equal(array, [0, 1, 2, 3, 4])
Example #41
def get_val_pair(path, name):
    rootdir = os.path.join(path, name)
    carray = bcolz.carray(rootdir=rootdir, mode='r')
    np_path = os.path.join(path, '{}_list.npy'.format(name))
    issame = np.load(np_path)
    return carray, issame
Example #42
'''
This script processes and generates GloVe embeddings
'''
# coding: utf-8

import pickle
from preprocess import Vocabulary
import numpy as np
import json
from scipy import misc
import bcolz

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir='glove.6B/6B.300.dat', mode='w')

with open('glove.6B/glove.6B.300d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(float)
        vectors.append(vect)

vectors = bcolz.carray(vectors[1:].reshape((400000, 300)),
                       rootdir='glove.6B/6B.300.dat',
                       mode='w')
vectors.flush()
Example #43
def fromiter(iterable, dtype, count, **kwargs):
    """
    fromiter(iterable, dtype, count, **kwargs)

    Create a carray/ctable from an `iterable` object.

    Parameters
    ----------
    iterable : iterable object
        An iterable object providing data for the carray.
    dtype : numpy.dtype instance
        Specifies the type of the outcome object.
    count : int
        The number of items to read from iterable. If set to -1, means that
        the iterable will be used until exhaustion (not recommended, see note
        below).
    kwargs : list of parameters or dictionary
        Any parameter supported by the carray/ctable constructors.

    Returns
    -------
    out : a carray/ctable object

    Notes
    -----
    Please specify `count` to both improve performance and save memory.  It
    allows `fromiter` to avoid looping over the iterable twice (which is
    slow).  It also avoids memory leaks (which can be important for large
    iterables).

    """
    # Check for a true iterable
    if not hasattr(iterable, "next"):
        iterable = iter(iterable)

    # Try to guess the final length
    expected = count
    if count == -1:
        # Try to guess the size of the iterable length
        if hasattr(iterable, "__length_hint__"):
            count = iterable.__length_hint__()
            expected = count

    # First, create the container
    expectedlen = kwargs.pop("expectedlen", expected)
    dtype = np.dtype(dtype)
    if dtype.kind == "V":
        # A ctable
        obj = bcolz.ctable(np.array([], dtype=dtype),
                           expectedlen=expectedlen,
                           **kwargs)
        chunklen = sum(obj.cols[name].chunklen
                       for name in obj.names) // len(obj.names)
    else:
        # A carray
        obj = bcolz.carray(np.array([], dtype=dtype),
                           expectedlen=expectedlen,
                           **kwargs)
        chunklen = obj.chunklen

    # Then fill it
    while True:
        chunk = np.fromiter(it.islice(iterable, chunklen), dtype=dtype)
        if len(chunk) == 0:
            # Iterable has been exhausted
            break
        obj.append(chunk)
    obj.flush()
    return obj
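A quick usage sketch:

ca = bcolz.fromiter((i * i for i in range(1000)), dtype='i4', count=1000)
print(ca[-1])  # -> 998001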
Example #44
log = logging.getLogger(__name__)

log.debug("fine tune all layers")
log.debug("using all_model_weight_path :" + model_path)
log.debug("using test_result :" + test_result)
log.debug("using loss_history_csv_name :" + loss_history_csv_name)

train_name = basedir + '/pp_train_data'
valid_name = basedir + '/pp_valid_data'
test_name = basedir + '/pp_test_data'

temp_dir = "/tmp/"

## load original bcolz data from disk
# read from disk and check size
valid_data = bcolz.carray(rootdir=valid_name + '_data.bclz', mode='r')
test_data = bcolz.carray(rootdir=test_name + '_data.bclz', mode='r')
train_data = bcolz.carray(rootdir=train_name + '_data.bclz', mode='r')

valid_labels = bcolz.carray(rootdir=valid_name + '_labels.bclz', mode='r')
test_labels = bcolz.carray(rootdir=test_name + '_labels.bclz', mode='r')
train_labels = bcolz.carray(rootdir=train_name + '_labels.bclz', mode='r')

log.debug("loading original data from disk")
log.debug(valid_data.shape)
log.debug(test_data.shape)
log.debug(train_data.shape)

log.debug(valid_labels.shape)
log.debug(test_labels.shape)
log.debug(train_labels.shape)
Example #45
def arange(start=None, stop=None, step=None, dtype=None, **kwargs):
    """
    arange([start,] stop[, step,], dtype=None, **kwargs)

    Return evenly spaced values within a given interval.

    Values are generated within the half-open interval ``[start, stop)``
    (in other words, the interval including `start` but excluding `stop`).
    For integer arguments the function is equivalent to the Python built-in
    `range <http://docs.python.org/lib/built-in-funcs.html>`_ function,
    but returns a carray rather than a list.

    Parameters
    ----------
    start : number, optional
        Start of interval.  The interval includes this value.  The default
        start value is 0.
    stop : number
        End of interval.  The interval does not include this value.
    step : number, optional
        Spacing between values.  For any output `out`, this is the distance
        between two adjacent values, ``out[i+1] - out[i]``.  The default
        step size is 1.  If `step` is specified, `start` must also be given.
    dtype : dtype
        The type of the output array.  If `dtype` is not given, infer the data
        type from the other input arguments.
    kwargs : list of parameters or dictionary
        Any parameter supported by the carray constructor.

    Returns
    -------
    out : carray
        Bcolz object made of evenly spaced values.

        For floating point arguments, the length of the result is
        ``ceil((stop - start)/step)``.  Because of floating point overflow,
        this rule may result in the last element of `out` being greater
        than `stop`.

    """

    # Check start, stop, step values
    if (start, stop) == (None, None):
        raise ValueError("You must pass a `stop` value at least.")
    elif stop is None:
        start, stop = 0, start
    elif start is None:
        start, stop = 0, stop
    if step is None:
        step = 1

    # Guess the dtype
    if dtype is None:
        if type(stop) in _inttypes:
            dtype = np.dtype(np.int_)
    dtype = np.dtype(dtype)
    stop = int(stop)

    # Create the container
    expectedlen = kwargs.pop("expectedlen", stop)
    if dtype.kind == "V":
        raise ValueError("arange does not support ctables yet.")
    else:
        obj = bcolz.carray(np.array([], dtype=dtype),
                           expectedlen=expectedlen,
                           **kwargs)
        chunklen = obj.chunklen

    # Then fill it
    incr = chunklen * step  # the increment for each chunk
    incr += step - (incr % step)  # make it match step boundary
    bstart, bstop = start, start + incr
    while bstart < stop:
        if bstop > stop:
            bstop = stop
        chunk = np.arange(bstart, bstop, step, dtype=dtype)
        obj.append(chunk)
        bstart = bstop
        bstop += incr
    obj.flush()
    return obj
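A quick usage sketch:

ca = bcolz.arange(3)           # carray of [0, 1, 2]
cb = bcolz.arange(2, 20, 5)    # carray of [2, 7, 12, 17]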
Example #46
def test_generator():
    """Simple function to test return behavior of generator code above.

    This runs with and without merged model version.

df_train:
      object_id                                imgpath  target                          orig label
7          1518  /tmp/path/to/imgs/518/01/dog_1518.jpg       1  data/train/dogs/dog.1518.jpg   dog
1113       1662  /tmp/path/to/imgs/662/01/cat_1662.jpg       0  data/train/cats/cat.1662.jpg   cat
980        1409  /tmp/path/to/imgs/409/01/dog_1409.jpg       1  data/train/dogs/dog.1409.jpg   dog
1615       1813  /tmp/path/to/imgs/813/01/cat_1813.jpg       0  data/train/cats/cat.1813.jpg   cat
1029       1760  /tmp/path/to/imgs/760/01/cat_1760.jpg       0  data/train/cats/cat.1760.jpg   cat
df_valid:

     object_id                                imgpath  target                               orig label
787       7747  /tmp/path/to/imgs/747/07/cat_7747.jpg       0  data/validation/cats/cat.7747.jpg   cat
165       7563  /tmp/path/to/imgs/563/07/dog_7563.jpg       1  data/validation/dogs/dog.7563.jpg   dog
749       7517  /tmp/path/to/imgs/517/07/cat_7517.jpg       0  data/validation/cats/cat.7517.jpg   cat
458       7742  /tmp/path/to/imgs/742/07/cat_7742.jpg       0  data/validation/cats/cat.7742.jpg   cat
225       7479  /tmp/path/to/imgs/479/07/dog_7479.jpg       1  data/validation/dogs/dog.7479.jpg   dog

    """

    np.set_printoptions(linewidth=150)

    df_train, df_valid = get_demo_data()

    img_width, img_height = 150, 150
    batch_size = 64
    target_size = (img_width, img_height)

    print("\nTest basic generator.\n")
    for df in (df_train, df_valid):
        i = 0
        for X, Y in generator_from_df(df,
                                      batch_size,
                                      target_size,
                                      features=None):
            print(X[:3, :3, 0])
            print(Y[:3])
            i += 1
            if i > 1:
                break

    # Create random array for bcolz test.
    #
    # In the end, this test does not use bcolz.
    # But, if it did, here are some hints to get you there.
    print("\nTest merged generator.\n")

    nfeatures = 74

    # features_train = pd.np.random.randn(df_train.shape[0], nfeatures)
    # features_valid = pd.np.random.randn(df_valid.shape[0], nfeatures)

    # Make a 2D array, where each row is filled with the values of its
    # index, which will be very convenient for testing the merged
    # model generator.
    # [[0, 0, 0, ...],
    #  [1, 1, 1, ...],
    #  [2, 2, 2, ...],
    #  ...
    # ]
    features_train = np.repeat(np.arange(df_train.shape[0],
                                         dtype=float).reshape((-1, 1)),
                               nfeatures,
                               axis=1)
    features_valid = np.repeat(np.arange(df_valid.shape[0],
                                         dtype=float).reshape((-1, 1)),
                               nfeatures,
                               axis=1)

    # Add a little noise in [0, 1] just to pretend we have "real" data.
    features_train += np.random.rand(*features_train.shape)
    features_valid += np.random.rand(*features_valid.shape)

    fname_train = "mm_features_train_bc"
    if not os.path.exists(fname_train):
        c = bcolz.carray(features_train, rootdir=fname_train, mode='w')
        c.flush()

    fname_valid = "mm_features_valid_bc"
    if not os.path.exists(fname_valid):
        c = bcolz.carray(features_valid, rootdir=fname_valid, mode='w')
        c.flush()

    # Big assumption here: each row of a features matrix corresponds
    # exactly with the image represented by the row of the associated
    # train or valid df.  *YOU* will have to ensure this in your own
    # code.  This is only demo code!

    for df, fname in ((df_train, fname_train), (df_valid, fname_valid)):

        nbatches = df.shape[0] / float(batch_size)

        for i, ((X, features), Y) in enumerate(
                generator_from_df(df,
                                  batch_size,
                                  target_size,
                                  features=fname,
                                  debug_merged=True)):

            if i == 0:
                print(X[:3, :3, 0])
                print(features[:3, :5])
                print(Y[:3])
            else:
                if (i + 1) % 20 == 0:
                    print("%d / %d" % (i + i, nbatches), end=', ')
                    sys.stdout.flush()

            # Keras automatically breaks out of the infinite "while 1"
            # loop in the generator_from_df().  For this test, we need
            # to break manually.
            if i >= nbatches:
                break

    print("\nSuccessful (I think...) test of multithreaded read of bcolz!")

    print("Note that for this test, all of the above X2 rows should"\
          "have the same int() values within a row.")
Example #47
import bcolz
import numpy as np
import pickle
import pandas as pd
import json

''' This file creates the matrix that is needed to convert the words in the data set to word embedding vectors. The 
    word embedder used is GloVe.
'''

glove_path = "glove_6B"

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir=f'{glove_path}/6B.50.dat', mode='w')

with open(f'{glove_path}/glove.6B.50d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(float)
        vectors.append(vect)
    
vectors = bcolz.carray(vectors[1:].reshape((400001, 50)), rootdir=f'{glove_path}/6B.50.dat', mode='w')
vectors.flush()
pickle.dump(words, open(f'{glove_path}/6B.50_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'{glove_path}/6B.50_idx.pkl', 'wb'))
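A hedged read-back sketch using the files written above:

vectors = bcolz.open(f'{glove_path}/6B.50.dat')[:]
words = pickle.load(open(f'{glove_path}/6B.50_words.pkl', 'rb'))
word2idx = pickle.load(open(f'{glove_path}/6B.50_idx.pkl', 'rb'))
glove = {w: vectors[word2idx[w]] for w in words}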
Example #48
def get_val_pair(path, name):
    carray = bcolz.carray(rootdir=os.path.join(path, name), mode='r')
    issame = np.load('{}/{}_list.npy'.format(path, name))

    return carray, issame
Example #49
def save_array(fname, arr):
    " save np matrix or array"
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
Example #50
    def _write_internal(self, iterator, assets):
        """
        Internal implementation of write.

        `iterator` should be an iterator yielding pairs of (asset, ctable).
        """
        total_rows = 0
        first_row = {}
        last_row = {}
        calendar_offset = {}

        # Maps column name -> output carray.
        columns = {
            k: carray(array([], dtype=uint32))
            for k in US_EQUITY_PRICING_BCOLZ_COLUMNS
        }

        earliest_date = None
        sessions = self._calendar.sessions_in_range(self._start_session,
                                                    self._end_session)

        if assets is not None:

            @apply
            def iterator(iterator=iterator, assets=set(assets)):
                for asset_id, table in iterator:
                    if asset_id not in assets:
                        raise ValueError('unknown asset id %r' % asset_id)
                    yield asset_id, table

        count = 0
        for asset_id, table in iterator:
            nrows = len(table)
            for column_name in columns:
                if column_name == 'id':
                    # We know what the content of this column is, so don't
                    # bother reading it.
                    columns['id'].append(
                        full((nrows, ), asset_id, dtype='uint32'), )
                    continue

                columns[column_name].append(table[column_name])

            if earliest_date is None:
                earliest_date = table["day"][0]
            else:
                earliest_date = min(earliest_date, table["day"][0])

            # Bcolz doesn't support ints as keys in `attrs`, so convert
            # assets to strings for use as attr keys.
            asset_key = str(asset_id)

            # Calculate the index into the array of the first and last row
            # for this asset. This allows us to efficiently load single
            # assets when querying the data back out of the table.
            first_row[asset_key] = total_rows
            last_row[asset_key] = total_rows + nrows - 1
            total_rows += nrows

            table_day_to_session = compose(
                self._calendar.minute_to_session_label,
                partial(Timestamp, unit='s', tz='UTC'),
            )
            asset_first_day = table_day_to_session(table['day'][0])
            asset_last_day = table_day_to_session(table['day'][-1])

            asset_sessions = sessions[sessions.slice_indexer(
                asset_first_day, asset_last_day)]
            assert len(table) == len(asset_sessions), (
                'Got {} rows for daily bars table with first day={}, last '
                'day={}, expected {} rows.\n'
                'Missing sessions: {}\n'
                'Extra sessions: {}'.format(
                    len(table),
                    asset_first_day.date(),
                    asset_last_day.date(),
                    len(asset_sessions),
                    asset_sessions.difference(
                        to_datetime(
                            np.array(table['day']),
                            unit='s',
                            utc=True,
                        )).tolist(),
                    to_datetime(
                        np.array(table['day']),
                        unit='s',
                        utc=True,
                    ).difference(asset_sessions).tolist(),
                ))

            # Calculate the number of trading days between the first date
            # in the stored data and the first date of **this** asset. This
            # offset is used for output alignment by the reader.
            calendar_offset[asset_key] = sessions.get_loc(asset_first_day)
            count = count + 1

        if count == 0:
            return

        # This writes the table to disk.
        full_table = ctable(
            columns=[
                columns[colname] for colname in US_EQUITY_PRICING_BCOLZ_COLUMNS
            ],
            names=US_EQUITY_PRICING_BCOLZ_COLUMNS,
            rootdir=self._filename,
            mode='w',
        )

        full_table.attrs['first_trading_day'] = (earliest_date if earliest_date
                                                 is not None else iNaT)

        full_table.attrs['first_row'] = first_row
        full_table.attrs['last_row'] = last_row
        full_table.attrs['calendar_offset'] = calendar_offset
        full_table.attrs['calendar_name'] = self._calendar.name
        full_table.attrs['start_session_ns'] = self._start_session.value
        full_table.attrs['end_session_ns'] = self._end_session.value
        full_table.flush()
        return full_table
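The first_row/last_row bookkeeping above exists so that a single asset's rows can be sliced back out of the on-disk table without scanning the rest. A minimal read-back sketch (the helper below is an assumption for illustration, not zipline's actual reader):

from bcolz import ctable

def read_asset_rows(rootdir, asset_id):
    # Hypothetical helper: slice one asset's bars using the attrs written
    # by the writer above. Attr keys are strings, matching str(asset_id).
    table = ctable(rootdir=rootdir, mode='r')
    first = table.attrs['first_row'][str(asset_id)]
    last = table.attrs['last_row'][str(asset_id)]
    return table[first:last + 1]  # structured numpy array of this asset's rows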
Example #51
0
    def _write_internal(self, iterator, assets):
        """
        Internal implementation of write.
        `iterator` should be an iterator yielding pairs of (asset, ctable).
        """
        total_rows = 0
        first_row = {}
        last_row = {}
        calendar_offset = {}

        # Maps column name -> output carray, using int64 for the greeks.
        columns = {
            k: carray(
                array(
                    [],
                    dtype=(uint32_dtype if k not in GREEKS else int64_dtype)))
            for k in OPTION_PRICING_BCOLZ_COLUMNS
        }

        earliest_date = None
        sessions = self._calendar.sessions_in_range(self._start_session,
                                                    self._end_session)

        if assets is not None:

            @apply
            def iterator(iterator=iterator, assets=set(assets)):
                for asset_id, table in iterator:
                    if asset_id not in assets:
                        raise ValueError("unknown asset id %r" % asset_id)
                    yield asset_id, table

        for asset_id, table in iterator:

            logger.info(f"Writing asset id {asset_id} to disk")

            nrows = len(table)
            for column_name in columns:
                if column_name == "id":
                    # We know what the content of this column is, so don't
                    # bother reading it.
                    columns["id"].append(
                        full((nrows, ), asset_id, dtype="uint32"))
                    continue

                columns[column_name].append(table[column_name])

            if earliest_date is None:
                earliest_date = table["day"][0]
            else:
                earliest_date = min(earliest_date, table["day"][0])

            # Bcolz doesn't support ints as keys in `attrs`, so convert
            # assets to strings for use as attr keys.
            asset_key = str(asset_id)

            # Calculate the index into the array of the first and last row
            # for this asset. This allows us to efficiently load single
            # assets when querying the data back out of the table.
            first_row[asset_key] = total_rows
            last_row[asset_key] = total_rows + nrows - 1
            total_rows += nrows

            table_day_to_session = compose(
                self._calendar.minute_to_session_label,
                partial(Timestamp, unit="s", tz="UTC"),
            )
            asset_first_day = table_day_to_session(table["day"][0])
            asset_last_day = table_day_to_session(table["day"][-1])

            asset_sessions = sessions[sessions.slice_indexer(
                asset_first_day, asset_last_day)]
            assert len(table) == len(asset_sessions), (
                "Got {} rows for daily bars table with first day={}, last "
                "day={}, expected {} rows.\n"
                "Missing sessions: {}\n"
                "Extra sessions: {}".format(
                    len(table),
                    asset_first_day.date(),
                    asset_last_day.date(),
                    len(asset_sessions),
                    asset_sessions.difference(
                        to_datetime(np.array(table["day"]), unit="s",
                                    utc=True)).tolist(),
                    to_datetime(np.array(table["day"]), unit="s",
                                utc=True).difference(asset_sessions).tolist(),
                ))

            # Calculate the number of trading days between the first date
            # in the stored data and the first date of **this** asset. This
            # offset is used for output alignment by the reader.
            calendar_offset[asset_key] = sessions.get_loc(asset_first_day)

        logger.info("Writing complete table to disk")
        # This writes the table to disk.
        full_table = ctable(
            columns=[
                columns[colname] for colname in OPTION_PRICING_BCOLZ_COLUMNS
            ],
            names=OPTION_PRICING_BCOLZ_COLUMNS,
            rootdir=self._filename,
            mode="w",
        )

        full_table.attrs["first_trading_day"] = (earliest_date if earliest_date
                                                 is not None else iNaT)

        full_table.attrs["first_row"] = first_row
        full_table.attrs["last_row"] = last_row
        full_table.attrs["calendar_offset"] = calendar_offset
        full_table.attrs["calendar_name"] = self._calendar.name
        full_table.attrs["start_session_ns"] = self._start_session.value
        full_table.attrs["end_session_ns"] = self._end_session.value
        full_table.flush()
        return full_table
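Per the docstring, `iterator` yields pairs of (asset, ctable). A sketch of constructing one from in-memory columns (the raw_data layout is an assumption for illustration):

from bcolz import ctable

def make_asset_iterator(raw_data):
    # raw_data: hypothetical dict of asset_id -> {column name: numpy array},
    # with one row per session and equal-length columns per asset.
    for asset_id in sorted(raw_data):
        cols = raw_data[asset_id]
        names = list(cols)
        yield asset_id, ctable(columns=[cols[n] for n in names], names=names)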
Example #52
0
def save_array(f, arr):
    try_mkdir(os.path.dirname(f))
    c = bcolz.carray(arr, rootdir=f, mode='w')
    c.flush()
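save_array persists a numpy array as an on-disk carray. A matching loader (not part of the example) is a one-liner with bcolz.open:

import bcolz

def load_array(fname):
    # Open the carray rootdir and materialize it back into a numpy array.
    return bcolz.open(rootdir=fname)[:]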
Example #53
0
def get_val_pair(path, name):
    carray = bcolz.carray(rootdir=path / name, mode="r")
    issame = np.load(path / "{}_list.npy".format(name))
    return carray, issame
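A usage sketch, assuming the layout this helper expects: a bcolz rootdir named after the benchmark next to a matching <name>_list.npy of pair labels (the path below is hypothetical):

from pathlib import Path

agedb_30, agedb_30_issame = get_val_pair(Path('/data/faces_emore'), 'agedb_30')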
Example #54
0
def save_array(fname, arr):
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
Example #55
0
def convert_numpy_to_bcolz_carray(x, **kwargs):
    return carray(x, **keyfilter(keywords.__contains__, kwargs))
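Here keyfilter (from toolz) drops any kwargs that bcolz.carray does not accept. A sketch of the moving parts, assuming `keywords` is a module-level set of carray keyword names (in the source it is presumably derived by introspecting bcolz.carray):

import numpy as np
from bcolz import carray
from toolz import keyfilter

# Assumed stand-in for the module-level `keywords` set.
keywords = frozenset(['dtype', 'cparams', 'rootdir', 'mode', 'expectedlen'])

kwargs = {'expectedlen': 100, 'verbose': True}  # 'verbose' is not a carray kwarg
c = carray(np.arange(100), **keyfilter(keywords.__contains__, kwargs))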
Example #56
0
def save_array(fname, arr):
    bcolz.carray(arr, rootdir=fname, mode='w')
Example #57
0
import bcolz
import os
import torch
import numpy as np
from PIL import Image

path = "/ssd-data/lmd/eval_dbs"

names = ["agedb_30"]
for name in names:
    carray = bcolz.carray(rootdir=os.path.join(path, name), mode="r")
    print(carray.shape)
    print(carray[-1].transpose(1, 2, 0).shape)
    print((carray[-1].transpose(1, 2, 0))[55:65, 55:65])
    img = Image.fromarray((carray[-1].transpose(1, 2, 0).astype(np.float32) *
                           255).astype(np.uint8))
    img.save("/data2/lmd_jdq/cfp-fp/%d.jpg" % 0)
    for i in range(1, 20):
        print(np.sum(carray[-i] - carray[-1]))
Example #58
0
#Data loader
import cython
import numpy as np
import bcolz
import pickle
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.autograd import Variable

#Dataloader
words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1),
                       rootdir=f'/Users/nilslager/Desktop/gitit.50.dat',
                       mode='w')

#Open up GloVe embeddings and create vectors
with open(f'/Users/nilslager/Desktop/wv_50d_gitit.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(float)  # np.float is removed in modern NumPy
        vectors.append(vect)

#Construct pickle files
vectors = bcolz.carray(vectors[1:].reshape((400000, 50)),
                       rootdir=f'/Users/nilslager/Desktop/gitit.50.dat',
                       mode='w')  # completing the truncated call; rootdir assumed same as above
vectors.flush()
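The recipe usually finishes by pickling the vocabulary and index so the embeddings can be reloaded without reparsing the text file; a sketch (the .pkl paths are hypothetical):

pickle.dump(words, open('/Users/nilslager/Desktop/gitit_words.pkl', 'wb'))    # hypothetical path
pickle.dump(word2idx, open('/Users/nilslager/Desktop/gitit_idx.pkl', 'wb'))   # hypothetical path

# A word -> vector dict can then be rebuilt for lookups:
glove = {w: vectors[word2idx[w]] for w in words}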
Example #59
0
def save_array(data, fname):
    print("Saving image dataset at the location " + str(fname) + ".")
    c = bcolz.carray(data, rootdir=fname, mode='w')
    c.flush()
Example #60
0
def save_array(fname, arr):
    c = bz.carray(arr, rootdir=fname, mode='w')  # assumes `import bcolz as bz`
    c.flush()