Exemple #1
0
def promote(lhs, rhs):
    """Promote two scalar dshapes to a possibly larger, but compatible type.

    Examples
    --------
    >>> from datashape import int32, int64, Option
    >>> x = Option(int32)
    >>> y = int64
    >>> promote(x, y)
    ?int64
    >>> promote(int64, int64)
    ctype("int64")

    Notes
    ----
    This uses ``numpy.result_type`` for type promotion logic.  See the numpy
    documentation at
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.result_type.html
    """
    if lhs == rhs:
        return lhs
    else:
        left, right = getattr(lhs, "ty", lhs), getattr(rhs, "ty", rhs)
        dtype = np.result_type(datashape.to_numpy_dtype(left), datashape.to_numpy_dtype(right))
        return optionify(lhs, rhs, datashape.CType.from_numpy_dtype(dtype))
Exemple #2
0
def into(a, b):
    schema = dshape(str(b.schema).replace('?', ''))
    if b.iscolumn:
        return into(np.ndarray(0), compute(b),
                dtype=to_numpy_dtype(schema[0].types[0]))
    else:
        return into(np.ndarray(0), compute(b), dtype=to_numpy_dtype(schema))
Exemple #3
0
def promote(lhs, rhs, promote_option=True):
    """Promote two scalar dshapes to a possibly larger, but compatible type.

    Examples
    --------
    >>> from datashape import int32, int64, Option
    >>> x = Option(int32)
    >>> y = int64
    >>> promote(x, y)
    Option(ty=ctype("int64"))
    >>> promote(int64, int64)
    ctype("int64")

    Don't promote to option types.
    >>> promote(x, y, promote_option=False)
    ctype("int64")

    Notes
    ----
    This uses ``numpy.result_type`` for type promotion logic.  See the numpy
    documentation at
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.result_type.html
    """
    if lhs == rhs:
        return lhs
    else:
        left, right = getattr(lhs, 'ty', lhs), getattr(rhs, 'ty', rhs)
        dtype = datashape.CType.from_numpy_dtype(
            np.result_type(
                datashape.to_numpy_dtype(left),
                datashape.to_numpy_dtype(right),
            ), )
        if promote_option:
            dtype = optionify(lhs, rhs, dtype)
        return dtype
Exemple #4
0
def into(a, b):
    schema = dshape(str(b.schema).replace('?', ''))
    if b.iscolumn:
        return into(np.ndarray(0),
                    compute(b),
                    dtype=to_numpy_dtype(schema[0].types[0]))
    else:
        return into(np.ndarray(0), compute(b), dtype=to_numpy_dtype(schema))
Exemple #5
0
def compute_up(t, x, **kwargs):
    # can't use the method here, as they aren't Python functions
    reducer = getattr(np, t.symbol)
    if 'dtype' in keywords(reducer):
        return reducer(x, axis=t.axis, keepdims=t.keepdims,
                       dtype=to_numpy_dtype(t.schema))
    return reducer(x, axis=t.axis, keepdims=t.keepdims)
Exemple #6
0
 def test_uints(self):
     types = ['uint8', 'uint16', 'uint32', 'uint64']
     for type_ in types:
         a = blaze.array(np.arange(3), dshape=type_)
         dtype = to_numpy_dtype(a.dshape)
         self.assertEqual(dtype, np.dtype(type_))
         self.assertEqual(dd_as_py(a._data), [0, 1, 2])
Exemple #7
0
def resource_bcolz(uri, dshape=None, expected_dshape=None, **kwargs):
    if os.path.exists(uri):
        try:
            return ctable(rootdir=uri)
        except IOError:  # __rootdirs__ doesn't exist because we aren't a ctable
            return carray(rootdir=uri)
    else:
        if not dshape:
            raise ValueError("Must specify either existing bcolz directory or"
                             " valid datashape")
        dshape = datashape.dshape(dshape)

        dt = datashape.to_numpy_dtype(dshape)
        shape_tail = tuple(map(int, dshape.shape[1:]))  # tail of shape
        if dshape.shape[0] == datashape.var:
            shape = (0,) + shape_tail
        else:
            shape = (int(dshape.shape[0]),) + shape_tail

        x = np.empty(shape=shape, dtype=dt)

        kwargs = keyfilter(keywords.__contains__, kwargs)
        expectedlen = kwargs.pop('expectedlen',
                                 int(expected_dshape[0])
                                 if expected_dshape is not None and
                                 isinstance(expected_dshape[0], datashape.Fixed)
                                 else None)

        if datashape.predicates.isrecord(dshape.measure):
            return ctable(x, rootdir=uri, expectedlen=expectedlen, **kwargs)
        else:
            return carray(x, rootdir=uri, expectedlen=expectedlen, **kwargs)
Exemple #8
0
def into(a, b, **kwargs):
    c = compute(b)
    if isinstance(c, (list, tuple, Iterator)):
        kwargs['types'] = [datashape.to_numpy_dtype(t) for t in
                b.schema[0].types]
        kwargs['names'] = b.columns
    return into(a, c, **kwargs)
Exemple #9
0
def compute_up(t, x, **kwargs):
    # can't use the method here, as they aren't Python functions
    reducer = getattr(np, t.symbol)
    if 'dtype' in keywords(reducer):
        return reducer(x, axis=t.axis, keepdims=t.keepdims,
                       dtype=to_numpy_dtype(t.schema))
    return reducer(x, axis=t.axis, keepdims=t.keepdims)
Exemple #10
0
def PyTables(path, datapath, dshape=None, **kwargs):
    """Create or open a ``tables.Table`` object.

    Parameters
    ----------
    path : str
        Path to a PyTables HDF5 file.
    datapath : str
        The name of the node in the ``tables.File``.
    dshape : str or datashape.DataShape
        DataShape to use to create the ``Table``.

    Returns
    -------
    t : tables.Table

    Examples
    --------
    >>> from blaze.utils import tmpfile
    >>> # create from scratch
    >>> with tmpfile('.h5') as f:
    ...     t = PyTables(filename, '/bar',
    ...                  dshape='var * {volume: float64, planet: string[10, "A"]}')
    ...     data = [(100.3, 'mars'), (100.42, 'jupyter')]
    ...     t.append(data)
    ...     t[:]  # doctest: +SKIP
    ...
    array([(100.3, b'mars'), (100.42, b'jupyter')],
          dtype=[('volume', '<f8'), ('planet', 'S10')])
    """
    def possibly_create_table(filename, dtype):
        f = tb.open_file(filename, mode='a')
        try:
            if datapath not in f:
                if dtype is None:
                    raise ValueError('dshape cannot be None and datapath not'
                                     ' in file')
                else:
                    f.create_table('/',
                                   datapath.lstrip('/'),
                                   description=dtype)
        finally:
            f.close()

    if dshape:
        if isinstance(dshape, str):
            dshape = datashape.dshape(dshape)
        if dshape[0] == datashape.var:
            dshape = dshape.subshape[0]
        dtype = dtype_to_pytables(datashape.to_numpy_dtype(dshape))
    else:
        dtype = None

    if os.path.exists(path):
        possibly_create_table(path, dtype)
    else:
        with tmpfile('.h5') as filename:
            possibly_create_table(filename, dtype)
            shutil.copyfile(filename, path)
    return tb.open_file(path, mode='a').get_node(datapath)
Exemple #11
0
 def test_uints(self):
     types = ['uint8', 'uint16', 'uint32', 'uint64']
     for type_ in types:
         a = blaze.array(np.arange(3), dshape=type_)
         dtype = to_numpy_dtype(a.dshape)
         self.assertEqual(dtype, np.dtype(type_))
         self.assertEqual(dd_as_py(a._data), [0, 1, 2])
Exemple #12
0
def into(a, b, **kwargs):
    c = compute(b)
    if isinstance(c, (list, tuple, Iterator)):
        kwargs['types'] = [
            datashape.to_numpy_dtype(t) for t in b.schema[0].types
        ]
        kwargs['names'] = b.columns
    return into(a, c, **kwargs)
Exemple #13
0
def compute_up(expr, data, **kwargs):
    measure = expr.to.measure
    if measure in {datashape.string, datashape.Option(datashape.string)}:
        return data.astype(str)
    elif measure in {datashape.datetime_,
                     datashape.Option(datashape.datetime_)}:
        return data.astype(np.datetime64)
    return data.astype(to_numpy_dtype(expr.schema))
Exemple #14
0
def compute_up(t, x, **kwargs):
    result_dtype = to_numpy_dtype(t.dshape)
    if issubclass(x.dtype.type, (np.floating, np.object_)):
        return pd.notnull(x).sum(keepdims=t.keepdims, axis=t.axis, dtype=result_dtype)
    elif issubclass(x.dtype.type, np.datetime64):
        return (x.view("int64") != inat).sum(keepdims=t.keepdims, axis=t.axis, dtype=result_dtype)
    else:
        return np.ones(x.shape, dtype=result_dtype).sum(keepdims=t.keepdims, axis=t.axis, dtype=result_dtype)
Exemple #15
0
 def test_complex(self):
     types = ['complex64', 'complex128']
     for type_ in types:
         a = blaze.array(np.arange(3), dshape=type_)
         dtype = to_numpy_dtype(a.dshape)
         self.assertEqual(dtype, np.dtype(type_))
         # dd_as_py does not support complexes yet..
         self.assertEqual(dd_as_py(a._data), [0, 1, 2])
Exemple #16
0
 def test_complex(self):
     types = ['complex64', 'complex128']
     for type_ in types:
         a = blaze.array(np.arange(3), dshape=type_)
         dtype = to_numpy_dtype(a.dshape)
         self.assertEqual(dtype, np.dtype(type_))
         # dd_as_py does not support complexes yet..
         self.assertEqual(dd_as_py(a._data), [0, 1, 2])
Exemple #17
0
def into(a, b, **kwargs):
    names = dshape(nd.dshape_of(b))[1].names
    columns = [getattr(b, name) for name in names]
    columns = [np.asarray(nd.as_py(c))
            if to_numpy_dtype(dshape(nd.dshape_of(c))) == np.dtype('O')
            else into(np.ndarray(0), c) for c in columns]

    return bcolz.ctable(columns, names=names, **kwargs)
Exemple #18
0
def into(a, df, **kwargs):
    x = df.to_records(index=False)
    if 'dshape' in kwargs:
        ds = dshape(kwargs['dshape']).measure
        dt = to_numpy_dtype(ds)
        if x.dtype != dt:
            x = x.astype(dt)
    return x
Exemple #19
0
def compute_up(expr, data, **kwargs):
    if datashape.dshape(expr.to) in {datashape.string,
                                     datashape.Option(datashape.string)}:
        return data.astype(str)
    elif datashape.dshape(expr.to) in {datashape.datetime_,
                                       datashape.Option(datashape.datetime_)}:
        return data.astype(np.datetime64)
    return data.astype(to_numpy_dtype(expr.schema))
Exemple #20
0
def PyTables(path, datapath, dshape=None, **kwargs):
    """Create or open a ``tables.Table`` object.

    Parameters
    ----------
    path : str
        Path to a PyTables HDF5 file.
    datapath : str
        The name of the node in the ``tables.File``.
    dshape : str or datashape.DataShape
        DataShape to use to create the ``Table``.

    Returns
    -------
    t : tables.Table

    Examples
    --------
    >>> from blaze.utils import tmpfile
    >>> # create from scratch
    >>> with tmpfile('.h5') as f:
    ...     t = PyTables(filename, '/bar',
    ...                  dshape='var * {volume: float64, planet: string[10, "A"]}')
    ...     data = [(100.3, 'mars'), (100.42, 'jupyter')]
    ...     t.append(data)
    ...     t[:]  # doctest: +SKIP
    ...
    array([(100.3, b'mars'), (100.42, b'jupyter')],
          dtype=[('volume', '<f8'), ('planet', 'S10')])
    """
    def possibly_create_table(filename, dtype):
        f = tb.open_file(filename, mode='a')
        try:
            if datapath not in f:
                if dtype is None:
                    raise ValueError('dshape cannot be None and datapath not'
                                     ' in file')
                else:
                    f.create_table('/', datapath.lstrip('/'), description=dtype)
        finally:
            f.close()

    if dshape:
        if isinstance(dshape, str):
            dshape = datashape.dshape(dshape)
        if dshape[0] == datashape.var:
            dshape = dshape.subshape[0]
        dtype = dtype_to_pytables(datashape.to_numpy_dtype(dshape))
    else:
        dtype = None

    if os.path.exists(path):
        possibly_create_table(path, dtype)
    else:
        with tmpfile('.h5') as filename:
            possibly_create_table(filename, dtype)
            shutil.copyfile(filename, path)
    return tb.open_file(path, mode='a').get_node(datapath)
Exemple #21
0
def promote(lhs, rhs, promote_option=True):
    """Promote two scalar dshapes to a possibly larger, but compatible type.

    Examples
    --------
    >>> from datashape import int32, int64, Option, string
    >>> x = Option(int32)
    >>> y = int64
    >>> promote(x, y)
    Option(ty=ctype("int64"))
    >>> promote(int64, int64)
    ctype("int64")

    Don't promote to option types.
    >>> promote(x, y, promote_option=False)
    ctype("int64")

    Strings are handled differently than NumPy, which promotes to ctype("object")
    >>> x = string
    >>> y = Option(string)
    >>> promote(x, y) == promote(y, x) == Option(string)
    True
    >>> promote(x, y, promote_option=False)
    ctype("string")

    Notes
    ----
    Except for ``datashape.string`` types, this uses ``numpy.result_type`` for
    type promotion logic.  See the numpy documentation at:

    http://docs.scipy.org/doc/numpy/reference/generated/numpy.result_type.html
    """
    if lhs == rhs:
        return lhs
    left, right = getattr(lhs, 'ty', lhs), getattr(rhs, 'ty', rhs)
    if left == right == datashape.string:
        # Special case string promotion, since numpy promotes to `object`.
        dtype = datashape.string
    else:
        np_res_type = np.result_type(datashape.to_numpy_dtype(left),
                                     datashape.to_numpy_dtype(right))
        dtype = datashape.CType.from_numpy_dtype(np_res_type)
    if promote_option:
        dtype = optionify(lhs, rhs, dtype)
    return dtype
Exemple #22
0
 def test_floats(self):
     types = ['float16', 'float32', 'float64']
     for type_ in types:
         a = blaze.array(np.arange(3), dshape=type_)
         dtype = to_numpy_dtype(a.dshape)
         self.assertEqual(dtype, np.dtype(type_))
         if type_ != 'float16':
             # dd_as_py does not support this yet
             self.assertEqual(dd_as_py(a._data), [0, 1, 2])
Exemple #23
0
def into(a, b, **kwargs):
    names = dshape(nd.dshape_of(b))[1].names
    columns = [getattr(b, name) for name in names]
    columns = [
        np.asarray(nd.as_py(c)) if to_numpy_dtype(dshape(nd.dshape_of(c)))
        == np.dtype('O') else into(np.ndarray(0), c) for c in columns
    ]

    return bcolz.ctable(columns, names=names, **kwargs)
Exemple #24
0
 def test_floats(self):
     types = ['float16', 'float32', 'float64']
     for type_ in types:
         a = blaze.array(np.arange(3), dshape=type_)
         dtype = to_numpy_dtype(a.dshape)
         self.assertEqual(dtype, np.dtype(type_))
         if type_ != 'float16':
             # dd_as_py does not support this yet
             self.assertEqual(dd_as_py(a._data), [0, 1, 2])
Exemple #25
0
def promote(lhs, rhs, promote_option=True):
    """Promote two scalar dshapes to a possibly larger, but compatible type.

    Examples
    --------
    >>> from datashape import int32, int64, Option, string
    >>> x = Option(int32)
    >>> y = int64
    >>> promote(x, y)
    Option(ty=ctype("int64"))
    >>> promote(int64, int64)
    ctype("int64")

    Don't promote to option types.
    >>> promote(x, y, promote_option=False)
    ctype("int64")

    Strings are handled differently than NumPy, which promotes to ctype("object")
    >>> x = string
    >>> y = Option(string)
    >>> promote(x, y) == promote(y, x) == Option(string)
    True
    >>> promote(x, y, promote_option=False)
    ctype("string")

    Notes
    ----
    Except for ``datashape.string`` types, this uses ``numpy.result_type`` for
    type promotion logic.  See the numpy documentation at:

    http://docs.scipy.org/doc/numpy/reference/generated/numpy.result_type.html
    """
    if lhs == rhs:
        return lhs
    left, right = getattr(lhs, "ty", lhs), getattr(rhs, "ty", rhs)
    if left == right == datashape.string:
        # Special case string promotion, since numpy promotes to `object`.
        dtype = datashape.string
    else:
        np_res_type = np.result_type(datashape.to_numpy_dtype(left), datashape.to_numpy_dtype(right))
        dtype = datashape.CType.from_numpy_dtype(np_res_type)
    if promote_option:
        dtype = optionify(lhs, rhs, dtype)
    return dtype
Exemple #26
0
def resource_bcolz(rootdir, **kwargs):
    if os.path.exists(rootdir):
        kwargs = keyfilter(carray_keywords.__contains__, kwargs)
        return ctable(rootdir=rootdir, **kwargs)
    else:
        if 'dshape' in kwargs:
            dtype = to_numpy_dtype(kwargs['dshape'])
            kwargs = keyfilter(carray_keywords.__contains__, kwargs)
            return ctable(np.empty(0, dtype), rootdir=rootdir, **kwargs)
        else:
            raise ValueError("File does not exist and no `dshape=` given")
Exemple #27
0
def resource_bcolz(rootdir, **kwargs):
    if os.path.exists(rootdir):
        kwargs = keyfilter(keywords(ctable).__contains__, kwargs)
        return ctable(rootdir=rootdir, **kwargs)
    else:
        if 'dshape' in kwargs:
            dtype = to_numpy_dtype(kwargs['dshape'])
            kwargs = keyfilter(keywords(ctable).__contains__, kwargs)
            return ctable(np.empty(0, dtype), rootdir=rootdir, **kwargs)
        else:
            raise ValueError("File does not exist and no `dshape=` given")
Exemple #28
0
def compute_up(t, x, **kwargs):
    result_dtype = to_numpy_dtype(t.dshape)
    if issubclass(x.dtype.type, (np.floating, np.object_)):
        return pd.notnull(x).sum(keepdims=t.keepdims, axis=t.axis,
                                 dtype=result_dtype)
    elif issubclass(x.dtype.type, np.datetime64):
        return (x.view('int64') != inat).sum(keepdims=t.keepdims, axis=t.axis,
                                             dtype=result_dtype)
    else:
        return np.ones(x.shape, dtype=result_dtype).sum(keepdims=t.keepdims,
                                                        axis=t.axis,
                                                        dtype=result_dtype)
Exemple #29
0
def series_to_array(s, dshape=None, **kwargs):
    dtype = datashape.to_numpy_dtype(datashape.dshape(dshape))
    sdtype = s.dtype
    values = s.values

    # don't lose precision of datetime64 more precise than microseconds
    if ((issubclass(sdtype.type, np.datetime64) and
            np.datetime_data(sdtype)[0] in higher_precision_freqs)
            or s.dtype == dtype):
        return values
    try:
        return values.astype(dtype)
    except ValueError:  # object series and record dshape, e.g., a frame row
        return values
Exemple #30
0
    def __init__(self,
                 path,
                 datapath,
                 mode='r',
                 schema=None,
                 dshape=None,
                 **kwargs):
        self.path = path
        self.datapath = datapath
        self.mode = mode

        if schema and not dshape:
            dshape = 'var * ' + str(schema)

        # TODO: provide sane defaults for kwargs
        # Notably chunks and maxshape
        if dshape:
            dshape = datashape.dshape(dshape)
            shape = dshape.shape
            dtype = datashape.to_numpy_dtype(dshape[-1])
            if shape[0] == datashape.Var():
                kwargs['chunks'] = True
                kwargs['maxshape'] = kwargs.get('maxshape',
                                                (None, ) + shape[1:])
                shape = (0, ) + tuple(map(int, shape[1:]))

        with h5py.File(path, mode) as f:
            dset = f.get(datapath)
            if dset is None:
                if dshape is None:
                    raise ValueError('No dataset or dshape provided')
                else:
                    f.create_dataset(datapath, shape, dtype=dtype, **kwargs)
            else:
                dshape2 = datashape.from_numpy(dset.shape, dset.dtype)
                dshape = dshape2
                # TODO: test provided dshape against given dshape
                # if dshape and dshape != dshape2:
                #     raise ValueError('Inconsistent datashapes.'
                #             '\nGiven: %s\nFound: %s' % (dshape, dshape2))

        attributes = self.attributes()
        if attributes['chunks']:
            # is there a better way to do this?
            words = str(dshape).split(' * ')
            dshape = 'var * ' + ' * '.join(words[1:])
            dshape = datashape.dshape(dshape)

        self._dshape = dshape
        self._schema = schema
Exemple #31
0
def dataset_from_dshape(file, datapath, ds, **kwargs):
    dtype = varlen_dtype(to_numpy_dtype(ds))
    if datashape.var not in list(ds):
        shape = to_numpy(ds)[0]
    elif len(ds.shape) == 1:
        shape = (0,)
    else:
        raise ValueError("Don't know how to handle varlen nd shapes")

    if shape:
        kwargs['chunks'] = kwargs.get('chunks', True)
        kwargs['maxshape'] = kwargs.get('maxshape', (None,) + shape[1:])

    kwargs2 = keyfilter(h5py_attributes.__contains__, kwargs)
    return file.require_dataset(datapath, shape=shape, dtype=dtype, **kwargs2)
Exemple #32
0
def promote(lhs, rhs):
    """Promote two scalar dshapes to a possibly larger, but compatibile type



    Examples
    --------
    >>> from datashape import int32, int64, Option
    >>> x = Option(int32)
    >>> y = int64
    >>> promote(x, y)
    ?int64

    Notes
    ----
    This uses ``numpy.promote_types`` for type promotion logic.  See the numpy
    documentation at
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.promote_types.html
    """
    left, right = getattr(lhs, 'ty', lhs), getattr(rhs, 'ty', rhs)
    dtype = np.promote_types(datashape.to_numpy_dtype(left),
                             datashape.to_numpy_dtype(right))
    dshape = datashape.from_numpy((), dtype)
    return optionify(lhs, rhs, dshape)
Exemple #33
0
def dataset_from_dshape(file, datapath, ds, **kwargs):
    dtype = varlen_dtype(to_numpy_dtype(ds))

    if datashape.var not in list(ds):
        shape = to_numpy(ds)[0]
    elif datashape.var not in list(ds)[1:]:
        shape = (0, ) + to_numpy(ds.subshape[0])[0]
    else:
        raise ValueError("Don't know how to handle varlen nd shapes")

    if shape:
        kwargs['chunks'] = kwargs.get('chunks', True)
        kwargs['maxshape'] = kwargs.get('maxshape', (None, ) + shape[1:])

    kwargs2 = keyfilter(h5py_attributes.__contains__, kwargs)
    return file.require_dataset(datapath, shape=shape, dtype=dtype, **kwargs2)
Exemple #34
0
def resource_bcolz(uri, dshape=None, **kwargs):
    if os.path.exists(uri):
        return ctable(rootdir=uri)
    else:
        if not dshape:
            raise ValueError("Must specify either existing bcolz directory or"
                    "valid datashape")
        dshape = datashape.dshape(dshape)

        dt = datashape.to_numpy_dtype(dshape)
        x = np.empty(shape=(0,), dtype=dt)

        if datashape.predicates.isrecord(dshape.measure):
            return ctable(x, rootdir=uri, **keyfilter(keywords.__contains__, kwargs))
        else:
            return carray(x, rootdir=uri, **keyfilter(keywords.__contains__, kwargs))
Exemple #35
0
def compute_up(expr, ob, **kwargs):
    tp = expr.to
    shape = tp.shape
    if shape:
        raise TypeError(
            'cannot convert scalar object %r to array or matrix shape %r' % (
                ob,
                shape,
            ), )

    measure = tp.measure
    if isinstance(measure, Option):
        if pd.isnull(ob):
            return None
        measure = measure.ty
    dtype = to_numpy_dtype(measure)
    return dtype.type(ob)
Exemple #36
0
def compute_up(expr, ob, **kwargs):
    tp = expr.to
    shape = tp.shape
    if shape:
        raise TypeError(
            'cannot convert scalar object %r to array or matrix shape %r' % (
                ob,
                shape,
            ),
        )

    measure = tp.measure
    if isinstance(measure, Option):
        if pd.isnull(ob):
            return None
        measure = measure.ty
    dtype = to_numpy_dtype(measure)
    return dtype.type(ob)
    def __init__(self, path, datapath, mode='r', schema=None, dshape=None, **kwargs):
        self.path = path
        self.datapath = datapath
        self.mode = mode

        if schema and not dshape:
            dshape = 'var * ' + str(schema)

        # TODO: provide sane defaults for kwargs
        # Notably chunks and maxshape
        if dshape:
            dshape = datashape.dshape(dshape)
            shape = dshape.shape
            dtype = datashape.to_numpy_dtype(dshape[-1])
            if shape[0] == datashape.Var():
                kwargs['chunks'] = True
                kwargs['maxshape'] = kwargs.get('maxshape', (None,) + shape[1:])
                shape = (0,) + tuple(map(int, shape[1:]))

        with h5py.File(path, mode) as f:
            dset = f.get(datapath)
            if dset is None:
                if dshape is None:
                    raise ValueError('No dataset or dshape provided')
                else:
                    f.create_dataset(datapath, shape, dtype=dtype, **kwargs)
            else:
                dshape2 = datashape.from_numpy(dset.shape, dset.dtype)
                dshape = dshape2
                # TODO: test provided dshape against given dshape
                # if dshape and dshape != dshape2:
                #     raise ValueError('Inconsistent datashapes.'
                #             '\nGiven: %s\nFound: %s' % (dshape, dshape2))

        attributes = self.attributes()
        if attributes['chunks']:
            # is there a better way to do this?
            words = str(dshape).split(' * ')
            dshape = 'var * ' + ' * '.join(words[1:])
            dshape = datashape.dshape(dshape)

        self._dshape = dshape
        self._schema = schema
Exemple #38
0
def unit_to_dtype(ds):
    """ Convert a datashape Unit instance into a numpy dtype

    Parameters
    ----------
    ds : DataShape
        The DataShape instance to convert

    Returns
    -------
    np.dtype

    Examples
    --------
    >>> unit_to_dtype('int32')
    dtype('int32')
    >>> unit_to_dtype('float64')
    dtype('float64')
    >>> unit_to_dtype('?int64')
    dtype('float64')
    >>> unit_to_dtype('string')
    dtype('O')
    >>> unit_to_dtype('?datetime')
    dtype('<M8[us]')
    """
    if isinstance(ds, str):
        ds = dshape(ds)
    if isinstance(ds, DataShape):
        ds = ds.measure
    if isinstance(ds, Option) and isscalar(ds) and isnumeric(ds):
        if isinstance(ds.ty, Decimal):
            str_np_dtype = str(ds.ty.to_numpy_dtype()).replace('int', 'float')
            if str_np_dtype == 'float8':  # not a valid dtype, so increase
                str_np_dtype = 'float16'
            return unit_to_dtype(str_np_dtype)
        return unit_to_dtype(str(ds).replace('int', 'float').replace('?', ''))
    if isinstance(ds, Option) and isinstance(
        ds.ty, (Date, DateTime, String, TimeDelta)
    ):
        ds = ds.ty
    if ds == string:
        return np.dtype('O')
    return to_numpy_dtype(ds)
Exemple #39
0
def into(a, b, **kwargs):
    b = iter(b)
    first = next(b)
    b = toolz.concat([[first], b])
    if isinstance(first, datetime):
        b = map(np.datetime64, b)
    if isinstance(first, (list, tuple)):
        if 'dtype' in kwargs:
            dtype = kwargs.pop('dtype')
        elif 'dshape' in kwargs:
            dtype = to_numpy_dtype(dshape(kwargs.pop('dshape')))
        else:
            dtype = dtype_from_tuple(first)
        return np.rec.fromrecords([tuple(x) for x in b],
                                  dtype=dtype,
                                  **kwargs)
    elif hasattr(first, 'values'):
        #detecting sqlalchemy.engine.result.RowProxy types and similar
        return np.asarray([tuple(x.values()) for x in b], **kwargs)
    else:
        return np.asarray(list(b), **kwargs)
Exemple #40
0
def compute_up(expr, data, **kwargs):
    return data.astype(to_numpy_dtype(expr.schema))
Exemple #41
0
 def test_string(self):
     self.assertEqual(to_numpy_dtype(dshape('2 * string')), np.dtype('O'))
Exemple #42
0
def into(a, b):
    if b.iscolumn:
        return into(np.ndarray(0), compute(b),
                dtype=to_numpy_dtype(b.schema[0].types[0]))
    else:
        return into(np.ndarray(0), compute(b), dtype=to_numpy_dtype(b.schema))
Exemple #43
0
def array(obj, dshape=None, caps={'efficient-write': True},
          storage=None):
    """Create a Blaze array.

    Parameters
    ----------
    obj : array_like
        Initial contents for the array.

    dshape : datashape
        The datashape for the resulting array. By default the
        datashape will be inferred from data. If an explicit dshape is
        provided, the input data will be coerced into the provided
        dshape.

    caps : capabilities dictionary
        A dictionary containing the desired capabilities of the array.

    storage : Storage instance
        A Storage object with the necessary info for storing the data.

    Returns
    -------
    out : a concrete blaze array.

    Bugs
    ----
    Right now the explicit dshape is ignored. This needs to be
    corrected. When the data cannot be coerced to an explicit dshape
    an exception should be raised.

    """
    dshape = _normalize_dshape(dshape)

    storage = _storage_convert(storage)

    if isinstance(obj, Array):
        return obj
    elif isinstance(obj, IDataDescriptor):
        # TODO: Validate the 'caps', convert to another kind
        #       of data descriptor if necessary
        # Note by Francesc: but if it is already an IDataDescriptor I wonder
        # if `caps` should be ignored.  Hmm, probably not...
        #
        # Note by Oscar: Maybe we shouldn't accept a datadescriptor at
        #   all at this level. If you've got a DataDescriptor you are
        #   playing with internal datastructures anyways, go to the
        #   Array constructor directly. If you want to transform to
        #   another datadescriptor... convert it yourself (you are
        #   playing with internal datastructures, remember? you should
        #   be able to do it in your own.
        dd = obj
    elif storage is not None:
        dt = None if dshape is None else to_numpy_dtype(dshape)
        if inspect.isgenerator(obj):
            # TODO: Generator logic can go inside barray
            dd = BLZDataDescriptor(blz.barray(obj, dtype=dt, count=-1,
                                              rootdir=storage.path))
        else:
            dd = BLZDataDescriptor(
                blz.barray(obj, dtype=dt, rootdir=storage.path))
    elif 'efficient-write' in caps and caps['efficient-write'] is True:
        # In-Memory array
        if dshape is None:
            dd = DyNDDataDescriptor(nd.asarray(obj, access='rw'))
        else:
            # Use the uniform/full dtype specification in dynd depending
            # on whether the datashape has a uniform dim
            dt = ndt.type(str(dshape))
            if dt.ndim > 0:
                dd = DyNDDataDescriptor(nd.array(obj, type=dt, access='rw'))
            else:
                dd = DyNDDataDescriptor(nd.array(obj, dtype=dt, access='rw'))
    elif 'compress' in caps and caps['compress'] is True:
        dt = None if dshape is None else to_numpy_dtype(dshape)
        # BLZ provides compression
        if inspect.isgenerator(obj):
            # TODO: Generator logic can go inside barray
            dd = BLZDataDescriptor(blz.fromiter(obj, dtype=dt, count=-1))
        else:
            dd = BLZDataDescriptor(blz.barray(obj, dtype=dt))

    elif isinstance(obj, np.ndarray):
        dd = DyNDDataDescriptor(nd.view(obj))
    elif isinstance(obj, nd.array):
        dd = DyNDDataDescriptor(obj)
    elif isinstance(obj, blz.barray):
        dd = BLZDataDescriptor(obj)
    else:
        raise TypeError(('Failed to construct blaze array from '
                        'object of type %r') % type(obj))
    return Array(dd)
Exemple #44
0
def into(a, b, **kwargs):
    schema = dshape(str(b.schema).replace('?', ''))
    return into(np.ndarray(0), compute(b), dtype=to_numpy_dtype(schema))
 def test_timedelta(self):
     assert to_numpy_dtype(dshape('2 * timedelta')) == np.dtype('m8[us]')
     assert to_numpy_dtype(dshape("2 * timedelta[unit='s']")) == \
         np.dtype('m8[s]')
 def test_string(self):
     assert to_numpy_dtype(dshape('2 * string')) == np.dtype('O')
 def test_date(self):
     assert to_numpy_dtype(dshape('2 * date')) == np.dtype('M8[D]')
 def test_decimal(self):
     assert to_numpy_dtype(dshape('decimal[18,0]')) == np.int64
     assert to_numpy_dtype(dshape('decimal[7,2]')) == np.float64
     assert to_numpy_dtype(dshape('decimal[4]')) == np.int16
     with pytest.raises(TypeError):
         to_numpy_dtype(dshape('decimal[21]'))
Exemple #49
0
 def test_simple(self):
     self.assertEqual(to_numpy_dtype(dshape('2 * int32')), np.int32)
     self.assertEqual(to_numpy_dtype(dshape('2 * {x: int32, y: int32}')),
                      np.dtype([('x', '<i4'), ('y', '<i4')]))
Exemple #50
0
 def test_date(self):
     self.assertEqual(to_numpy_dtype(dshape('2 * date')), np.dtype('M8[D]'))
Exemple #51
0
def array(obj, dshape=None, caps={'efficient-write': True}, storage=None):
    """Create a Blaze array.

    Parameters
    ----------
    obj : array_like
        Initial contents for the array.

    dshape : datashape
        The datashape for the resulting array. By default the
        datashape will be inferred from data. If an explicit dshape is
        provided, the input data will be coerced into the provided
        dshape.

    caps : capabilities dictionary
        A dictionary containing the desired capabilities of the array.

    storage : Storage instance
        A Storage object with the necessary info for storing the data.

    Returns
    -------
    out : a concrete blaze array.

    Bugs
    ----
    Right now the explicit dshape is ignored. This needs to be
    corrected. When the data cannot be coerced to an explicit dshape
    an exception should be raised.

    """
    dshape = _normalize_dshape(dshape)

    storage = _storage_convert(storage)

    if isinstance(obj, Array):
        return obj
    elif isinstance(obj, IDataDescriptor):
        # TODO: Validate the 'caps', convert to another kind
        #       of data descriptor if necessary
        # Note by Francesc: but if it is already an IDataDescriptor I wonder
        # if `caps` should be ignored.  Hmm, probably not...
        #
        # Note by Oscar: Maybe we shouldn't accept a datadescriptor at
        #   all at this level. If you've got a DataDescriptor you are
        #   playing with internal datastructures anyways, go to the
        #   Array constructor directly. If you want to transform to
        #   another datadescriptor... convert it yourself (you are
        #   playing with internal datastructures, remember? you should
        #   be able to do it in your own.
        dd = obj
    elif storage is not None:
        dt = None if dshape is None else to_numpy_dtype(dshape)
        if inspect.isgenerator(obj):
            # TODO: Generator logic can go inside barray
            dd = BLZDataDescriptor(
                blz.barray(obj, dtype=dt, count=-1, rootdir=storage.path))
        else:
            dd = BLZDataDescriptor(
                blz.barray(obj, dtype=dt, rootdir=storage.path))
    elif 'efficient-write' in caps and caps['efficient-write'] is True:
        # In-Memory array
        if dshape is None:
            dd = DyNDDataDescriptor(nd.asarray(obj, access='rw'))
        else:
            # Use the uniform/full dtype specification in dynd depending
            # on whether the datashape has a uniform dim
            dt = ndt.type(str(dshape))
            if dt.ndim > 0:
                dd = DyNDDataDescriptor(nd.array(obj, type=dt, access='rw'))
            else:
                dd = DyNDDataDescriptor(nd.array(obj, dtype=dt, access='rw'))
    elif 'compress' in caps and caps['compress'] is True:
        dt = None if dshape is None else to_numpy_dtype(dshape)
        # BLZ provides compression
        if inspect.isgenerator(obj):
            # TODO: Generator logic can go inside barray
            dd = BLZDataDescriptor(blz.fromiter(obj, dtype=dt, count=-1))
        else:
            dd = BLZDataDescriptor(blz.barray(obj, dtype=dt))

    elif isinstance(obj, np.ndarray):
        dd = DyNDDataDescriptor(nd.view(obj))
    elif isinstance(obj, nd.array):
        dd = DyNDDataDescriptor(obj)
    elif isinstance(obj, blz.barray):
        dd = BLZDataDescriptor(obj)
    else:
        raise TypeError(('Failed to construct blaze array from '
                         'object of type %r') % type(obj))
    return Array(dd)
 def test_date(self):
     assert to_numpy_dtype(dshape('2 * date')) == np.dtype('M8[D]')
 def test_dimensions(self):
     return to_numpy_dtype(dshape('var * int32')) == np.int32
 def test_timedelta(self):
     assert to_numpy_dtype(dshape('2 * timedelta')) == np.dtype('m8[us]')
     assert to_numpy_dtype(dshape("2 * timedelta[unit='s']")) == \
         np.dtype('m8[s]')
 def test_dimensions(self):
     return to_numpy_dtype(dshape('var * int32')) == np.int32
Exemple #56
0
def compute_up(expr, data, **kwargs):
    return data.astype(to_numpy_dtype(expr.schema))
 def test_string(self):
     assert to_numpy_dtype(dshape('2 * string')) == np.dtype('O')