Code example #1
    def dshape(self):
        measure = Record(list(zip(self.names,
                                  [v._dtype for v in self.values])))
        if self.keepdims:
            return DataShape(*((1,) * self._child.ndim + (measure,)))
        else:
            return DataShape(measure)
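
A minimal usage sketch (not part of the original project) of how the two branches above assemble their result, using only the public DataShape and Record constructors; the field names total and avg and the ndim value are made up for illustration:

from datashape import DataShape, Record, int32, float64

measure = Record([('total', int32), ('avg', float64)])
ndim = 2  # stand-in for self._child.ndim

# keepdims=True keeps one unit-length dimension per dimension of the child
print(DataShape(*((1,) * ndim + (measure,))))  # roughly: 1 * 1 * {total: int32, avg: float64}

# keepdims=False returns just the record measure
print(DataShape(measure))                      # roughly: {total: int32, avg: float64}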
Code example #2
def Data(data,
         dshape=None,
         name=None,
         fields=None,
         columns=None,
         schema=None,
         **kwargs):
    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(data,
                        schema=schema,
                        dshape=dshape,
                        columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator)
            and not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if columns:
        warnings.warn("columns kwarg deprecated.  Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)
        types = None
        if isinstance(dshape.measure, Tuple) and fields:
            types = dshape[1].dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))
        elif isscalar(dshape.measure) and fields:
            types = (dshape.measure, ) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema, )))
        elif isrecord(dshape.measure) and fields:
            types = dshape.measure.types
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))

    ds = datashape.dshape(dshape)

    name = name or next(names)
    result = InteractiveSymbol(data, ds, name)

    if sub_uri:
        for field in sub_uri.split('/'):
            if field:
                result = result[field]

    return result
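
A hedged usage sketch for the function above, assuming a blaze version that exports Data as in the snippet; the sample data and field names are invented. discover() on a list of tuples yields a Tuple measure, so the fields= argument takes the first branch of the if-chain and rebuilds the measure as a Record:

from blaze import Data

t = Data([(1, 'Alice', 100), (2, 'Bob', 200)], fields=['id', 'name', 'amount'])
print(t.dshape)  # expected to be roughly: 2 * {id: int64, name: string, amount: int64}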
Code example #3
File: interactive.py  Project: wegamekinglc/blaze
def Data(data,
         dshape=None,
         name=None,
         fields=None,
         columns=None,
         schema=None,
         **kwargs):
    if columns:
        raise ValueError("columns argument deprecated, use fields instead")
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")

    if isinstance(data, InteractiveSymbol):
        return Data(data.data, dshape, name, fields, columns, schema, **kwargs)

    if isinstance(data, _strtypes):
        data = resource(data,
                        schema=schema,
                        dshape=dshape,
                        columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator)
            and not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)
        types = None
        if isinstance(dshape.measure, Tuple) and fields:
            types = dshape[1].dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))
        elif isscalar(dshape.measure) and fields:
            types = (dshape.measure, ) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema, )))
        elif isrecord(dshape.measure) and fields:
            ds = discover(data)
            assert isrecord(ds.measure)
            names = ds.measure.names
            if names != fields:
                raise ValueError(
                    'data column names %s\n'
                    '\tnot equal to fields parameter %s,\n'
                    '\tuse Data(data).relabel(%s) to rename '
                    'fields' %
                    (names, fields, ', '.join('%s=%r' % (k, v)
                                              for k, v in zip(names, fields))))
            types = dshape.measure.types
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))

    ds = datashape.dshape(dshape)
    return InteractiveSymbol(data, ds, name)
Code example #4
    def __init__(self,
                 data,
                 dshape=None,
                 name=None,
                 fields=None,
                 columns=None,
                 schema=None,
                 **kwargs):
        if isinstance(data, _strtypes):
            data = resource(data,
                            schema=schema,
                            dshape=dshape,
                            columns=columns,
                            **kwargs)
        if columns:
            warnings.warn("columns kwarg deprecated.  Use fields instead",
                          DeprecationWarning)
        if columns and not fields:
            fields = columns
        if schema and dshape:
            raise ValueError("Please specify one of schema= or dshape= keyword"
                             " arguments")
        if schema and not dshape:
            dshape = var * schema
        if dshape and isinstance(dshape, _strtypes):
            dshape = datashape.dshape(dshape)
        if not dshape:
            dshape = discover(data)
            types = None
            if isinstance(dshape.measure, Tuple) and fields:
                types = dshape[1].dshapes
                schema = Record(list(zip(fields, types)))
                dshape = DataShape(*(dshape.shape + (schema, )))
            elif isscalar(dshape.measure) and fields:
                types = (dshape.measure, ) * int(dshape[-2])
                schema = Record(list(zip(fields, types)))
                dshape = DataShape(*(dshape.shape[:-1] + (schema, )))
            elif isrecord(dshape.measure) and fields:
                types = dshape.measure.types
                schema = Record(list(zip(fields, types)))
                dshape = DataShape(*(dshape.shape + (schema, )))

        self.dshape = datashape.dshape(dshape)

        self.data = data

        if (hasattr(data, 'schema') and isinstance(data.schema,
                                                   (DataShape, str, unicode))
                and self.schema != data.schema):
            raise TypeError('%s schema %s does not match %s schema %s' %
                            (type(data).__name__, data.schema,
                             type(self).__name__, self.schema))

        self._name = name or next(names)
Code example #5
File: h5py.py  Project: quasiben/odo
def discover_h5py_dataset(d):
    dshape = datashape.from_numpy(d.shape, d.dtype)
    shape, measure = dshape.shape, dshape.measure
    if not isrecord(measure):
        if dshape == datashape.object_:
            args = shape + (datashape.string, )
            return DataShape(*args)
        return dshape
    else:
        records = list(
            record_dshape_replace(measure, datashape.object_,
                                  datashape.string))
        args = shape + (datashape.Record(records), )
        return DataShape(*args)
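
A small sketch of the non-record path above, assuming numpy and datashape are installed; datashape.from_numpy maps a (shape, dtype) pair onto a DataShape exactly as the first line of the function does:

import numpy as np
import datashape

ds = datashape.from_numpy((10, 3), np.dtype('float64'))
print(ds)                    # expected: 10 * 3 * float64
print(ds.shape, ds.measure)  # the two Fixed dimensions, then the float64 measure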
Code example #6
File: dynd.py  Project: debugger22/into
def list_to_dynd(L, **kwargs):
    ds = kwargs['dshape']
    if isinstance(ds.measure, Tuple):
        measure = Record([['f%d' % i, typ]
                          for i, typ in enumerate(ds.measure.parameters[0])])
        ds = DataShape(*(ds.shape + (measure, )))
    return nd.array(L, dtype=str(ds))
Code example #7
File: dask.py  Project: snmz216/blaze
def compute_up(expr, data, **kwargs):
    leaf = expr._leaves()[0]
    chunk = symbol(
        'chunk',
        DataShape(*(tuple(map(first, data.chunks)) + (leaf.dshape.measure, ))))
    (chunk, chunk_expr), (agg, agg_expr) = split(expr._child,
                                                 expr,
                                                 chunk=chunk)

    inds = tuple(range(ndim(leaf)))
    dtype = expr.dshape.measure.to_numpy_dtype()
    tmp = atop(
        curry(compute_it, chunk_expr, [chunk], **kwargs),
        inds,
        data,
        inds,
        dtype=dtype,
    )

    return atop(
        compose(
            curry(compute_it, agg_expr, [agg], **kwargs),
            curry(_concatenate2, axes=expr.axis),
        ),
        tuple(i for i in inds if i not in expr.axis),
        tmp,
        inds,
        dtype=dtype,
    )
Code example #8
    def _schema(self):
        schema = self._child.schema[0]
        if isinstance(schema, Record) and len(schema.types) == 1:
            result = toolz.first(schema.types)
        else:
            result = schema
        return DataShape(result)
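
A sketch of the single-field unwrap above, assuming datashape and toolz; a one-field Record schema collapses to its only type, anything else passes through unchanged:

import toolz
from datashape import DataShape, Record, dshape

schema = dshape('{amount: float64}').measure
if isinstance(schema, Record) and len(schema.types) == 1:
    result = toolz.first(schema.types)
else:
    result = schema
print(DataShape(result))  # expected: float64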
Code example #9
    def __init__(self, name, dshape):
        self._name = name
        if isinstance(dshape, _strtypes):
            dshape = datashape.dshape(dshape)
        if isinstance(dshape, Mono) and not isinstance(dshape, DataShape):
            dshape = DataShape(dshape)
        self.dshape = dshape
Code example #10
    def dshape(self):
        shape = self._child.dshape.shape
        schema = self._child.dshape.measure.dict[self._name]

        shape = shape + schema.shape
        schema = (schema.measure, )
        return DataShape(*(shape + schema))
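
A sketch of the same field lookup with plain datashape objects (no blaze expression tree); the selected field's own dimensions are appended to the child's shape before its measure:

from datashape import DataShape, dshape

child = dshape('5 * {amount: 3 * float64, name: string}')
schema = child.measure.dict['amount']            # 3 * float64
shape = child.shape + schema.shape               # five rows, three values each
print(DataShape(*(shape + (schema.measure,))))   # expected: 5 * 3 * float64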
Code example #11
File: sql.py  Project: jimmyaspire/odo
def discover(metadata):
    try:
        metadata.reflect(views=metadata.bind.dialect.supports_views)
    except NotImplementedError:
        metadata.reflect()
    pairs = []
    for table in sorted(metadata.tables.values(), key=attrgetter('name')):
        name = table.name
        try:
            pairs.append([name, discover(table)])
        except sa.exc.CompileError as e:
            warnings.warn(
                "Can not discover type of table {name}.\n"
                "SQLAlchemy provided this error message:\n\t{msg}"
                "\nSkipping.".format(
                    name=name,
                    msg=e.message,
                ),
                stacklevel=3,
            )
        except NotImplementedError as e:
            warnings.warn(
                "Odo does not understand a SQLAlchemy type.\n"
                "Odo provided the following error:\n\t{msg}"
                "\nSkipping.".format(msg="\n\t".join(e.args)),
                stacklevel=3,
            )
    return DataShape(Record(pairs))
Code example #12
    def _dshape(self):
        '''
        since pandas supports concat for string columns, do the same for blaze
        '''
        shape = self.lhs.dshape.shape
        if isinstance(self.lhs.schema.measure, Option):
            schema = self.lhs.schema
        elif isinstance(self.rhs.schema.measure, Option):
            schema = self.rhs.schema
        else:
            _, lhs_encoding = self.lhs.schema.measure.parameters
            _, rhs_encoding = self.rhs.schema.measure.parameters
            assert lhs_encoding == rhs_encoding
            # convert fixed length string to variable length string
            schema = DataShape(String(None, lhs_encoding))

        return DataShape(*(shape + (schema, )))
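
A minimal sketch of the else branch above, using only datashape; two fixed-length strings with the same encoding widen to a variable-length string:

from datashape import DataShape, String

lhs_measure = String(10, 'U8')
rhs_measure = String(25, 'U8')
_, lhs_encoding = lhs_measure.parameters
_, rhs_encoding = rhs_measure.parameters
assert lhs_encoding == rhs_encoding
print(DataShape(String(None, lhs_encoding)))  # expected: string['U8']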
Code example #13
    def schema(self):
        subs = dict(self.labels)
        d = self._child.dshape.measure.dict

        return DataShape(
            Record([[subs.get(name, name), dtype]
                    for name, dtype in self._child.dshape.measure.parameters[0]
                    ]))
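
A sketch of the label substitution above with plain datashape objects; the mapping subs stands in for dict(self.labels):

from datashape import DataShape, Record, dshape

subs = {'amount': 'value'}
measure = dshape('var * {name: string, amount: int64}').measure
relabeled = Record([[subs.get(name, name), dtype]
                    for name, dtype in measure.parameters[0]])
print(DataShape(relabeled))  # expected: {name: string, value: int64}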
Code example #14
File: collections.py  Project: wegamekinglc/blaze
    def dshape(self):
        axis = self.axis
        ldshape = self.lhs.dshape
        lshape = ldshape.shape
        return DataShape(
            *(lshape[:axis] +
              (_shape_add(lshape[axis], self.rhs.dshape.shape[axis]), ) +
              lshape[axis + 1:] + (ldshape.measure, )))
Code example #15
    def _schema(self):
        measure = self._child.schema.measure
        base = getattr(measure, 'ty', measure)
        return_type = Option if isinstance(measure, Option) else toolz.identity
        return DataShape(
            return_type(
                base if isinstance(base, Decimal) else
                base if isinstance(base, TimeDelta) else ct.float64, ))
Code example #16
File: json.py  Project: pskyp/shareapplication
def date_to_datetime_dshape(ds):
    shape = ds.shape
    if isinstance(ds.measure, Record):
        measure = Record([[name, ct.datetime_ if typ == ct.date_ else typ]
                          for name, typ in ds.measure.parameters[0]])
    else:
        measure = ds.measure
    return DataShape(*(shape + (measure, )))
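
A usage sketch for the helper above, assuming the snippet's own imports (Record, DataShape, datashape.coretypes as ct) are in scope, plus datashape.dshape to build the input:

from datashape import dshape

ds = dshape('var * {when: date, amount: int64}')
print(date_to_datetime_dshape(ds))  # expected: var * {when: datetime, amount: int64}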
Code example #17
File: constructors.py  Project: xsixing/blaze
def sql_table(table, colnames, measures, conn):
    """
    Create a new blaze Array from an SQL table description. This returns
    a Record array.
    """
    dtype = Record(list(zip(colnames, measures)))
    record_dshape = DataShape(coretypes.Var(), dtype)
    table = TableSelection(table, '*')
    return Array(SQLDataDescriptor(record_dshape, table, conn))
Code example #18
    def _dshape(self):
        axis = self.axis
        if self.keepdims:
            shape = tuple(1 if i in axis else d
                          for i, d in enumerate(self._child.shape))
        else:
            shape = tuple(d for i, d in enumerate(self._child.shape)
                          if i not in axis)
        return DataShape(*(shape + (self.schema, )))
Code example #19
    def _dshape(self):
        shape = self._child.dshape.shape
        measure = self._child.dshape.measure

        # TODO: is this too special-case-y?
        schema = getattr(measure, 'value', measure).dict[self._name]

        shape = shape + schema.shape
        schema = (schema.measure,)
        return DataShape(*(shape + schema))
Code example #20
def fsql(engine, fcsv, name):
    dshape = discover(fcsv)
    dshape = DataShape(
        var, Record([(n, typ) for n, typ in zip('ab', dshape.measure.types)]))
    try:
        t = resource('%s::%s' % (url, name), dshape=dshape)
    except sqlalchemy.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        yield t
        drop(t)
Code example #21
    def _dshape(self):
        axis = self.axis
        if self.keepdims:
            shape = tuple(1 if i in axis else d
                          for i, d in enumerate(self._child.shape))
        else:
            shape = tuple(d for i, d in enumerate(self._child.shape)
                          if i not in axis)
        measure = Record(list(zip(self.names,
                                  [v.schema for v in self.values])))
        return DataShape(*(shape + (measure, )))
Code example #22
    def dshape(self):
        # Compute shape
        shape = tuple([d for i, d in enumerate(self.lhs.shape)
                         if i not in self._left_axes] +
                      [d for i, d in enumerate(self.rhs.shape)
                         if i not in self._right_axes])

        # Compute measure by mimicking a mul and add
        l = symbol('l', self.lhs.dshape.measure)
        r = symbol('r', self.rhs.dshape.measure)
        measure = ((l * r) + (l * r)).dshape.measure

        return DataShape(*(shape + (measure,)))
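
A hedged sketch of the measure-promotion trick above, assuming blaze's symbol is importable as in the surrounding snippets; multiplying and adding two scalar symbols lets datashape's promotion rules pick the output measure:

from blaze import symbol

l = symbol('l', 'int32')
r = symbol('r', 'float64')
print(((l * r) + (l * r)).dshape.measure)  # expected: float64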
Code example #23
def discover(metadata):
    metadata.reflect(views=metadata.bind.dialect.supports_views)
    pairs = []
    for name, table in sorted(metadata.tables.items(), key=first):
        try:
            pairs.append([name, discover(table)])
        except sa.exc.CompileError as e:
            print("Can not discover type of table %s.\n" % name +
                "SQLAlchemy provided this error message:\n\t%s" % e.message +
                "\nSkipping.")
        except NotImplementedError as e:
            print("Blaze does not understand a SQLAlchemy type.\n"
                "Blaze provided the following error:\n\t%s" % e.message +
                "\nSkipping.")
    return DataShape(Record(pairs))
Code example #24
def column_dshape(dshape, colname):
    """
    Given a record dshape, project a column out
    """
    rec = dshape.measure

    if not isinstance(rec, Record):
        raise TypeError("Can only select fields from record type")
    if colname not in rec.fields:
        raise ValueError("No such field %r" % (colname, ))

    measure = rec.fields[colname]
    params = list(dshape.shape) + [measure]
    dshape = DataShape(*params)

    return dshape
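
A rough usage sketch, under the assumption that Record.fields behaves like a mapping in the datashape version this helper was written against:

from datashape import dshape

ds = dshape('var * {name: string, amount: int64}')
print(column_dshape(ds, 'amount'))  # expected: var * int64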
Code example #25
def compute_down(expr, data, **kwargs):
    """ Compute expressions on H5Py datasets by operating on chunks

    This uses blaze.expr.split to break a full-array-computation into a
    per-chunk computation and an on-aggregate computation.

    This uses blaze.partition to pick out chunks from the h5py dataset, uses
    compute(numpy) to compute on each chunk and then uses blaze.partition to
    aggregate these (hopefully smaller) intermediate results into a local
    numpy array.  It then performs a second operation (again given by
    blaze.expr.split) on this intermediate aggregate

    The expression must contain some sort of Reduction.  Both the intermediate
    result and the final result are assumed to fit into memory
    """
    leaf = expr._leaves()[0]
    if not any(isinstance(node, Reduction) for node in path(expr, leaf)):
        raise MDNotImplementedError()

    # Compute chunksize (this should be improved)
    chunksize = kwargs.get('chunksize', data.chunks)

    # Split expression into per-chunk and on-aggregate pieces
    chunk = Symbol('chunk', DataShape(*(chunksize + (leaf.dshape.measure, ))))
    (chunk, chunk_expr), (agg, agg_expr) = \
            split(leaf, expr, chunk=chunk)

    # Create numpy array to hold intermediate aggregate
    shape, dtype = to_numpy(agg.dshape)
    intermediate = np.empty(shape=shape, dtype=dtype)

    # Compute partitions
    data_partitions = partitions(data, chunksize=chunksize)
    int_partitions = partitions(intermediate, chunksize=chunk_expr.shape)

    # For each partition, compute chunk->chunk_expr
    # Insert into intermediate
    # This could be parallelized
    for d, i in zip(data_partitions, int_partitions):
        chunk_data = partition_get(data, d, chunksize=chunksize)
        result = compute(chunk_expr, {chunk: chunk_data})
        partition_set(intermediate, i, result, chunksize=chunk_expr.shape)

    # Compute on the aggregate
    return compute(agg_expr, {agg: intermediate})
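
A hedged sketch of the split step the docstring describes, assuming blaze.expr.split is importable as the snippet implies; the exact form of the two sub-expressions depends on the blaze version:

from blaze import symbol
from blaze.expr.split import split
from datashape import DataShape

x = symbol('x', '1000 * 1000 * float64')
expr = x.sum()
chunk = symbol('chunk', DataShape(*((250, 250) + (x.dshape.measure,))))
(chunk, chunk_expr), (agg, agg_expr) = split(x, expr, chunk=chunk)
print(chunk_expr)  # per-chunk piece, e.g. a keepdims sum over one chunk
print(agg_expr)    # on-aggregate piece, e.g. a sum over the intermediate array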
Code example #26
def compute_down(expr, data, map=None, **kwargs):
    """ Compute expressions on H5Py datasets by operating on chunks

    This uses blaze.expr.split to break a full-array-computation into a
    per-chunk computation and an on-aggregate computation.

    This uses blaze.partition to pick out chunks from the h5py dataset, uses
    compute(numpy) to compute on each chunk and then uses blaze.partition to
    aggregate these (hopefully smaller) intermediate results into a local
    numpy array.  It then performs a second operation (again given by
    blaze.expr.split) on this intermediate aggregate

    The expression must contain some sort of Reduction.  Both the intermediate
    result and the final result are assumed to fit into memory
    """
    map = _get_map(map)

    leaf = expr._leaves()[0]
    if not any(isinstance(node, Reduction) for node in path(expr, leaf)):
        raise MDNotImplementedError()

    # Compute chunksize (this should be improved)
    chunksize = kwargs.get('chunksize', data.chunks)

    # Split expression into per-chunk and on-aggregate pieces
    chunk = symbol('chunk', DataShape(*(chunksize + (leaf.dshape.measure,))))
    (chunk, chunk_expr), (agg, agg_expr) = \
            split(leaf, expr, chunk=chunk)

    # Create numpy array to hold intermediate aggregate
    shape, dtype = to_numpy(agg.dshape)
    intermediate = np.empty(shape=shape, dtype=dtype)

    # Compute partitions
    source_parts = list(partitions(data, chunksize=chunksize, keepdims=True))
    target_parts = list(partitions(intermediate, chunksize=chunk_expr.shape,
                                   keepdims=True))

    list(map(
        curry(compute_chunk, data, intermediate, chunk, chunk_expr),
        zip(source_parts, target_parts)
    ))

    # Compute on the aggregate
    return compute(agg_expr, {agg: intermediate}, return_type='native')
Code example #27
File: datadescriptor.py  Project: xsixing/blaze
    def dynd_arr(self):
        # TODO: This should really use blz
        if self._dynd_result is not None:
            return self._dynd_result

        # Allocate empty dynd array
        length = sum(len(chunk) for chunk in self.query_result)
        ds = DataShape(length, self.dshape.measure)
        result = nd.empty(str(ds))

        # Fill dynd array with chunks
        offset = 0
        for chunk in self.query_result:
            result[offset:offset + len(chunk)] = chunk
            offset += len(chunk)

        self._dynd_result = result
        return result
Code example #28
def dynd_chunk_iterator(result, chunk_size=1024):
    """
    Turn a query Result into a bunch of DyND arrays
    """
    cursor = result.cursor

    chunk_size = max(cursor.arraysize, chunk_size)
    while True:
        try:
            results = cursor.fetchmany(chunk_size)
        except db.Error:
            break

        if not results:
            break

        dshape = DataShape(len(results), result.dshape.measure)
        chunk = nd.empty(str(dshape))
        chunk[:] = list(iter_result(results, dshape))
        yield chunk
Code example #29
def coalesce(a, b):
    a_dshape = discover(a)
    a_measure = a_dshape.measure
    isoption = isinstance(a_measure, Option)
    if isoption:
        a_measure = a_measure.ty
    isnull = isinstance(a_measure, Null)
    if isnull:
        # a is always null, this is just b
        return b

    if not isoption:
        # a is not an option, this is just a
        return a

    b_dshape = discover(b)
    return Coalesce(a, b, DataShape(*(
        maxshape((a_dshape.shape, b_dshape.shape)) +
        (promote(a_measure, b_dshape.measure),)
    )))
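
A short sketch of the measure promotion above, assuming datashape exports promote as the surrounding blaze code relies on; an optional input measure is expected to carry its optionality through:

from datashape import Option, int32, float64, promote

print(promote(int32, float64))          # expected: float64
print(promote(Option(int32), float64))  # assumption: promotes to ?float64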
Code example #30
def sql_table(table_name, colnames, measures, conn):
    """
    Create a new blaze Array from an SQL table description. This returns
    a Record array.

    Parameters
    ==========

    table_name: str
        table name

    colnames: [str]
        column names

    measures: [DataShape]
        measure (element type) for each column

    conn: pyodbc/whatever Connection
    """
    dtype = Record(list(zip(colnames, measures)))
    record_dshape = DataShape(coretypes.Var(), dtype)
    table = TableSelection(table_name, '*')
    return Array(SQL_DDesc(record_dshape, table, conn))