def new_dataset(expr, deltas):
    """Build a ``DataSet`` subclass with one attribute per field of ``expr``.

    Parameters
    ----------
    expr : Expr
        The blaze expression representing the first known values. Its
        ``dshape.measure.fields`` supplies the (name, type) pairs and its
        ``_name`` supplies the name of the generated type.
    deltas : Expr
        The blaze expression representing the deltas to the data. It is
        not read inside this function body.

    Returns
    -------
    ds : type
        A new dataset type.

    Notes
    -----
    This function is memoized. Repeated calls with the same inputs will
    return the same type.
    """
    def _column_for(field_name, field_type):
        # Only the promotion check and the Option unwrapping are guarded;
        # building the Column happens outside the try so its own errors
        # propagate to the caller.
        try:
            if promote(field_type, float64, promote_option=False) != float64:
                raise NotPipelineCompatible()
            if isinstance(field_type, Option):
                field_type = field_type.ty
        except NotPipelineCompatible:
            return NonPipelineField(field_name, field_type)
        except TypeError:
            return NonNumpyField(field_name, field_type)
        return Column(field_type.to_numpy_dtype().type)

    columns = {}
    for field_name, field_type in expr.dshape.measure.fields:
        columns[field_name] = _column_for(field_name, field_type)

    ds_name = expr._name
    if ds_name is None:
        # Unnamed expressions take the next generated name.
        ds_name = next(_new_names)

    # unicode is a name error in py3 but the branch is only hit
    # when we are in python 2.
    if PY2 and isinstance(ds_name, unicode):  # noqa
        ds_name = ds_name.encode('utf-8')

    return type(ds_name, (DataSet,), columns)
def coalesce(a, b):
    """Return ``a``, ``b`` or a ``Coalesce`` node depending on ``a``'s measure.

    * If the measure of ``a`` (after unwrapping an ``Option``) is ``Null``,
      ``b`` is returned unchanged.
    * If the measure of ``a`` is not an ``Option``, ``a`` is returned
      unchanged.
    * Otherwise a ``Coalesce(a, b, dshape)`` is built, where the datashape
      combines ``maxshape`` of both shapes with the promoted measure.
    """
    shape_of_a = discover(a)
    measure = shape_of_a.measure

    a_is_optional = isinstance(measure, Option)
    if a_is_optional:
        measure = measure.ty

    if isinstance(measure, Null):
        # a is always null, so the result is just b
        return b

    if not a_is_optional:
        # a can never be missing, so the result is just a
        return a

    shape_of_b = discover(b)
    dims = maxshape((shape_of_a.shape, shape_of_b.shape))
    promoted_measure = promote(measure, shape_of_b.measure)
    return Coalesce(a, b, DataShape(*(dims + (promoted_measure,))))
def promote(type1, type2):
    """Promote two types to a common type.

    Equal types are returned as-is. Two ``inference.Method`` types are
    promoted by promoting their second parameters; the method type whose
    parameter equals the promoted result is returned, otherwise a
    ``TypeError`` is raised. Everything else is converted with ``to_blaze``,
    promoted with ``ds.promote`` and passed through ``resolve_type``.
    """
    from flypy.compiler.typing import inference

    if type1 == type2:
        return type1

    method_cls = inference.Method
    both_methods = (type(type1), type(type2)) == (method_cls, method_cls)

    if not both_methods:
        blaze1, blaze2 = to_blaze(type1), to_blaze(type2)
        return resolve_type(ds.promote(blaze1, blaze2))

    # promote Method types
    # TODO: Bit of a hack, do this better
    _, receiver1 = type1.parameters
    _, receiver2 = type2.parameters
    promoted = promote(receiver1, receiver2)

    if promoted == receiver1:
        return type1
    if promoted == receiver2:
        return type2
    raise TypeError("Cannot promote methods %s and %s" % (type1, type2))
def schema(self):
    """Record datashape of the joined result.

    Join-key fields come first, typed by promoting the left and right key
    types with ``promote_option=False``. They are followed by the remaining
    left fields and then the remaining right fields. Non-key names present
    on both sides receive the suffixes from ``self.suffixes``. Fields from
    the side that may be unmatched (``how`` of 'left', 'right' or 'outer')
    are wrapped in ``Option``.

    Examples
    --------

    >>> from blaze import symbol
    >>> t = symbol('t', 'var * {name: string, amount: int}')
    >>> s = symbol('t', 'var * {name: string, id: int}')

    >>> join(t, s).schema
    dshape("{name: string, amount: int32, id: int32}")

    >>> join(t, s, how='left').schema
    dshape("{name: string, amount: int32, id: ?int32}")

    Overlapping but non-joined fields append _left, _right

    >>> a = symbol('a', 'var * {x: int, y: int}')
    >>> b = symbol('b', 'var * {x: int, y: int}')
    >>> join(a, b, 'x').fields
    ['x', 'y_left', 'y_right']
    """
    def as_option(dt):
        # Already-optional types are left alone rather than double wrapped.
        return dt if isinstance(dt, Option) else Option(dt)

    def as_key_sequence(keys):
        # A lone key becomes a 1-tuple; lists pass through untouched.
        return keys if isinstance(keys, list) else (keys,)

    on_left = as_key_sequence(self.on_left)
    on_right = as_key_sequence(self.on_right)

    # Re-key the right-hand field types by the matching left-hand key name
    # (right fields that are not join keys end up under the key ``None``).
    right_to_left = dict(zip(on_right, on_left))
    right_types = keymap(right_to_left.get, self.rhs.dshape.measure.dict)

    joined = (
        (name, promote(dt, right_types[name], promote_option=False))
        for name, dt in self.lhs.dshape.measure.fields
        if name in on_left
    )

    lhs_names = self.lhs.fields
    left = [
        (name, dt)
        for name, dt in zip(lhs_names, types_of_fields(lhs_names, self.lhs))
        if name not in on_left
    ]
    rhs_names = self.rhs.fields
    right = [
        (name, dt)
        for name, dt in zip(rhs_names, types_of_fields(rhs_names, self.rhs))
        if name not in on_right
    ]

    # Handle overlapping but non-joined case: names left on both sides
    # after the join keys were removed.
    overlap = {name for name, _ in left} & {name for name, _ in right}
    left_suffix, right_suffix = self.suffixes

    def with_suffix(fields, suffix):
        return (
            (name + suffix if name in overlap else name, dt)
            for name, dt in fields
        )

    left = with_suffix(left, left_suffix)
    right = with_suffix(right, right_suffix)

    how = self.how
    if how in ('right', 'outer'):
        left = ((name, as_option(dt)) for name, dt in left)
    if how in ('left', 'outer'):
        right = ((name, as_option(dt)) for name, dt in right)

    return dshape(Record(chain(joined, left, right)))
def test_simple():
    """Promoting ``int64`` with ``float32`` gives ``float64``."""
    assert promote(int64, float32) == float64
def test_no_promote_option():
    """With ``promote_option=False`` the result is the bare ``float64``."""
    assert promote(int64, Option(float64), promote_option=False) == float64
def test_option():
    """Promoting against an ``Option`` type yields an ``Option`` result."""
    assert promote(int64, Option(float32)) == Option(float64)
def _dtype(self):
    """Return the promotion of the discovered measures of both operands."""
    # .schema / .datashape cannot be used directly because an operand may be
    # a bare integer, for example, so each side is run through ``discover``.
    left_measure = discover(self.lhs).measure
    right_measure = discover(self.rhs).measure
    return promote(left_measure, right_measure)
def schema(self):
    """Datashape of one row of the join result.

    The output record lists the join keys first (left-hand names, with the
    left and right key types promoted using ``promote_option=False``), then
    the non-key left fields, then the non-key right fields. Non-key names
    shared by both sides get ``self.suffixes`` appended, and fields from a
    side that may have no match are made optional.

    Examples
    --------

    >>> from blaze import symbol
    >>> t = symbol('t', 'var * {name: string, amount: int}')
    >>> s = symbol('t', 'var * {name: string, id: int}')

    >>> join(t, s).schema
    dshape("{name: string, amount: int32, id: int32}")

    >>> join(t, s, how='left').schema
    dshape("{name: string, amount: int32, id: ?int32}")

    Overlapping but non-joined fields append _left, _right

    >>> a = symbol('a', 'var * {x: int, y: int}')
    >>> b = symbol('b', 'var * {x: int, y: int}')
    >>> join(a, b, 'x').fields
    ['x', 'y_left', 'y_right']
    """
    def make_optional(dt):
        if isinstance(dt, Option):
            return dt
        return Option(dt)

    # Normalise the join keys: anything that is not a list is treated as a
    # single key and wrapped in a 1-tuple.
    on_left = self.on_left
    on_left = on_left if isinstance(on_left, list) else (on_left,)
    on_right = self.on_right
    on_right = on_right if isinstance(on_right, list) else (on_right,)

    # Right-hand types looked up by the corresponding left-hand key name.
    left_name_for = dict(zip(on_right, on_left)).get
    right_types = keymap(left_name_for, self.rhs.dshape.measure.dict)

    joined = (
        (key, promote(key_type, right_types[key], promote_option=False))
        for key, key_type in self.lhs.dshape.measure.fields
        if key in on_left
    )

    left_pairs = zip(
        self.lhs.fields, types_of_fields(self.lhs.fields, self.lhs))
    left = [(name, dt) for name, dt in left_pairs if name not in on_left]

    right_pairs = zip(
        self.rhs.fields, types_of_fields(self.rhs.fields, self.rhs))
    right = [(name, dt) for name, dt in right_pairs if name not in on_right]

    # Handle overlapping but non-joined case: both lists already exclude
    # the join keys, so their shared names are exactly the clashes.
    left_names = set(name for name, _ in left)
    right_names = set(name for name, _ in right)
    overlap = left_names & right_names

    left_suffix, right_suffix = self.suffixes
    left = (
        (name + left_suffix, dt) if name in overlap else (name, dt)
        for name, dt in left
    )
    right = (
        (name + right_suffix, dt) if name in overlap else (name, dt)
        for name, dt in right
    )

    if self.how in ('right', 'outer'):
        left = ((name, make_optional(dt)) for name, dt in left)
    if self.how in ('left', 'outer'):
        right = ((name, make_optional(dt)) for name, dt in right)

    return dshape(Record(chain(joined, left, right)))
def new_dataset(expr, deltas, missing_values):
    """Build a ``DataSet`` subclass with one attribute per field of ``expr``.

    Parameters
    ----------
    expr : Expr
        The blaze expression representing the first known values.
    deltas : Expr
        The blaze expression representing the deltas to the data. It is
        not read inside this function body.
    missing_values : frozenset of (name, value) pairs
        Association pairs of column name and missing_value for that
        column. This needs to be a frozenset rather than a dict or tuple
        of tuples because we want a collection that's unordered but still
        hashable.

    Returns
    -------
    ds : type
        A new dataset type.

    Notes
    -----
    This function is memoized. Repeated calls with the same inputs will
    return the same type.
    """
    missing_by_name = dict(missing_values)

    # sid and timestamp get no column: they're implicitly the labels of
    # the arrays that will be passed to pipeline Terms.
    implicit_labels = (SID_FIELD_NAME, TS_FIELD_NAME)

    def _column_for(field_name, field_type):
        # Only the promotion check and the Option unwrapping are guarded;
        # building the Column happens outside the try so its own errors
        # propagate to the caller.
        try:
            # TODO: This should support datetime and bool columns.
            if promote(field_type, float64, promote_option=False) != float64:
                raise NotPipelineCompatible()
            if isinstance(field_type, Option):
                field_type = field_type.ty
        except NotPipelineCompatible:
            return NonPipelineField(field_name, field_type)
        except TypeError:
            return NonNumpyField(field_name, field_type)
        return Column(
            field_type.to_numpy_dtype(),
            missing_by_name.get(field_name, NotSpecified),
        )

    columns = {}
    for field_name, field_type in expr.dshape.measure.fields:
        if field_name in implicit_labels:
            continue
        columns[field_name] = _column_for(field_name, field_type)

    ds_name = expr._name
    if ds_name is None:
        # Unnamed expressions take the next generated name.
        ds_name = next(_new_names)

    # unicode is a name error in py3 but the branch is only hit
    # when we are in python 2.
    if PY2 and isinstance(ds_name, unicode):  # noqa
        ds_name = ds_name.encode('utf-8')

    return type(ds_name, (DataSet,), columns)
def test_promote_string_with_option(x, y, p, r):
    """Promotion gives ``r`` regardless of the argument order."""
    forward = promote(x, y, promote_option=p)
    backward = promote(y, x, promote_option=p)
    assert forward == backward == r
def test_promote_datetime_with_option(x, y, p, r):
    """Promotion gives ``r`` regardless of the argument order."""
    forward = promote(x, y, promote_option=p)
    backward = promote(y, x, promote_option=p)
    assert forward == backward == r