class Quantile(Reduction):
    """Reduction declaring a quantile computation with a float64 result."""

    # Input being reduced; accepted by the permissive ``rlz.any`` rule.
    arg = rlz.any
    # Requested quantile, validated by ``rlz.strict_numeric``.
    quantile = rlz.strict_numeric
    # Interpolation mode; must be one of the names listed below.
    interpolation = rlz.isin(
        {
            "linear",
            "lower",
            "higher",
            "midpoint",
            "nearest",
        }
    )

    output_dtype = dt.float64
class Bucket(BucketLike):
    """Bucketing operation over a column using explicit bucket edges."""

    arg = rlz.column(rlz.any)
    # Bucket edges, each validated as a scalar.
    buckets = rlz.tuple_of(rlz.scalar(rlz.any))
    closed = rlz.optional(rlz.isin({"left", "right"}), default="left")
    close_extreme = rlz.optional(rlz.instance_of(bool), default=True)
    include_under = rlz.optional(rlz.instance_of(bool), default=False)
    include_over = rlz.optional(rlz.instance_of(bool), default=False)

    def __init__(self, buckets, include_under, include_over, **kwargs):
        """Check the bucket edges, then defer to the parent constructor.

        Raises
        ------
        ValueError
            If ``buckets`` is empty, or if it holds a single edge and
            ``include_under`` and ``include_over`` are not both truthy.
        """
        edge_count = len(buckets)
        if not edge_count:
            raise ValueError('Must be at least one bucket edge')
        # A single edge only defines buckets when both open-ended
        # buckets are requested.
        if edge_count == 1 and not (include_under and include_over):
            raise ValueError('If one bucket edge provided, must have '
                             'include_under=True and include_over=True')
        super().__init__(
            buckets=buckets,
            include_under=include_under,
            include_over=include_over,
            **kwargs,
        )

    @property
    def nbuckets(self):
        """Number of buckets: edges minus one, plus one per enabled open end."""
        edge_count = len(self.buckets)
        return edge_count - 1 + self.include_over + self.include_under
class MultiQuantile(Quantile):
    """Quantile variant taking an array of quantiles and returning an array."""

    arg = rlz.any
    # Quantiles are supplied as a value of type array<float64>.
    quantile = rlz.value(dt.Array(dt.float64))
    # Interpolation mode; must be one of the names listed below.
    interpolation = rlz.isin(
        {
            "linear",
            "lower",
            "higher",
            "midpoint",
            "nearest",
        }
    )

    output_dtype = dt.Array(dt.float64)
class Correlation(Filterable, Reduction):
    """Correlation coefficient computed over pairs of numbers."""

    # The two numeric columns supplying the pairs.
    left = rlz.column(rlz.numeric)
    right = rlz.column(rlz.numeric)
    # Must be either "sample" or "pop".
    how = rlz.isin({"sample", "pop"})

    output_dtype = dt.float64
class Covariance(Filterable, Reduction):
    """Covariance computed over pairs of numbers."""

    # The two numeric columns supplying the pairs.
    left = rlz.column(rlz.numeric)
    right = rlz.column(rlz.numeric)
    # Must be either "sample" or "pop".
    how = rlz.isin({"sample", "pop"})

    output_dtype = dt.float64
class TimestampFromUNIX(Value):
    """Value operation producing a timestamp from ``arg`` and a time unit."""

    arg = rlz.any
    # Only pandas-based backends support 'ns'
    unit = rlz.isin({'s', 'ms', 'us', 'ns'})

    output_dtype = dt.timestamp
    # ``output_shape`` used to be assigned twice; the later
    # ``shape_like("args")`` silently replaced ``shape_like('arg')``.
    # ``unit`` is a plain string choice, so ``arg`` is the only operand
    # that can determine the shape; a single declaration is kept.
    output_shape = rlz.shape_like("arg")
class VarianceBase(Filterable, Reduction):
    """Shared field declarations for reductions over one numeric column."""

    arg = rlz.column(rlz.numeric)
    # Must be either "sample" or "pop".
    how = rlz.isin({"sample", "pop"})

    @immutable_property
    def output_dtype(self):
        """Return float64, or the ``largest`` decimal type for decimal input."""
        if not isinstance(self.arg, ir.DecimalValue):
            return dt.float64
        return self.arg.type().largest
class DropNa(TableNode, sch.HasSchema):
    """Table node that drops null values from ``table``."""

    table = rlz.table
    # Must be either "any" or "all".
    how = rlz.isin({"any", "all"})
    # Optional tuple of columns taken from ``table``; empty by default.
    subset = rlz.optional(
        rlz.tuple_of(rlz.column_from("table")),
        default=(),
    )

    @property
    def schema(self):
        """Schema of this node, taken unchanged from the input table."""
        source = self.table
        return source.schema()
class IntervalFromInteger(Value):
    """Value operation building an interval from an integer and a unit."""

    arg = rlz.integer
    # Unit code; must be one of the entries below.
    unit = rlz.isin(
        {
            "Y", "Q", "M", "W", "D",
            "h", "m", "s", "ms", "us", "ns",
        }
    )

    output_shape = rlz.shape_like("arg")

    @immutable_property
    def output_dtype(self):
        """Interval type built from ``unit`` and the type of ``arg``."""
        value_type = self.arg.type()
        return dt.Interval(self.unit, value_type)

    @property
    def resolution(self):
        """The ``resolution`` attribute of the output interval type."""
        interval_type = self.output_dtype
        return interval_type.resolution
class ParseURL(Value):
    """Value operation extracting a named part from a string URL."""

    arg = rlz.string
    # Name of the part to extract; must be one of the entries below.
    extract = rlz.isin(
        {
            "PROTOCOL",
            "HOST",
            "PATH",
            "REF",
            "AUTHORITY",
            "FILE",
            "USERINFO",
            "QUERY",
        }
    )
    # Optional string key.
    key = rlz.optional(rlz.string)

    output_shape = rlz.shape_like("arg")
    output_dtype = dt.string
class Histogram(BucketLike):
    """Histogram bucketing configured by a bin count or a bin width."""

    arg = Arg(rlz.noop)
    nbins = Arg(rlz.noop, default=None)
    binwidth = Arg(rlz.noop, default=None)
    base = Arg(rlz.noop, default=None)
    closed = Arg(rlz.isin({"left", "right"}), default="left")
    aux_hash = Arg(rlz.noop, default=None)

    def _validate(self):
        """Require exactly one of ``nbins`` and ``binwidth`` to be set.

        Raises
        ------
        ValueError
            If both are ``None`` or both are provided.
        """
        nbins_missing = self.nbins is None
        binwidth_missing = self.binwidth is None

        if nbins_missing and binwidth_missing:
            raise ValueError('Must indicate nbins or binwidth')
        if not nbins_missing and not binwidth_missing:
            raise ValueError('nbins and binwidth are mutually exclusive')

    def output_type(self):
        """Return the column type of the category datatype."""
        # always undefined cardinality (for now)
        category = dt.category
        return category.column_type()
class Bucket(BucketLike):
    """Bucketing operation using explicit bucket edges."""

    arg = Arg(rlz.noop)
    buckets = Arg(rlz.noop)
    closed = Arg(rlz.isin({"left", "right"}), default="left")
    close_extreme = Arg(bool, default=True)
    include_under = Arg(bool, default=False)
    include_over = Arg(bool, default=False)

    def _validate(self):
        """Check that the bucket edges describe at least one bucket.

        Raises
        ------
        ValueError
            If ``buckets`` is empty, or if it holds a single edge and
            ``include_under`` and ``include_over`` are not both truthy.
        """
        edge_count = len(self.buckets)
        if not edge_count:
            raise ValueError('Must be at least one bucket edge')
        # A single edge only defines buckets when both open-ended
        # buckets are requested.
        if edge_count == 1 and not (self.include_under and self.include_over):
            raise ValueError('If one bucket edge provided, must have '
                             'include_under=True and include_over=True')

    @property
    def nbuckets(self):
        """Number of buckets: edges minus one, plus one per enabled open end."""
        edge_count = len(self.buckets)
        return edge_count - 1 + self.include_over + self.include_under
class Histogram(BucketLike):
    """Histogram bucketing configured by a bin count or a bin width."""

    arg = rlz.numeric
    nbins = rlz.optional(rlz.instance_of(int))
    binwidth = rlz.optional(rlz.scalar(rlz.numeric))
    base = rlz.optional(rlz.scalar(rlz.numeric))
    closed = rlz.optional(rlz.isin({"left", "right"}), default="left")
    aux_hash = rlz.optional(rlz.instance_of(str))

    def __init__(self, nbins, binwidth, **kwargs):
        """Require exactly one of ``nbins`` and ``binwidth``, then construct.

        Raises
        ------
        ValueError
            If both are ``None`` or both are provided.
        """
        nbins_missing = nbins is None
        binwidth_missing = binwidth is None

        if nbins_missing and binwidth_missing:
            raise ValueError('Must indicate nbins or binwidth')
        if not nbins_missing and not binwidth_missing:
            raise ValueError('nbins and binwidth are mutually exclusive')

        super().__init__(nbins=nbins, binwidth=binwidth, **kwargs)

    @property
    def output_dtype(self):
        """The category datatype."""
        # always undefined cardinality (for now)
        return dt.category
class ToIntervalUnit(Value):
    """Value operation re-expressing an interval in another unit."""

    arg = rlz.interval
    # Target unit code; must be one of the entries below.
    unit = rlz.isin(
        {
            "Y", "Q", "M", "W", "D",
            "h", "m", "s", "ms", "us", "ns",
        }
    )

    output_shape = rlz.shape_like("arg")

    def __init__(self, arg, unit):
        """Convert ``arg`` to ``unit`` when its current unit differs."""
        source_type = arg.type()
        needs_conversion = source_type.unit != unit
        if needs_conversion:
            arg = util.convert_unit(arg, source_type.unit, unit)
        super().__init__(arg=arg, unit=unit)

    @immutable_property
    def output_dtype(self):
        """Interval type in the target unit.

        ``value_type`` and ``nullable`` are copied from the type of ``arg``.
        """
        source_type = self.arg.type()
        interval_kwargs = dict(
            unit=self.unit,
            value_type=source_type.value_type,
            nullable=source_type.nullable,
        )
        return dt.Interval(**interval_kwargs)
def test_invalid_isin(values, value, expected):
    """``rlz.isin(values, value)`` must raise the ``expected`` exception."""
    with pytest.raises(expected):
        rlz.isin(values, value)
def test_valid_isin(values, value, expected):
    """``rlz.isin(values, value)`` must return a result equal to ``expected``."""
    outcome = rlz.isin(values, value)
    assert outcome == expected
class Arbitrary(Filterable, Reduction):
    """Filterable reduction over a column with an optional ``how`` mode."""

    arg = rlz.column(rlz.any)
    # Optional; when given must be "first", "last" or "heavy".
    how = rlz.optional(
        rlz.isin({"first", "last", "heavy"}),
    )

    # The output datatype follows that of ``arg``.
    output_dtype = rlz.dtype_like("arg")
class Hash(ValueOp):
    """Value operation hashing ``arg`` with a named algorithm."""

    arg = Arg(rlz.any)
    # Algorithm name; must be "fnv" or "farm_fingerprint".
    how = Arg(
        rlz.isin({"fnv", "farm_fingerprint"}),
    )

    # Shaped like ``arg``, declared with the int64 datatype.
    output_type = rlz.shape_like("arg", dt.int64)
class HashBytes(ValueOp):
    """Value operation hashing a string or binary value."""

    # Accepts either a string value or a binary value.
    arg = Arg(
        rlz.one_of(
            [
                rlz.value(dt.string),
                rlz.value(dt.binary),
            ]
        )
    )
    # Algorithm name; must be "sha256" or "farm_fingerprint".
    how = Arg(
        rlz.isin({"sha256", "farm_fingerprint"}),
    )

    # Shaped like ``arg``, declared with the "binary" datatype.
    output_type = rlz.shape_like("arg", "binary")
class TimeTruncate(Value):
    """Value operation truncating a time value to a unit."""

    arg = rlz.time
    # Must be a member of the module-level ``_time_units`` collection.
    unit = rlz.isin(_time_units)

    output_dtype = dt.time
    output_shape = rlz.shape_like("arg")
class DateTruncate(Value):
    """Value operation truncating a date value to a unit."""

    arg = rlz.date
    # Must be a member of the module-level ``_date_units`` collection.
    unit = rlz.isin(_date_units)

    output_dtype = dt.date
    output_shape = rlz.shape_like("arg")
class Hash(Value):
    """Value operation hashing ``arg`` to an int64 with a named algorithm."""

    arg = rlz.any
    # Algorithm name; must be "fnv" or "farm_fingerprint".
    how = rlz.isin(
        {"fnv", "farm_fingerprint"},
    )

    output_shape = rlz.shape_like("arg")
    output_dtype = dt.int64
class HashBytes(Value):
    """Value operation hashing a string or binary value to binary."""

    # Accepts either a string value or a binary value.
    arg = rlz.one_of(
        {
            rlz.value(dt.string),
            rlz.value(dt.binary),
        }
    )
    # Algorithm name; must be one of the entries below.
    how = rlz.isin(
        {
            "md5",
            "sha1",
            "sha256",
            "sha512",
        }
    )

    output_shape = rlz.shape_like("arg")
    output_dtype = dt.binary