def test_count_unique(self):
    # count_unique counts distinct values; NaN counts as a value
    # unless explicitly dropped.
    cases = [
        (EMPTY_VECTOR, 0),
        (Vector([1]), 1),
        (Vector([1, 1]), 1),
        (Vector([1, 1, 2]), 2),
        (Vector([1, 1, 2, NaN]), 3),
    ]
    for vector, expected in cases:
        assert count_unique(vector) == expected
    # With drop_na=True, NaN is excluded from the count.
    assert count_unique(Vector([1, 1, 2, NaN]), drop_na=True) == 2
def test___new___na_datetime(self):
    # Mixed datetime input should be converted to np.datetime64,
    # with NaN and None both normalized to NaT.
    result = Vector([DATETIME, NaN, NaT, None])
    expected = Vector([DATETIME, NaT, NaT, NaT])
    assert result.is_datetime()
    assert result.equal(expected)
def test___new___na_integer(self):
    # Integer input with missing values is upcast to float,
    # with NaN and None both normalized to NaN.
    result = Vector([1, 2, NaN, None])
    expected = Vector([1, 2, NaN, NaN])
    assert result.is_float()
    assert result.equal(expected)
def test_as_string_length(self):
    # With no explicit length, the string dtype is sized to the
    # existing content, so a longer assignment gets truncated.
    short = Vector([""]).as_string()
    short[0] = "hello"
    assert short[0] == "h"
    # An explicit length reserves room for longer assignments.
    wide = Vector([""]).as_string(5)
    wide[0] = "hello"
    assert wide[0] == "hello"
def test___new___na_boolean(self):
    # Boolean input with missing values is upcast to object,
    # with NaN and None both normalized to None.
    result = Vector([True, False, NaN, None])
    expected = Vector([True, False, None, None])
    assert result.is_object()
    assert result.equal(expected)
def test_fast_iterator(self):
    # Vector.fast should accept any iterable, not just lists.
    reference = Vector.fast(list(range(10)))
    iterables = (
        range(10),
        (x for x in range(10)),
        map(lambda x: x, range(10)),
    )
    for iterable in iterables:
        assert reference.equal(Vector.fast(iterable))
def __new__(cls, object, dtype=None, nrow=None):
    """
    Construct a new column from `object`, optionally broadcast to `nrow`.

    `object` is first coerced to a sequence and wrapped in a Vector;
    if `nrow` is given and differs from the vector's length, only a
    length-one vector can be broadcast by repetition.
    """
    sequence = cls._sequencify(object)
    vector = Vector(sequence, dtype)
    if nrow is not None and nrow != vector.length:
        # Broadcasting is only defined for a single value repeated
        # a positive number of times.
        if vector.length != 1 or nrow < 1:
            raise ValueError("Bad arguments for broadcast")
        vector = vector.repeat(nrow)
    return vector.view(cls)
def test___new___na_object(self):
    # With an explicit object dtype, regular values are kept as-is
    # and missing values become None (empty string is not missing).
    vector = Vector(["a", "b", "", NaN, None], object)
    assert vector.is_object()
    for i, expected in enumerate(["a", "b", ""]):
        assert vector[i] == expected
    assert vector[3] is None
    assert vector[4] is None
def aggregate(self, **colname_function_pairs):
    """
    Return group-wise calculated summaries.

    Usually aggregation is preceded by grouping, which can be
    conveniently written via method chaining as
    ``data.group_by(...).aggregate(...)``.

    In `colname_function_pairs`, `function` receives as an argument
    a data frame object, a group-wise subset of all rows. It should
    return a scalar value. Common aggregation functions have
    shorthand helpers available under :mod:`dataiter`, see the
    guide on :doc:`aggregation </aggregation>` for details.

    >>> data = di.read_csv("data/listings.csv")
    >>> # The below aggregations are identical. Usually you'll get by
    >>> # with the shorthand helpers, but for complicated calculations,
    >>> # you might need custom lambda functions.
    >>> data.group_by("hood").aggregate(n=di.count(), price=di.mean("price"))
    >>> data.group_by("hood").aggregate(n=lambda x: x.nrow, price=lambda x: x.price.mean())
    """
    group_colnames = self._group_colnames
    # Sort by the group columns so that each group's rows are
    # contiguous, then record the original row positions.
    data = self.sort(**dict.fromkeys(group_colnames, 1))
    data._index_ = np.arange(data.nrow)
    # One row per group; stat._index_ holds each group's first row,
    # which also gives the split points for per-group row indices.
    stat = data.unique(*group_colnames).select("_index_", *group_colnames)
    indices = np.split(data._index_, stat._index_[1:])
    group_aware = [getattr(x, "group_aware", False)
                   for x in colname_function_pairs.values()]
    if any(group_aware):
        # Group-aware functions operate on the whole data frame and
        # need a _group_ column marking each row's group number.
        groups = Vector.fast(range(len(indices)), int)
        n = Vector.fast(map(len, indices), int)
        data._group_ = np.repeat(groups, n)
    # Per-group data frame views, built lazily only if some function
    # is not group-aware.
    slices = None
    for colname, function in colname_function_pairs.items():
        if getattr(function, "group_aware", False):
            # function might leave Nones in its output,
            # once those are replaced with the proper default
            # we can do a fast conversion to DataFrameColumn.
            column = function(data)
            default = function.default
            for i in range(len(column)):
                if column[i] is None:
                    column[i] = default
            assert len(column) == stat.nrow
            column = DataFrameColumn.fast(column)
            stat[colname] = column
        else:
            # When using an arbitrary function, we cannot know
            # what special values to expect and thus we end up
            # needing to use the slow Vector.__init__.
            if slices is None:
                slices = [data._view_rows(x) for x in indices]
            stat[colname] = [function(x) for x in slices]
    return stat.unselect("_index_", "_group_")
def test_rank_without_ties(self):
    # Without ties, all tie-breaking methods should give the same
    # result. NOTE(review): the fourth assert compares "min" against
    # itself and is thus a tautology — the intended method was
    # presumably "dense" (the remaining member of the usual
    # average/min/max/dense/ordinal set); confirm and fix.
    a = Vector([NaN, 3, 2, 4, 5, 1])
    reference = a.rank(method="min")
    assert reference.equal(a.rank(method="average"))
    assert reference.equal(a.rank(method="max"))
    assert reference.equal(a.rank(method="min"))
    assert reference.equal(a.rank(method="ordinal"))
def compare(self, other, *by, ignore_columns=None, max_changed=inf):
    """
    Find differences against another data frame.

    `by` are identifier columns which are used to uniquely identify
    rows and match them between `self` and `other`. `compare` will
    not work if your data lacks suitable identifiers.

    `ignore_columns` is an optional list of columns, differences in
    which to ignore.

    `compare` returns three data frames: added rows, removed rows
    and changed values. The first two are basically subsets of the
    rows of `self` and `other`, respectively. Changed values are
    returned as a data frame with one row per differing value (not
    per differing row). Listing changes will terminate once
    `max_changed` is reached.

    .. warning:: `compare` is experimental, do not rely on it
                 reporting all of the differences correctly. Do not
                 try to give it two huge data frames with very
                 little in common, unless also giving some sensible
                 value for `max_changed`.

    >>> old = di.read_csv("data/vehicles.csv")
    >>> new = old.modify(hwy=lambda x: np.minimum(100, x.hwy))
    >>> added, removed, changed = new.compare(old, "id")
    >>> changed
    """
    # Fix: ignore_columns previously used a mutable default argument
    # ([]); use a None sentinel instead (same observable behavior).
    if ignore_columns is None:
        ignore_columns = []
    if self.unique(*by).nrow < self.nrow:
        raise ValueError(f"self not unique by {by}")
    if other.unique(*by).nrow < other.nrow:
        raise ValueError(f"other not unique by {by}")
    # Rows present on only one side.
    added = self.anti_join(other, *by)
    removed = other.anti_join(self, *by)
    # Match common rows via the identifier columns, carrying along
    # each side's original row index as _i_/_j_.
    x = self.modify(_i_=range(self.nrow))
    y = other.modify(_j_=range(other.nrow))
    z = x.inner_join(y.select("_j_", *by), *by)
    colnames = util.unique_keys(self.colnames + other.colnames)
    colnames = [c for c in colnames if c not in ignore_columns]
    changed = []
    for i, j in zip(z._i_, z._j_):
        if len(changed) >= max_changed:
            print(f"max_changed={max_changed} reached, terminating")
            break
        for colname in colnames:
            if len(changed) >= max_changed:
                break
            # XXX: How to make a distinction between
            # a missing column and a missing value?
            xvalue = x[colname][i] if colname in x else None
            yvalue = y[colname][j] if colname in y else None
            # Two missing values compare unequal (NaN != NaN), so
            # skip pairs where both sides are missing.
            if (xvalue != yvalue and
                not Vector([xvalue, yvalue]).is_na().all()):
                # XXX: We could have a name clash here.
                byrow = {k: x[k][i] for k in by}
                changed.append(dict(**byrow,
                                    column=colname,
                                    xvalue=xvalue,
                                    yvalue=yvalue))
    added = added if added.nrow > 0 else None
    removed = removed if removed.nrow > 0 else None
    changed = self.from_json(changed) if changed else None
    return added, removed, changed
def filter_out(self, rows=None, **colname_value_pairs):
    """
    Return rows that don't match condition.

    Filtering can be done by either `rows` or `colname_value_pairs`.
    `rows` can be either a boolean vector or a function that receives
    the data frame as argument and returns a boolean vector. The
    latter is especially useful in a method chaining context where
    you don't have direct access to the data frame in question.
    Alternatively, `colname_value_pairs` provides a shorthand to
    check against a fixed value. See the example below of equivalent
    filtering all three ways.

    >>> data = di.read_csv("data/listings.csv")
    >>> data.filter_out(data.hood == "Manhattan")
    >>> data.filter_out(lambda x: x.hood == "Manhattan")
    >>> data.filter_out(hood="Manhattan")
    """
    # NOTE(review): the trailing `yield` makes this whole method a
    # generator of (colname, array) pairs, although the docstring and
    # examples suggest it should return a data frame — verify whether
    # a wrapping constructor was lost here.
    # NOTE(review): if neither `rows` nor `colname_value_pairs` is
    # given, `rows` stays None when passed to
    # _parse_rows_from_boolean below — presumably an error; confirm.
    if rows is not None:
        # Resolve a callable condition against this data frame.
        if callable(rows):
            rows = rows(self)
    elif colname_value_pairs:
        # Build a boolean mask by AND-ing one equality test
        # per given column-value pair.
        rows = Vector.fast([True], bool).repeat(self.nrow)
        for colname, value in colname_value_pairs.items():
            rows = rows & (self[colname] == value)
    # Convert the boolean mask to integer indices of matching rows,
    # which are then *deleted* from every column (filter-out).
    rows = self._parse_rows_from_boolean(rows)
    for colname, column in self.items():
        yield colname, np.delete(column, rows)
def test_is_string(self):
    # Only str elements should yield a string vector.
    cases = [(b"1", False), (True, False), (1, False), (1.1, False),
             ("a", True), (DATE, False), (DATETIME, False), (self, False)]
    for value, expected in cases:
        assert bool(Vector([value]).is_string()) is expected
def test_is_integer(self):
    # Only int elements should yield an integer vector
    # (note: bool is not considered integer here).
    cases = [(b"1", False), (True, False), (1, True), (1.1, False),
             ("a", False), (DATE, False), (DATETIME, False), (self, False)]
    for value, expected in cases:
        assert bool(Vector([value]).is_integer()) is expected
def test_is_datetime(self):
    # Both dates and datetimes should yield a datetime vector.
    cases = [(b"1", False), (True, False), (1, False), (1.1, False),
             ("a", False), (DATE, True), (DATETIME, True), (self, False)]
    for value, expected in cases:
        assert bool(Vector([value]).is_datetime()) is expected
def test_is_boolean(self):
    # Only bool elements should yield a boolean vector.
    cases = [(b"1", False), (True, True), (1, False), (1.1, False),
             ("a", False), (DATE, False), (DATETIME, False), (self, False)]
    for value, expected in cases:
        assert bool(Vector([value]).is_boolean()) is expected
def get_part(data, colname):
    # Use the column as-is when present; otherwise synthesize a
    # column of missing values matching the dtype of the first
    # reference data frame (from the enclosing scope) that has it.
    if colname in data:
        return data[colname]
    for ref in data_frames:
        if colname in ref:
            na_value = ref[colname].na_value
            na_dtype = ref[colname].na_dtype
            return Vector.fast([na_value], na_dtype).repeat(data.nrow)
def test___new___list_of_numpy(self):
    # NumPy scalar inputs should map to the corresponding
    # vector types.
    assert Vector([np.bool_(True)]).is_boolean()
    assert Vector([np.datetime64(DATE)]).is_datetime()
    assert Vector([np.datetime64(DATETIME)]).is_datetime()
    # Fix: np.float_ was removed in NumPy 2.0; np.float64 is the
    # identical type on both NumPy 1.x and 2.x.
    assert Vector([np.float64(0.5)]).is_float()
    assert Vector([np.int_(1)]).is_integer()
    assert Vector([np.object_(np)]).is_object()
    assert Vector([np.str_("")]).is_string()
def test_aggregate(self, function, input, output, use_numba):
    # Check one aggregation helper against its expected output,
    # with and without the Numba code path.
    if use_numba and not dataiter.USE_NUMBA:
        pytest.skip("No Numba")
    with patch("dataiter.USE_NUMBA", use_numba):
        data = DataFrame(g=GROUPS, a=input)
        stat = data.group_by("g").aggregate(a=function("a"))
        expected = Vector(output)
        try:
            assert stat.a.equal(expected)
        except AssertionError:
            # Dump the data for easier debugging, then re-raise.
            for item in ("", data, "Expected:", expected, "Got:", stat.a):
                print(item)
            raise
def test_quantile_nan(self):
    # An empty vector has no quantiles.
    empty_result = quantile(EMPTY_VECTOR, 0.5)
    assert np.isnan(empty_result)
    # With drop_na=False, any NaN poisons the result.
    nan_result = quantile(Vector([1, 4, NaN]), 0.5, drop_na=False)
    assert np.isnan(nan_result)
def _parse_rows_from_integer(self, rows):
    """Return `rows` coerced to a fast integer index vector."""
    indices = Vector.fast(rows, int)
    return indices
def _parse_rows_from_boolean(self, rows):
    """Convert a boolean row mask to a vector of integer indices."""
    mask = Vector.fast(rows, bool)
    # The mask must cover every row exactly once.
    if len(mask) != self.nrow:
        raise ValueError("Bad length for boolean rows")
    indices = np.nonzero(mask)[0]
    return Vector.fast(indices, int)
def _parse_cols_from_integer(self, cols):
    """Return `cols` coerced to a fast integer index vector."""
    indices = Vector.fast(cols, int)
    return indices
def _parse_cols_from_boolean(self, cols):
    """Convert a boolean column mask to a vector of integer indices."""
    mask = Vector.fast(cols, bool)
    # The mask must cover every column exactly once.
    if len(mask) != self.ncol:
        raise ValueError("Bad length for boolean cols")
    indices = np.nonzero(mask)[0]
    return Vector.fast(indices, int)
def test_quantile(self):
    # Median of an odd-length vector is its middle value.
    assert quantile(Vector([1, 4, 6, 8, 5]), 0.5) == 5
    # NaNs are dropped by default before computing the quantile.
    assert quantile(Vector([1, 4, 6, NaN, NaN]), 0.5) == 4
def test_std(self):
    # Population standard deviation (matches ddof=0 on these inputs);
    # NaNs are dropped by default.
    full = std(Vector([3, 6, 7]))
    assert np.isclose(full, 1.699673)
    with_nan = std(Vector([3, 6, NaN]))
    assert np.isclose(with_nan, 1.5)
def test_var_nan(self):
    # Variance is NaN for empty input, single-element input, and
    # for NaN-containing input when drop_na=False.
    results = (
        var(EMPTY_VECTOR),
        var(Vector([1])),
        var(Vector([3, 6, NaN]), drop_na=False),
    )
    for result in results:
        assert np.isnan(result)
def test_var(self):
    # Population variance (matches ddof=0 on these inputs);
    # NaNs are dropped by default.
    full = var(Vector([3, 6, 7]))
    assert np.isclose(full, 2.888889)
    with_nan = var(Vector([3, 6, NaN]))
    assert np.isclose(with_nan, 2.25)
def test_sum_nan(self):
    # With drop_na=False, any NaN poisons the sum.
    result = sum(Vector([1, 2, NaN]), drop_na=False)
    assert np.isnan(result)
def test_sum(self):
    # NaNs are dropped by default; an empty vector sums to zero.
    cases = [
        (EMPTY_VECTOR, 0),
        (Vector([1]), 1),
        (Vector([1, 2]), 3),
        (Vector([1, 2, NaN]), 3),
    ]
    for vector, expected in cases:
        assert sum(vector) == expected