def test_fast_iterator(self):
    """Check that Vector.fast gives equal results for list, range, generator and map input."""
    from_list = Vector.fast(list(range(10)))
    from_range = Vector.fast(range(10))
    from_generator = Vector.fast(x for x in range(10))
    from_map = Vector.fast(map(lambda x: x, range(10)))
    # The list-based vector is the reference; every other source must match it.
    for candidate in (from_range, from_generator, from_map):
        assert from_list.equal(candidate)
def aggregate(self, **colname_function_pairs):
    """
    Return group-wise calculated summaries.

    Aggregation normally follows grouping and is conveniently chained as
    ``data.group_by(...).aggregate(...)``.

    Each value in `colname_function_pairs` is a function that receives a
    data frame holding the rows of a single group and should return a
    scalar. Shorthand helpers for common aggregations are available under
    :mod:`dataiter`, see the guide on :doc:`aggregation </aggregation>`
    for details.

    >>> data = di.read_csv("data/listings.csv")
    >>> # The below aggregations are identical. Usually you'll get by
    >>> # with the shorthand helpers, but for complicated calculations,
    >>> # you might need custom lambda functions.
    >>> data.group_by("hood").aggregate(n=di.count(), price=di.mean("price"))
    >>> data.group_by("hood").aggregate(n=lambda x: x.nrow, price=lambda x: x.price.mean())
    """
    def is_group_aware(function):
        return getattr(function, "group_aware", False)

    keys = self._group_colnames
    # Sort by the grouping columns so that each group is a contiguous run
    # of rows, then number the rows so that groups can be cut by position.
    ordered = self.sort(**dict.fromkeys(keys, 1))
    ordered._index_ = np.arange(ordered.nrow)
    stat = ordered.unique(*keys).select("_index_", *keys)
    # stat._index_ holds the first row of each group; splitting at all but
    # the first of those gives one array of row indices per group.
    indices = np.split(ordered._index_, stat._index_[1:])
    if any(map(is_group_aware, colname_function_pairs.values())):
        # Group-aware functions receive the whole sorted data frame, so
        # label every row with the ordinal number of its group.
        group_ids = Vector.fast(range(len(indices)), int)
        group_sizes = Vector.fast(map(len, indices), int)
        ordered._group_ = np.repeat(group_ids, group_sizes)
    views = None
    for colname, function in colname_function_pairs.items():
        if not is_group_aware(function):
            # When using an arbitrary function, we cannot know
            # what special values to expect and thus we end up
            # needing to use the slow Vector.__init__.
            if views is None:
                # Built lazily, only once some function needs per-group views.
                views = [ordered._view_rows(x) for x in indices]
            stat[colname] = [function(view) for view in views]
            continue
        # function might leave Nones in its output,
        # once those are replaced with the proper default
        # we can do a fast conversion to DataFrameColumn.
        column = function(ordered)
        fill = function.default
        for i, value in enumerate(column):
            if value is None:
                column[i] = fill
        assert len(column) == stat.nrow
        stat[colname] = DataFrameColumn.fast(column)
    return stat.unselect("_index_", "_group_")
def filter_out(self, rows=None, **colname_value_pairs):
    """
    Return rows that don't match condition.

    The condition is given either as `rows` or as `colname_value_pairs`.
    `rows` is a boolean vector, or a function that takes the data frame
    and returns a boolean vector; the function form is handy in method
    chains where the data frame has no name to refer to.
    `colname_value_pairs` is a shorthand for testing columns for equality
    against fixed values. The three calls below are equivalent.

    >>> data = di.read_csv("data/listings.csv")
    >>> data.filter_out(data.hood == "Manhattan")
    >>> data.filter_out(lambda x: x.hood == "Manhattan")
    >>> data.filter_out(hood="Manhattan")
    """
    if rows is None:
        if colname_value_pairs:
            # Start from all-true and narrow down with each equality test.
            rows = Vector.fast([True], bool).repeat(self.nrow)
            for name, value in colname_value_pairs.items():
                rows = rows & (self[name] == value)
    elif callable(rows):
        rows = rows(self)
    drop = self._parse_rows_from_boolean(rows)
    # Yield each column with the matching row positions deleted.
    for name, values in self.items():
        yield name, np.delete(values, drop)
def get_part(data, colname):
    """
    Return column `colname` of `data`, or a missing-value filler for it.

    If `data` lacks the column, the first data frame in `data_frames` that
    has it supplies the missing value and dtype, which are repeated to
    ``data.nrow`` elements. Returns None if no data frame has the column.
    """
    if colname in data:
        return data[colname]
    for ref in data_frames:
        if colname in ref:
            template = ref[colname]
            filler = Vector.fast([template.na_value], template.na_dtype)
            return filler.repeat(data.nrow)
    # Falls through with an implicit None when the column is found nowhere.
def _parse_rows_from_integer(self, rows):
    """Return `rows` converted to an integer vector with ``Vector.fast``."""
    as_integers = Vector.fast(rows, int)
    return as_integers
def _parse_rows_from_boolean(self, rows):
    """
    Return the integer positions of the true elements of boolean `rows`.

    Raises ValueError if `rows` does not have ``self.nrow`` elements.
    """
    mask = Vector.fast(rows, bool)
    if len(mask) == self.nrow:
        positions = np.nonzero(mask)[0]
        return Vector.fast(positions, int)
    raise ValueError("Bad length for boolean rows")
def _parse_cols_from_integer(self, cols):
    """Return `cols` converted to an integer vector with ``Vector.fast``."""
    as_integers = Vector.fast(cols, int)
    return as_integers
def _parse_cols_from_boolean(self, cols):
    """
    Return the integer positions of the true elements of boolean `cols`.

    Raises ValueError if `cols` does not have ``self.ncol`` elements.
    """
    mask = Vector.fast(cols, bool)
    if len(mask) == self.ncol:
        positions = np.nonzero(mask)[0]
        return Vector.fast(positions, int)
    raise ValueError("Bad length for boolean cols")
def to_string(self, *, max_rows=None, max_width=None):
    """
    Return the data as a string with geometries shown as placeholders.

    Each element of the ``geometry`` column is replaced with ``<TYPE>``,
    where ``TYPE`` is that element's ``"type"`` value, and the modified
    data is then formatted by ``DataFrame.to_string``. `max_rows` and
    `max_width` are forwarded unchanged.
    """
    placeholders = [f"<{x['type']}>" for x in self.geometry]
    data = self.modify(geometry=Vector.fast(placeholders, object))
    # Forward by keyword: both options are keyword-only in this signature,
    # and a positional call raises TypeError if DataFrame.to_string
    # declares them keyword-only as well.
    # NOTE(review): assumes DataFrame.to_string uses the same parameter
    # names (max_rows, max_width) -- confirm against its definition.
    return DataFrame.to_string(data, max_rows=max_rows, max_width=max_width)
def test_fast(self):
    """Check that Vector.fast with an int dtype matches the regular constructor."""
    quick = Vector.fast([1, 2, 3], int)
    regular = Vector([1, 2, 3], int)
    assert quick.is_integer()
    assert quick.equal(regular)