Example #1
0
 def test_count_unique(self):
     # By default NaN counts as its own unique value;
     # drop_na=True excludes it from the count.
     cases = [(EMPTY_VECTOR, 0),
              (Vector([1]), 1),
              (Vector([1, 1]), 1),
              (Vector([1, 1, 2]), 2),
              (Vector([1, 1, 2, NaN]), 3)]
     for vector, expected in cases:
         assert count_unique(vector) == expected
     assert count_unique(Vector([1, 1, 2, NaN]), drop_na=True) == 2
Example #2
0
 def test___new___na_datetime(self):
     # All missing value markers (NaN, NaT, None) should coerce
     # to NaT and the result should be an np.datetime64 vector.
     result = Vector([DATETIME, NaN, NaT, None])
     expected = Vector([DATETIME, NaT, NaT, NaT])
     assert result.is_datetime()
     assert result.equal(expected)
Example #3
0
 def test___new___na_integer(self):
     # Missing values force an upcast from int to float,
     # with NaN as the missing value marker.
     result = Vector([1, 2, NaN, None])
     expected = Vector([1, 2, NaN, NaN])
     assert result.is_float()
     assert result.equal(expected)
Example #4
0
 def test_as_string_length(self):
     # With the default length, assignments are truncated to one character.
     narrow = Vector([""]).as_string()
     narrow[0] = "hello"
     assert narrow[0] == "h"
     # An explicit length allows longer strings to fit.
     wide = Vector([""]).as_string(5)
     wide[0] = "hello"
     assert wide[0] == "hello"
Example #5
0
 def test___new___na_boolean(self):
     # Missing values force an upcast from bool to object,
     # with None as the missing value marker.
     result = Vector([True, False, NaN, None])
     expected = Vector([True, False, None, None])
     assert result.is_object()
     assert result.equal(expected)
Example #6
0
 def test_fast_iterator(self):
     # Vector.fast should accept any iterable, not only sequences.
     reference = Vector.fast(list(range(10)))
     for iterable in (range(10),
                      (x for x in range(10)),
                      map(lambda x: x, range(10))):
         assert reference.equal(Vector.fast(iterable))
Example #7
0
 def __new__(cls, object, dtype=None, nrow=None):
     # Build a plain Vector first, then view it as an instance of cls.
     object = cls._sequencify(object)
     column = Vector(object, dtype)
     if nrow is not None and nrow != column.length:
         # Only a length-one column can be broadcast, and only
         # to a positive number of rows.
         if column.length == 1 and nrow >= 1:
             column = column.repeat(nrow)
         else:
             raise ValueError("Bad arguments for broadcast")
     return column.view(cls)
Example #8
0
 def test___new___na_object(self):
     # With an explicit object dtype, NaN and None both become None,
     # while blank strings are kept as-is.
     result = Vector(["a", "b", "", NaN, None], object)
     assert result.is_object()
     for i, value in enumerate(["a", "b", "", None, None]):
         if value is None:
             assert result[i] is None
         else:
             assert result[i] == value
Example #9
0
    def aggregate(self, **colname_function_pairs):
        """
        Return group-wise calculated summaries.

        Usually aggregation is preceded by grouping, which can be conveniently
        written via method chaining as ``data.group_by(...).aggregate(...)``.

        In `colname_function_pairs`, `function` receives as an argument a data
        frame object, a group-wise subset of all rows. It should return a
        scalar value. Common aggregation functions have shorthand helpers
        available under :mod:`dataiter`, see the guide on :doc:`aggregation
        </aggregation>` for details.

        >>> data = di.read_csv("data/listings.csv")
        >>> # The below aggregations are identical. Usually you'll get by
        >>> # with the shorthand helpers, but for complicated calculations,
        >>> # you might need custom lambda functions.
        >>> data.group_by("hood").aggregate(n=di.count(), price=di.mean("price"))
        >>> data.group_by("hood").aggregate(n=lambda x: x.nrow, price=lambda x: x.price.mean())
        """
        group_colnames = self._group_colnames
        # Sort by the group columns so members of each group are consecutive.
        data = self.sort(**dict.fromkeys(group_colnames, 1))
        data._index_ = np.arange(data.nrow)
        # One row per group; its _index_ is the first row of that group
        # in the sorted data, giving the split points below.
        stat = data.unique(*group_colnames).select("_index_", *group_colnames)
        # indices[k] holds the row numbers of data that belong to group k.
        indices = np.split(data._index_, stat._index_[1:])
        group_aware = [getattr(x, "group_aware", False) for x in colname_function_pairs.values()]
        if any(group_aware):
            # Group-aware functions receive the whole sorted frame at once,
            # so tag each row with its group number via a _group_ column.
            groups = Vector.fast(range(len(indices)), int)
            n = Vector.fast(map(len, indices), int)
            data._group_ = np.repeat(groups, n)
        # Lazily created per-group row views for non-group-aware functions.
        slices = None
        for colname, function in colname_function_pairs.items():
            if getattr(function, "group_aware", False):
                # function might leave Nones in its output,
                # once those are replaced with the proper default
                # we can do a fast conversion to DataFrameColumn.
                column = function(data)
                default = function.default
                for i in range(len(column)):
                    if column[i] is None:
                        column[i] = default
                assert len(column) == stat.nrow
                column = DataFrameColumn.fast(column)
                stat[colname] = column
            else:
                # When using an arbitrary function, we cannot know
                # what special values to expect and thus we end up
                # needing to use the slow Vector.__init__.
                if slices is None:
                    slices = [data._view_rows(x) for x in indices]
                stat[colname] = [function(x) for x in slices]
        # Drop the internal helper columns before returning.
        return stat.unselect("_index_", "_group_")
Example #10
0
 def test_rank_without_ties(self):
     # Without ties, all methods should give the same result.
     # Comparing "min" against itself would be a vacuous assertion
     # (the original did so, likely a copy-paste slip), so only the
     # other methods are checked against the "min" reference.
     a = Vector([NaN, 3, 2, 4, 5, 1])
     reference = a.rank(method="min")
     assert reference.equal(a.rank(method="average"))
     assert reference.equal(a.rank(method="max"))
     assert reference.equal(a.rank(method="ordinal"))
Example #11
0
    def compare(self, other, *by, ignore_columns=(), max_changed=inf):
        """
        Find differences against another data frame.

        `by` are identifier columns which are used to uniquely identify rows
        and match them between `self` and `other`. `compare` will not work if
        your data lacks suitable identifiers. `ignore_columns` is an optional
        list of columns, differences in which to ignore.

        `compare` returns three data frames: added rows, removed rows and
        changed values. The first two are basically subsets of the rows of
        `self` and `other`, respectively. Changed values are returned as a data
        frame with one row per differing value (not per differing row). Listing
        changes will terminate once `max_changed` is reached.

        .. warning:: `compare` is experimental, do not rely on it reporting all
                     of the differences correctly. Do not try to give it two
                     huge data frames with very little in common, unless also
                     giving some sensible value for `max_changed`.

        >>> old = di.read_csv("data/vehicles.csv")
        >>> new = old.modify(hwy=lambda x: np.minimum(100, x.hwy))
        >>> added, removed, changed = new.compare(old, "id")
        >>> changed
        """
        # NOTE: ignore_columns defaults to an immutable tuple to avoid the
        # shared mutable default argument pitfall; it is only used for
        # membership tests, so any container of column names works.
        if self.unique(*by).nrow < self.nrow:
            raise ValueError(f"self not unique by {by}")
        if other.unique(*by).nrow < other.nrow:
            raise ValueError(f"other not unique by {by}")
        added = self.anti_join(other, *by)
        removed = other.anti_join(self, *by)
        # _i_ and _j_ carry the original row numbers through the join,
        # so matched rows can be looked up in both frames.
        x = self.modify(_i_=range(self.nrow))
        y = other.modify(_j_=range(other.nrow))
        z = x.inner_join(y.select("_j_", *by), *by)
        colnames = util.unique_keys(self.colnames + other.colnames)
        colnames = [x for x in colnames if x not in ignore_columns]
        changed = []
        for i, j in zip(z._i_, z._j_):
            if len(changed) >= max_changed:
                print(f"max_changed={max_changed} reached, terminating")
                break
            for colname in colnames:
                if len(changed) >= max_changed: break
                # XXX: How to make a distinction between
                # a missing column and a missing value?
                xvalue = x[colname][i] if colname in x else None
                yvalue = y[colname][j] if colname in y else None
                # Two missing values compare unequal (NaN != NaN),
                # but should not be reported as a change.
                if (xvalue != yvalue and
                    not Vector([xvalue, yvalue]).is_na().all()):
                    # XXX: We could have a name clash here.
                    byrow = {k: x[k][i] for k in by}
                    changed.append(dict(**byrow,
                                        column=colname,
                                        xvalue=xvalue,
                                        yvalue=yvalue))

        added = added if added.nrow > 0 else None
        removed = removed if removed.nrow > 0 else None
        changed = self.from_json(changed) if changed else None
        return added, removed, changed
Example #12
0
    def filter_out(self, rows=None, **colname_value_pairs):
        """
        Return rows that don't match condition.

        The condition is given either as `rows` or `colname_value_pairs`.
        `rows` is a boolean vector, or a function that takes the data frame
        as its argument and returns a boolean vector — handy in method
        chaining, where the data frame itself is not directly accessible.
        As a shorthand, `colname_value_pairs` compares columns against fixed
        values. The example below shows the same filtering done all three
        ways.

        >>> data = di.read_csv("data/listings.csv")
        >>> data.filter_out(data.hood == "Manhattan")
        >>> data.filter_out(lambda x: x.hood == "Manhattan")
        >>> data.filter_out(hood="Manhattan")
        """
        if rows is None:
            if colname_value_pairs:
                # AND together equality tests against each given column.
                rows = Vector.fast([True], bool).repeat(self.nrow)
                for colname, value in colname_value_pairs.items():
                    rows = rows & (self[colname] == value)
        elif callable(rows):
            rows = rows(self)
        drop = self._parse_rows_from_boolean(rows)
        for colname, column in self.items():
            yield colname, np.delete(column, drop)
Example #13
0
 def test_is_string(self):
     # Only str elements should yield a string vector.
     for value in [b"1", True, 1, 1.1, DATE, DATETIME, self]:
         assert not Vector([value]).is_string()
     assert Vector(["a"]).is_string()
Example #14
0
 def test_is_integer(self):
     # Only int elements should yield an integer vector;
     # note that bool is deliberately not counted as int here.
     for value in [b"1", True, 1.1, "a", DATE, DATETIME, self]:
         assert not Vector([value]).is_integer()
     assert Vector([1]).is_integer()
Example #15
0
 def test_is_datetime(self):
     # Both dates and datetimes should yield a datetime vector.
     for value in [b"1", True, 1, 1.1, "a", self]:
         assert not Vector([value]).is_datetime()
     for value in [DATE, DATETIME]:
         assert Vector([value]).is_datetime()
Example #16
0
 def test_is_boolean(self):
     # Only bool elements should yield a boolean vector.
     for value in [b"1", 1, 1.1, "a", DATE, DATETIME, self]:
         assert not Vector([value]).is_boolean()
     assert Vector([True]).is_boolean()
Example #17
0
 def get_part(data, colname):
     # Return data's own column when present; otherwise build a
     # column of missing values typed after a reference frame's column.
     if colname in data:
         return data[colname]
     for ref in data_frames:
         if colname in ref:
             na = ref[colname].na_value
             dtype = ref[colname].na_dtype
             return Vector.fast([na], dtype).repeat(data.nrow)
Example #18
0
 def test___new___list_of_numpy(self):
     # Vectors built from NumPy scalar objects should map to the
     # corresponding dataiter types.
     assert Vector([np.bool_(True)]).is_boolean()
     assert Vector([np.datetime64(DATE)]).is_datetime()
     assert Vector([np.datetime64(DATETIME)]).is_datetime()
     # np.float_ was removed in NumPy 2.0; np.float64 is the same type.
     assert Vector([np.float64(0.5)]).is_float()
     assert Vector([np.int_(1)]).is_integer()
     assert Vector([np.object_(np)]).is_object()
     assert Vector([np.str_("")]).is_string()
Example #19
0
 def test_aggregate(self, function, input, output, use_numba):
     if use_numba and not dataiter.USE_NUMBA:
         pytest.skip("No Numba")
     with patch("dataiter.USE_NUMBA", use_numba):
         data = DataFrame(g=GROUPS, a=input)
         stat = data.group_by("g").aggregate(a=function("a"))
         expected = Vector(output)
         if not stat.a.equal(expected):
             # Print context to help debug the failure.
             print("")
             print(data)
             print("Expected:")
             print(expected)
             print("Got:")
             print(stat.a)
         assert stat.a.equal(expected)
Example #20
0
 def test_quantile_nan(self):
     # Quantile of an empty vector, or of one with NaNs kept, is NaN.
     for value in (quantile(EMPTY_VECTOR, 0.5),
                   quantile(Vector([1, 4, NaN]), 0.5, drop_na=False)):
         assert np.isnan(value)
Example #21
0
 def _parse_rows_from_integer(self, rows):
     # Coerce the given row indices to a fast int vector.
     indices = Vector.fast(rows, int)
     return indices
Example #22
0
 def _parse_rows_from_boolean(self, rows):
     # Convert a full-length boolean mask to integer row indices.
     mask = Vector.fast(rows, bool)
     if len(mask) != self.nrow:
         raise ValueError("Bad length for boolean rows")
     selected = np.nonzero(mask)[0]
     return Vector.fast(selected, int)
Example #23
0
 def _parse_cols_from_integer(self, cols):
     # Coerce the given column indices to a fast int vector.
     indices = Vector.fast(cols, int)
     return indices
Example #24
0
 def _parse_cols_from_boolean(self, cols):
     # Convert a full-length boolean mask to integer column indices.
     mask = Vector.fast(cols, bool)
     if len(mask) != self.ncol:
         raise ValueError("Bad length for boolean cols")
     selected = np.nonzero(mask)[0]
     return Vector.fast(selected, int)
Example #25
0
 def test_quantile(self):
     # NaN values should be dropped by default.
     median = quantile(Vector([1, 4, 6, 8, 5]), 0.5)
     assert median == 5
     median = quantile(Vector([1, 4, 6, NaN, NaN]), 0.5)
     assert median == 4
Example #26
0
 def test_std(self):
     # NaN values should be dropped by default.
     result = std(Vector([3, 6, 7]))
     assert np.isclose(result, 1.699673)
     result = std(Vector([3, 6, NaN]))
     assert np.isclose(result, 1.5)
Example #27
0
 def test_var_nan(self):
     # Variance needs at least two values; with drop_na=False
     # any NaN in the input makes the result NaN.
     for value in (var(EMPTY_VECTOR),
                   var(Vector([1])),
                   var(Vector([3, 6, NaN]), drop_na=False)):
         assert np.isnan(value)
Example #28
0
 def test_var(self):
     # NaN values should be dropped by default.
     result = var(Vector([3, 6, 7]))
     assert np.isclose(result, 2.888889)
     result = var(Vector([3, 6, NaN]))
     assert np.isclose(result, 2.25)
Example #29
0
 def test_sum_nan(self):
     # With drop_na=False, any NaN in the input makes the sum NaN.
     result = sum(Vector([1, 2, NaN]), drop_na=False)
     assert np.isnan(result)
Example #30
0
 def test_sum(self):
     # NaNs are dropped by default; an empty vector sums to zero.
     cases = [(EMPTY_VECTOR, 0),
              (Vector([1]), 1),
              (Vector([1, 2]), 3),
              (Vector([1, 2, NaN]), 3)]
     for vector, expected in cases:
         assert sum(vector) == expected