Ejemplo n.º 1
0
def test_merge_category():
    """Merge two categories and check the resulting keys and values.

    The expected keys list the first category's keys followed by the keys
    that only occur in the second category.
    """
    first_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    second_strings = nvstrings.to_device(
        ["ggg", "fff", "hhh", "aaa", "fff", "fff", "ggg", "hhh", "bbb"])
    first_cat = nvcategory.from_strings(first_strings)
    second_cat = nvcategory.from_strings(second_strings)

    merged = first_cat.merge_category(second_cat)

    assert_eq(
        merged.keys(),
        ["aaa", "ccc", "ddd", "eee", "bbb", "fff", "ggg", "hhh"])
    assert_eq(
        merged.values(),
        [3, 0, 3, 2, 1, 1, 1, 3, 0, 6, 5, 7, 0, 5, 5, 6, 7, 4])
Ejemplo n.º 2
0
def test_merge_and_remap():
    """Merge two categories with remapping and check keys and values.

    The expected keys are in sorted order and the expected values index
    into that sorted key list.
    """
    first_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    second_strings = nvstrings.to_device(
        ["ggg", "fff", "hhh", "aaa", "fff", "fff", "ggg", "hhh", "bbb"])
    first_cat = nvcategory.from_strings(first_strings)
    second_cat = nvcategory.from_strings(second_strings)

    merged = first_cat.merge_and_remap(second_cat)

    assert_eq(
        merged.keys(),
        ['aaa', 'bbb', 'ccc', 'ddd', 'eee', 'fff', 'ggg', 'hhh'])
    assert_eq(
        merged.values(),
        [4, 0, 4, 3, 2, 2, 2, 4, 0, 6, 5, 7, 0, 5, 5, 6, 7, 1])
Ejemplo n.º 3
0
def test_gather_index_exception(func):
    """Check that ``func(cat, indexes)`` raises for the given index list.

    ``func`` is supplied by the caller; presumably it comes from a pytest
    parametrization that is not visible here.
    """
    device_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(device_strings)
    bad_indexes = [0, 2, 0, 4]

    with pytest.raises(Exception):
        func(category, bad_indexes)
Ejemplo n.º 4
0
    def unique(self, method="sort"):
        """
        Return a StringColumn holding the keys of a category built from
        this column's data.

        The ``method`` argument is accepted but not used here.
        """
        import nvcategory as nvc

        category = nvc.from_strings(self.data)
        distinct_keys = category.keys()
        return StringColumn(distinct_keys)
Ejemplo n.º 5
0
 def transform(self, y: cudf.Series) -> cudf.Series:
     """Encode ``y`` against the fitted categories.

     A category is built from ``y``, re-keyed with the fitted keys, and
     its values are wrapped in a series. Any -1 in that series is
     replaced with 0 before returning.
     """
     self._check_is_fitted()
     str_series = _enforce_str(y)
     input_category = nvcategory.from_strings(str_series.data)
     rekeyed = input_category.set_keys(self._cats.keys())
     codes = cudf.Series(rekeyed.values())
     return codes.replace(-1, 0)
Ejemplo n.º 6
0
def test_value_for_index():
    """Check that position 7 of the sample strings has category value 3."""
    device_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(device_strings)

    assert category.value_for_index(7) == 3
Ejemplo n.º 7
0
def test_indexes_for_key():
    """Check that key 'ccc' is reported at positions 4, 5 and 6."""
    device_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(device_strings)

    positions = category.indexes_for_key('ccc')

    assert_eq(positions, [4, 5, 6])
Ejemplo n.º 8
0
def test_value():
    """Check that key 'ccc' maps to category value 1."""
    device_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(device_strings)

    assert category.value('ccc') == 1
Ejemplo n.º 9
0
    def fit_transform(self, y: cudf.Series) -> cudf.Series:
        """
        Fit the encoder on ``y`` and return its encoded values in one step.

        This is functionally equivalent to (but faster than)
        `LabelEncoder().fit(y).transform(y)`
        """
        self._dtype = y.dtype

        # Make sure we are working with a string series
        y = _enforce_str(y)

        # Building the category is the bottleneck, even though all of the
        # work happens on the device
        device_strings = _get_nvstring_from_series(y)
        has_strings = device_strings is not None

        # No strings available: fall back to an empty mapping
        self._cats = (
            nvcategory.from_strings(device_strings) if has_strings else {}
        )
        self._fitted = True

        codes = rmm.device_array(len(y), dtype=np.int32)
        if has_strings:
            # Have the category write its values into the device buffer
            self._cats.values(devptr=codes.device_ctypes_pointer.value)
        return cudf.Series(codes)
Ejemplo n.º 10
0
def test_gather_strings():
    """Check that gathering key indexes [0, 2, 0] yields aaa, ddd, aaa."""
    device_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(device_strings)

    gathered = category.gather_strings([0, 2, 0])

    assert_eq(gathered, ['aaa', 'ddd', 'aaa'])
Ejemplo n.º 11
0
    def transform(self, y: cudf.Series) -> cudf.Series:
        """
        Encode an input series using the categories learned during `fit`.

        Meant for inputs that are small compared to the dataset. To fit and
        encode a whole dataset at once, prefer `fit_transform`.

        Parameters
        ----------
        y : cudf.Series
            Keys to encode. Every value should be one of the categories
            given to `fit`

        Returns
        -------
        encoded : cudf.Series
            The ordinally encoded input series

        Raises
        ------
        KeyError
            if the input contains a category that was not seen in `fit`
        """
        self._check_is_fitted()
        y = _enforce_str(y)

        input_category = nvcategory.from_strings(_get_nvstring_from_series(y))
        rekeyed = input_category.set_keys(self._cats.keys())
        encoded = cudf.Series(rekeyed.values())

        # A -1 in the encoded values is treated as an unseen key
        has_unseen = encoded.isin([-1]).any()
        if has_unseen:
            raise KeyError("Attempted to encode unseen key")
        return encoded
Ejemplo n.º 12
0
    def fit(self, y: cudf.Series) -> "LabelEncoder":
        """
        Learn the set of categories from a series

        Parameters
        ----------
        y : cudf.Series
            Series containing the categories to be encoded. Its elements
            may or may not be unique

        Returns
        -------
        self : LabelEncoder
            The fitted encoder, returned to allow method chaining
        """
        self._dtype = y.dtype

        y = _enforce_str(y)
        device_strings = _get_nvstring_from_series(y)

        # No strings available: fall back to an empty mapping
        self._cats = (
            {} if device_strings is None
            else nvcategory.from_strings(device_strings)
        )

        self._fitted = True
        return self
Ejemplo n.º 13
0
def test_values():
    """Check the per-string category values of the sample strings."""
    device_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(device_strings)

    codes = category.values()

    assert_eq(codes, [3, 0, 3, 2, 1, 1, 1, 3, 0])
Ejemplo n.º 14
0
def test_remove_unused_keys():
    """Re-key a category, drop unused keys, and expect only 'b' and 'c'."""
    source_strings = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    new_keys = nvstrings.to_device(["b", "c", "e", "d"])
    category = nvcategory.from_strings(source_strings)

    rekeyed = category.set_keys(new_keys)
    pruned = rekeyed.remove_unused_keys()

    assert_eq(pruned.keys(), ['b', 'c'])
Ejemplo n.º 15
0
    def fit(self, y: cudf.Series) -> "LabelEncoder":
        """Build the category from ``y`` and return this encoder.

        Records the dtype of ``y``, converts it with ``_enforce_str``, and
        stores a category built from the converted series' ``data``.
        """
        self._dtype = y.dtype

        str_series = _enforce_str(y)
        self._cats = nvcategory.from_strings(str_series.data)

        self._fitted = True
        return self
Ejemplo n.º 16
0
def test_gather_and_remap():
    """Gather by key index with remapping and check keys and values.

    The expected result keeps only the gathered keys and numbers the
    values against that reduced key list.
    """
    source_strings = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    category = nvcategory.from_strings(source_strings)

    gathered = category.gather_and_remap([1, 3, 2, 3, 1, 2])

    assert_eq(gathered.keys(), ['b', 'c', 'f'])
    assert_eq(gathered.values(), [0, 2, 1, 2, 0, 1])
Ejemplo n.º 17
0
def test_gather():
    """Gather by key index and check keys and values.

    The expected result keeps all four original keys and uses the given
    indexes as the values.
    """
    source_strings = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    category = nvcategory.from_strings(source_strings)

    gathered = category.gather([1, 3, 2, 3, 1, 2])

    assert_eq(gathered.keys(), ["a", "b", "c", "f"])
    assert_eq(gathered.values(), [1, 3, 2, 3, 1, 2])
Ejemplo n.º 18
0
def test_add_strings():
    """Add the same strings to a category and check keys and values.

    The expected keys are unchanged and the expected values are the
    original values repeated twice.
    """
    device_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(device_strings)

    extended = category.add_strings(device_strings)

    assert_eq(extended.keys(), ['aaa', 'ccc', 'ddd', 'eee'])
    assert_eq(
        extended.values(),
        [3, 0, 3, 2, 1, 1, 1, 3, 0, 3, 0, 3, 2, 1, 1, 1, 3, 0])
Ejemplo n.º 19
0
def setups(self, to: TabularGPU):
    """Collect category keys per column and build a CategoryMap for each.

    ``self.lbls`` maps every name in ``to.all_cat_names`` to the keys of a
    category built from that column. ``self.classes`` maps the same names
    to a ``CategoryMap`` built from the host copy of those keys, with
    ``add_na`` set when the name is in ``to.cat_names``.
    """
    labels = {}
    for name in to.all_cat_names:
        column_strings = _to_str(to.iloc[:, name]).data
        labels[name] = nvcategory.from_strings(column_strings).keys()
    self.lbls = labels

    class_maps = {}
    for name, keys in self.lbls.items():
        host_keys = _remove_none(keys.to_host())
        class_maps[name] = CategoryMap(
            host_keys, add_na=(name in to.cat_names))
    self.classes = class_maps
Ejemplo n.º 20
0
def test_from_strings():
    """Build one category from two string sets and check keys and values."""
    first_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    second_strings = nvstrings.to_device(
        ["ggg", "fff", "hhh", "aaa", "fff", "fff", "ggg", "hhh", "bbb"])

    combined = nvcategory.from_strings(first_strings, second_strings)

    assert_eq(
        combined.keys(),
        ["aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg", "hhh"])
    assert_eq(
        combined.values(),
        [4, 0, 4, 3, 2, 2, 2, 4, 0, 6, 5, 7, 0, 5, 5, 6, 7, 1])
Ejemplo n.º 21
0
def test_remove_strings():
    """Remove 'ccc', 'aaa' and 'bbb' and check the remaining keys/values."""
    device_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(device_strings)
    to_remove = nvstrings.to_device(["ccc", "aaa", "bbb"])

    remaining = category.remove_strings(to_remove)

    assert_eq(remaining.keys(), ['ddd', 'eee'])
    assert_eq(remaining.values(), [1, 1, 0, 1])
Ejemplo n.º 22
0
    def fit_transform(self, y: cudf.Series) -> cudf.Series:
        """
        Fit the encoder on ``y`` and return its encoded values.

        Parameters
        ----------
        y : cudf.Series
            Series to fit on and encode. It is converted with
            ``_enforce_str`` before the category is built.

        Returns
        -------
        encoded : cudf.Series
            int32 series with one encoded value per input string
        """
        self._dtype = y.dtype

        # Convert y to nvstrings series, if it isn't one
        y = _enforce_str(y)

        # Bottleneck is here, despite everything being done on the device
        self._cats = nvcategory.from_strings(y.data)

        self._fitted = True

        # Allocate one int32 slot per input string. `cp.array(n)` would
        # instead create a 0-d array that merely holds the number n.
        arr = cp.empty(y.data.size(), dtype=np.int32)

        # CuPy exposes the raw device address as `ndarray.data.ptr`;
        # `device_ctypes_pointer` only exists on Numba/RMM device arrays.
        self._cats.values(devptr=arr.data.ptr)
        return cudf.Series(arr)
Ejemplo n.º 23
0
def test_set_keys():
    """Replace a category's keys and expect them back in sorted order."""
    source_strings = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    new_keys = nvstrings.to_device(["b", "c", "e", "d"])
    category = nvcategory.from_strings(source_strings)

    rekeyed = category.set_keys(new_keys)

    assert_eq(rekeyed.keys(), ['b', 'c', 'd', 'e'])
Ejemplo n.º 24
0
def test_size():
    """Check that a category reports the same size as its source strings."""
    device_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(device_strings)

    string_count = device_strings.size()
    category_count = category.size()
    assert string_count == category_count
Ejemplo n.º 25
0
def test_remove_keys():
    """Remove keys 'b' and 'd' and expect 'a', 'c', 'f' to remain."""
    source_strings = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    keys_to_drop = nvstrings.to_device(["b", "d"])
    category = nvcategory.from_strings(source_strings)

    trimmed = category.remove_keys(keys_to_drop)

    assert_eq(trimmed.keys(), ['a', 'c', 'f'])
Ejemplo n.º 26
0
def test_to_strings():
    """Check that to_strings() gives back the strings the category was built from."""
    device_strings = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(device_strings)

    round_tripped = category.to_strings()

    assert_eq(round_tripped, device_strings)
Ejemplo n.º 27
0
#
import nvstrings, nvcategory

# Create: build an nvstrings object from a Python list, then build a
# category from it. Each step prints the size and the object itself.
strs = nvstrings.to_device(["eee","aaa","eee","ddd","ccc","ccc","ccc","eee","aaa"])
print(strs.size(),strs)
cat = nvcategory.from_strings(strs)
print(cat.size(),cat)

# Print the results of the category's query methods.
print(".values():",cat.values())
print(".value_for_index(7)",cat.value_for_index(7))
print(".value(ccc):",cat.value('ccc'))
print(".indexes_for_key(ccc):",cat.indexes_for_key('ccc'))
print(".to_strings():",cat.to_strings())

# Add: build a second nvstrings object and add it to the category.
# Note that `strs` and `cat` are rebound here; the originals are dropped.
print("-------------------------")
print("add strings:")
strs = nvstrings.to_device(["ggg","fff","hhh","aaa","fff","fff","ggg","hhh","bbb"])
print(strs.size(),strs)
cat = cat.add_strings(strs)
print(cat.size(),cat.keys())

# Run the same queries against the extended category, this time using
# key 'aaa', and also print gather_strings for key indexes [0, 2, 0].
print(".values():",cat.values())
print(".value_for_index(7)",cat.value_for_index(7))
print(".value(aaa):",cat.value('aaa'))
print(".indexes_for_key(aaa):",cat.indexes_for_key('aaa'))
print(".to_strings():",cat.to_strings())
print(".gather_strings([0,2,0]):",cat.gather_strings([0,2,0]))
# remove
Ejemplo n.º 28
0
def test_keys_size():
    """Check that six strings with four distinct values yield four keys."""
    source_strings = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    category = nvcategory.from_strings(source_strings)

    key_count = category.keys_size()

    assert key_count == 4
Ejemplo n.º 29
0
 def nvcategory(self):
     """Return the cached category for this object, building it on first use.

     When ``self._nvcategory`` is None, a category is created from
     ``self.data`` and stored on the instance before being returned.
     """
     cached = self._nvcategory
     if cached is not None:
         return cached

     # Imported here rather than at module level, as in the original
     import nvcategory as nvc

     self._nvcategory = nvc.from_strings(self.data)
     return self._nvcategory
Ejemplo n.º 30
0
def test_keys():
    """Check that the category keys are the distinct strings in sorted order."""
    source_strings = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    category = nvcategory.from_strings(source_strings)

    keys = category.keys()

    assert_eq(keys, ['a', 'b', 'c', 'f'])