Example #1
0
def test_from_strings():
    """Check that ``from_strings`` given the same object twice repeats it.

    The host list includes a null entry, a non-ASCII string and an empty
    string; the result copied back to the host must be that list followed
    by itself.
    """
    host_values = ["dog and cat", None, "accénted", ""]
    s1 = nvstrings.to_device(host_values)

    got = nvstrings.from_strings(s1, s1)

    # Two copies of the input, back to back.
    assert got.to_host() == host_values + host_values
Example #2
0
def ngrams(strs, N=2, sep='_'):
    """Generate the n-grams of an nvstrings array.

    Every string is split into tokens, the tokens of all strings are
    combined into a single sequence, and each run of ``N`` consecutive
    tokens is joined with ``sep``. Because the tokens are combined before
    the n-grams are formed, an n-gram can span two neighbouring input
    strings (see ``'my_favorite'`` in the example below).

    Parameters
    ----------
    strs : nvstrings
        The strings for this operation.
    N : int
        The degree of the n-gram (number of consecutive tokens). Default of 2
        for bigrams.
    sep : str
        The separator to use between the tokens within an n-gram.
        Default of '_'.

    Returns
    -------
    ngrams_object : nvstrings
        The generated n-grams. When there are fewer than ``N`` tokens in
        total, every candidate entry is removed from the result.

    Examples
    --------
    >>> import nvstrings, nvtext
    >>> dstrings = nvstrings.to_device(['this is my', 'favorite book'])
    >>> print(nvtext.ngrams(dstrings, N=2, sep='_'))
    ['this_is', 'is_my', 'my_favorite', 'favorite_book']
    """
    logging.warning("ngrams functionality does not currently scale "
                    "well to large datasets.")

    # Tokenize
    tokens = strs.split_record()
    tokens_combined = nvs.from_strings(tokens)
    total_num_of_tokens = tokens_combined.size()

    if total_num_of_tokens == 0:
        # No tokens means no n-grams; return early so that no out-of-range
        # index is ever handed to remove_strings() below.
        return tokens_combined

    pad = nvs.to_device([''])
    ngram_object = tokens_combined

    # For every extra position in the n-gram, build a copy of the token
    # sequence shifted left by that many entries (padded with '' at the end
    # so the sizes still match) and append it column-wise.
    for shift in range(1, N):
        # Never ask to drop more entries than exist (N > number of tokens).
        leading = min(shift, total_num_of_tokens)
        shifted_tokens = tokens_combined.remove_strings(list(range(leading)))
        shifted_tokens = shifted_tokens.add_strings(
            [pad] * (total_num_of_tokens - shifted_tokens.size())
        )
        ngram_object = ngram_object.cat(shifted_tokens, sep)

    # The last N - 1 entries were completed with padding rather than real
    # tokens, so they are dropped. Clamp the start at 0 so that no negative
    # index is produced when there are fewer tokens than N - 1.
    size = ngram_object.size()
    first_padded = max(size - N + 1, 0)
    ngram_object = ngram_object.remove_strings(
        list(range(first_padded, size))
    )
    return ngram_object
Example #3
0
    def _concat(cls, objs, dtype=None):
        """Concatenate the columns in ``objs`` into a single column.

        Parameters
        ----------
        objs : sequence of columns
            The columns to join, in order.
        dtype : optional
            Only consulted when ``objs`` is empty, to decide which kind of
            empty column to return.

        Returns
        -------
        A ``CategoricalColumn``, ``StringColumn`` or ``Column`` for the
        empty case, a ``StringColumn`` when every input is a
        ``StringColumn``, and otherwise a column derived from the first
        input via ``replace``.

        Raises
        ------
        ValueError
            If a (non-string) input is not type-equivalent to the first one.
        """
        from cudf.dataframe.string import StringColumn
        from cudf.dataframe.categorical import CategoricalColumn

        # Nothing to join: hand back an empty column selected by ``dtype``.
        if len(objs) == 0:
            if pd.api.types.is_categorical_dtype(dtype):
                empty_codes = Column(Buffer.null(np.dtype('int8')))
                return CategoricalColumn(data=empty_codes,
                                         null_count=0,
                                         ordered=False)
            if dtype == np.dtype('object'):
                return StringColumn(data=nvstrings.to_device([]), null_count=0)
            return Column(Buffer.null(np.dtype(dtype)))

        # String columns are joined through nvstrings directly.
        if all(isinstance(column, StringColumn) for column in objs):
            device_strings = [column._data for column in objs]
            return StringColumn(data=nvstrings.from_strings(*device_strings))

        # Categoricals are first re-coded against the union of their values.
        if all(isinstance(column, CategoricalColumn) for column in objs):
            all_values = [val for column in objs for val in column]
            new_cats = tuple(set(all_values))
            objs = [column.cat()._set_categories(new_cats) for column in objs]

        head = objs[0]
        if not all(column.is_type_equivalent(head) for column in objs):
            raise ValueError("All series must be of same type")

        # Zero-length inputs contribute nothing to the output.
        non_empty = [column for column in objs if len(column) > 0]
        nulls = sum(column.null_count for column in non_empty)
        newsize = sum(len(column) for column in non_empty)

        mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
        data = Buffer.from_empty(mem, size=newsize)

        # An output mask is only allocated when some input holds nulls.
        mask = Buffer(utils.make_mask(newsize)) if nulls else None

        result = head.replace(data=data, mask=mask, null_count=nulls)

        # Perform the actual concatenation.
        if newsize > 0:
            return _gdf._column_concat(non_empty, result)
        return result
Example #4
0
    def _concat(cls, objs, dtype=None):
        """Concatenate the columns in ``objs`` into a single column.

        Numerical (non-datetime) columns with differing dtypes are cast to a
        common dtype, all-null columns are re-created with the dtype of the
        first column that holds data, and categorical columns are re-coded
        against the de-duplicated concatenation of their categories before
        the data is joined.

        Parameters
        ----------
        objs : sequence of columns
            The columns to join, in order. The sequence itself is not
            modified.
        dtype : optional
            Only consulted when ``objs`` is empty, to choose the dtype of
            the empty column that is returned.

        Returns
        -------
        The concatenated column.

        Raises
        ------
        ValueError
            If the columns do not all end up with the same dtype.
        MemoryError
            If the result would exceed ``libcudfxx.MAX_COLUMN_SIZE`` rows
            or, for string columns,
            ``libcudfxx.MAX_STRING_COLUMN_BYTES`` bytes.
        """
        from cudf.core.series import Series
        from cudf.core.column import (
            StringColumn,
            CategoricalColumn,
            NumericalColumn,
        )

        if len(objs) == 0:
            dtype = pd.api.types.pandas_dtype(dtype)
            if is_categorical_dtype(dtype):
                dtype = CategoricalDtype()
            return column_empty(0, dtype=dtype, masked=True)

        # Work on a private list: entries are replaced by index below, which
        # would otherwise alter the caller's list (or fail on a tuple).
        objs = list(objs)

        # If all columns are `NumericalColumn` with different dtypes,
        # we cast them to a common dtype.
        # Notice, we can always cast pure null columns
        not_null_cols = [o for o in objs if len(o) != o.null_count]
        if not_null_cols and all(
            isinstance(o, NumericalColumn)
            and not np.issubdtype(o.dtype, np.datetime64)
            for o in not_null_cols
        ):
            col_dtypes = [o.dtype for o in not_null_cols]
            # Use NumPy to find a common dtype
            common_dtype = np.find_common_type(col_dtypes, [])
            # Cast all columns to the common dtype
            objs = [o.astype(common_dtype) for o in objs]

        # Find the first non-null column:
        head = objs[0]
        for obj in objs:
            if len(obj) != obj.null_count:
                head = obj
                break

        for i, obj in enumerate(objs):
            # Check that all columns are the same type:
            if not pd.api.types.is_dtype_equal(obj.dtype, head.dtype):
                # if all null, cast to appropriate dtype
                if len(obj) == obj.null_count:
                    from cudf.core.column import column_empty_like

                    objs[i] = column_empty_like(
                        head, dtype=head.dtype, masked=True, newsize=len(obj)
                    )

        # Handle categories for categoricals
        if all(isinstance(o, CategoricalColumn) for o in objs):
            cats = (
                Series(ColumnBase._concat([o.categories for o in objs]))
                .drop_duplicates()
                ._column
            )
            objs = [
                o.cat()._set_categories(cats, is_unique=True) for o in objs
            ]

        head = objs[0]
        for obj in objs:
            if not (obj.dtype == head.dtype):
                raise ValueError("All series must be of same type")

        newsize = sum(map(len, objs))
        if newsize > libcudfxx.MAX_COLUMN_SIZE:
            raise MemoryError(
                "Result of concat cannot have "
                "size > {}".format(libcudfxx.MAX_COLUMN_SIZE_STR)
            )

        # Handle strings separately
        if all(isinstance(o, StringColumn) for o in objs):
            result_nbytes = sum(o._nbytes for o in objs)
            if result_nbytes > libcudfxx.MAX_STRING_COLUMN_BYTES:
                raise MemoryError(
                    "Result of concat cannot have > {}  bytes".format(
                        libcudfxx.MAX_STRING_COLUMN_BYTES_STR
                    )
                )
            objs = [o.nvstrings for o in objs]
            return as_column(nvstrings.from_strings(*objs))

        # Filter out inputs that have 0 length
        objs = [o for o in objs if len(o) > 0]
        nulls = any(col.nullable for col in objs)

        if is_categorical_dtype(head):
            # Categoricals carry their codes as a child column and have no
            # data buffer of their own.
            data = None
            children = (column_empty(newsize, dtype=head.codes.dtype),)
        else:
            data = Buffer.empty(size=newsize * head.dtype.itemsize)
            children = ()

        # Allocate output mask only if there's nulls in the input objects
        mask = None
        if nulls:
            mask = Buffer(utils.make_mask(newsize))

        col = build_column(
            data=data, dtype=head.dtype, mask=mask, children=children
        )

        # Perform the actual concatenation
        if newsize > 0:
            col = libcudf.concat._column_concat(objs, col)

        return col
Example #5
0
#
import nvstrings

#
# A device strings object holding a null, a non-ASCII string and an empty
# string alongside ordinary entries.
s1 = nvstrings.to_device(
    ["defghi", None, "jkl", "dog and cat", "accénted", ""]
)
print("s1", s1)

# from_strings() called with the same object three times.
repeated = nvstrings.from_strings(s1, s1, s1)
print("s1,s1,s1", repeated)

s2 = nvstrings.to_device(["aaa", None, "", "bbb"])
print("s2", s2)

# add_strings() called on s1 with s2 as its argument.
appended = s1.add_strings(s2)
print("s1.add_strings(s2)", appended)

duplicate = s1.copy()
print("s1.copy()", duplicate)
Example #6
0
    def _concat(cls, objs, dtype=None):
        """Concatenate the columns in ``objs`` into a single column.

        Numerical (non-datetime) columns with differing dtypes are cast to a
        common dtype, all-null columns are re-created with the dtype of the
        first column that holds data, and categorical columns are re-coded
        against the de-duplicated concatenation of their categories before
        the data is joined.

        Parameters
        ----------
        objs : sequence of columns
            The columns to join, in order. The sequence itself is not
            modified.
        dtype : optional
            Only consulted when ``objs`` is empty, to choose which kind of
            empty column is returned.

        Returns
        -------
        A ``StringColumn`` when every input is a ``StringColumn``; otherwise
        a column derived from the first input via ``replace`` (or a new
        empty column when ``objs`` is empty).

        Raises
        ------
        ValueError
            If the columns are not all type-equivalent to the first one.
        """
        from cudf.dataframe.series import Series
        from cudf.dataframe.string import StringColumn
        from cudf.dataframe.categorical import CategoricalColumn
        from cudf.dataframe.numerical import NumericalColumn

        if len(objs) == 0:
            dtype = pd.api.types.pandas_dtype(dtype)
            if dtype.type in (np.object_, np.str_):
                return StringColumn(data=nvstrings.to_device([]), null_count=0)
            elif is_categorical_dtype(dtype):
                return CategoricalColumn(
                    data=Column(Buffer.null(np.dtype("int8"))),
                    null_count=0,
                    ordered=False,
                )
            else:
                return Column(Buffer.null(dtype))

        # Work on a private list: entries are replaced by index below, which
        # would otherwise alter the caller's list (or fail on a tuple).
        objs = list(objs)

        # If all columns are `NumericalColumn` with different dtypes,
        # we cast them to a common dtype.
        # Notice, we can always cast pure null columns
        not_null_cols = [o for o in objs if len(o) != o.null_count]
        if not_null_cols and all(
            isinstance(o, NumericalColumn)
            and not np.issubdtype(o.dtype, np.datetime64)
            for o in not_null_cols
        ):
            col_dtypes = [o.dtype for o in not_null_cols]
            # Use NumPy to find a common dtype
            common_dtype = np.find_common_type(col_dtypes, [])
            # Cast all columns to the common dtype
            objs = [o.astype(common_dtype) for o in objs]

        # Find the first non-null column:
        head = objs[0]
        for obj in objs:
            if len(obj) != obj.null_count:
                head = obj
                break

        for i, obj in enumerate(objs):
            # Check that all columns are the same type:
            if not obj.is_type_equivalent(head):
                # if all null, cast to appropriate dtype
                if len(obj) == obj.null_count:
                    from cudf.dataframe.columnops import column_empty_like

                    objs[i] = column_empty_like(head,
                                                dtype=head.dtype,
                                                masked=True,
                                                newsize=len(obj))

        # Handle categories for categoricals
        if all(isinstance(o, CategoricalColumn) for o in objs):
            cats = (Series(Column._concat([o.categories for o in objs
                                           ])).drop_duplicates()._column)
            objs = [
                o.cat()._set_categories(cats, is_unique=True) for o in objs
            ]

        head = objs[0]
        for obj in objs:
            if not (obj.is_type_equivalent(head)):
                raise ValueError("All series must be of same type")

        # Handle strings separately
        if all(isinstance(o, StringColumn) for o in objs):
            objs = [o._data for o in objs]
            return StringColumn(data=nvstrings.from_strings(*objs))

        # Filter out inputs that have 0 length
        objs = [o for o in objs if len(o) > 0]
        nulls = sum(o.null_count for o in objs)
        newsize = sum(map(len, objs))
        mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
        data = Buffer.from_empty(mem, size=newsize)

        # Allocate output mask only if there's nulls in the input objects
        mask = None
        if nulls:
            mask = Buffer(utils.make_mask(newsize))

        col = head.replace(data=data, mask=mask, null_count=nulls)

        # Perform the actual concatenation
        if newsize > 0:
            col = _column_concat(objs, col)

        return col