import nvstrings


def test_from_strings():
    s1 = nvstrings.to_device(["dog and cat", None, "accénted", ""])
    got = nvstrings.from_strings(s1, s1)
    expected = [
        'dog and cat', None, 'accénted', '',
        'dog and cat', None, 'accénted', ''
    ]
    assert got.to_host() == expected
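# Minimal usage sketch (an assumption built only from the nvstrings calls
# already used above: to_device, from_strings, size, to_host). It shows that
# from_strings concatenates any number of device string arrays in argument
# order, preserving nulls.
a = nvstrings.to_device(["dog and cat", None])
b = nvstrings.to_device(["accénted", ""])
combined = nvstrings.from_strings(a, b)
print(combined.size())     # 4
print(combined.to_host())  # ['dog and cat', None, 'accénted', '']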
import logging

import nvstrings as nvs


def ngrams(strs, N=2, sep='_'):
    """Generate the n-grams of an nvstrings array.

    Parameters
    ----------
    strs : nvstrings
        The strings for this operation.
    N : int
        The degree of the n-gram (number of consecutive tokens).
        Default of 2 for bigrams.
    sep : str
        The separator to use between tokens within an n-gram.
        Default of '_'.

    Returns
    -------
    ngrams_object : nvstrings

    Examples
    --------
    >>> import nvstrings, nvtext
    >>> dstrings = nvstrings.to_device(['this is my', 'favorite book'])
    >>> print(nvtext.ngrams(dstrings, N=2, sep='_'))
    ['this_is', 'is_my', 'my_favorite', 'favorite_book']
    """
    logging.warning("ngrams functionality does not currently scale "
                    "well to large datasets.")

    # Tokenize each string and flatten the tokens into one nvstrings object
    tokens = strs.split_record()
    tokens_combined = nvs.from_strings(tokens)
    pad = nvs.to_device([''])
    ngram_object = tokens_combined
    total_num_of_tokens = tokens_combined.size()
    shifted_token_collection = []

    # Create shifted and padded nvstrings objects
    for i in range(N - 1):
        shifted_tokens = tokens_combined.remove_strings(
            list(range(0, i + 1))
        )
        shifted_tokens = shifted_tokens.add_strings(
            [pad] * (total_num_of_tokens - shifted_tokens.size())
        )
        shifted_token_collection.append(shifted_tokens)

    # Create the n-grams from the shifted nvstrings
    for sequence in shifted_token_collection:
        ngram_object = ngram_object.cat(sequence, sep)

    # Drop the trailing N - 1 entries, which were built from padding
    ngram_object = ngram_object.remove_strings(
        list(range(ngram_object.size() - N + 1, ngram_object.size()))
    )

    return ngram_object
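# Hedged usage sketch: by the documented behaviour above, N=3 should yield
# trigrams. The expected output is inferred from the bigram doctest, not a
# verified run.
# >>> dstrings = nvstrings.to_device(['this is my favorite book'])
# >>> print(nvtext.ngrams(dstrings, N=3, sep='_'))
# ['this_is_my', 'is_my_favorite', 'my_favorite_book']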
# NOTE: excerpt; assumes the surrounding cudf module provides pd, np, rmm,
# Buffer, Column, nvstrings, utils and _gdf at module level.
def _concat(cls, objs, dtype=None):
    from cudf.dataframe.string import StringColumn
    from cudf.dataframe.categorical import CategoricalColumn

    if len(objs) == 0:
        if pd.api.types.is_categorical_dtype(dtype):
            return CategoricalColumn(
                data=Column(Buffer.null(np.dtype('int8'))),
                null_count=0,
                ordered=False
            )
        elif dtype == np.dtype('object'):
            return StringColumn(data=nvstrings.to_device([]),
                                null_count=0)
        else:
            dtype = np.dtype(dtype)
            return Column(Buffer.null(dtype))

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        objs = [o._data for o in objs]
        return StringColumn(data=nvstrings.from_strings(*objs))

    # Handle categories for categoricals
    if all(isinstance(o, CategoricalColumn) for o in objs):
        new_cats = tuple(set([val for o in objs for val in o]))
        objs = [o.cat()._set_categories(new_cats) for o in objs]

    head = objs[0]
    for o in objs:
        if not o.is_type_equivalent(head):
            raise ValueError("All series must be of same type")

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    nulls = sum(o.null_count for o in objs)
    newsize = sum(map(len, objs))
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem, size=newsize)

    # Allocate the output mask only if there are nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = head.replace(data=data, mask=mask, null_count=nulls)

    # Perform the actual concatenation
    if newsize > 0:
        col = _gdf._column_concat(objs, col)
    return col
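# Illustration in plain NumPy (not the cudf/_gdf API) of why the mask buffer
# above is allocated only when `nulls` is non-zero: a validity mask is one
# bit per row, and concatenation must re-pack bits because the second
# column's bits rarely start on a byte boundary. All names below are
# illustrative, not cudf internals.
import numpy as np


def concat_validity(masks, lengths):
    """Concatenate per-column packed validity bitmasks into one bitmask."""
    valid = np.concatenate(
        [np.unpackbits(m, bitorder='little')[:n]
         for m, n in zip(masks, lengths)]
    )
    return np.packbits(valid, bitorder='little')


m1 = np.packbits(np.array([1, 0, 1], dtype=np.uint8), bitorder='little')
m2 = np.packbits(np.array([1, 1], dtype=np.uint8), bitorder='little')
# bits 1,0,1,1,1 packed little-endian: 1 + 4 + 8 + 16 = 29
print(concat_validity([m1, m2], [3, 2]))  # array([29], dtype=uint8)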
# NOTE: excerpt; assumes the surrounding cudf module provides pd, np,
# nvstrings, Buffer, utils, build_column, as_column, column_empty,
# is_categorical_dtype, CategoricalDtype, ColumnBase, libcudf and libcudfxx.
def _concat(cls, objs, dtype=None):
    from cudf.core.series import Series
    from cudf.core.column import (
        StringColumn,
        CategoricalColumn,
        NumericalColumn,
    )

    if len(objs) == 0:
        dtype = pd.api.types.pandas_dtype(dtype)
        if is_categorical_dtype(dtype):
            dtype = CategoricalDtype()
        return column_empty(0, dtype=dtype, masked=True)

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Note that we can always cast pure-null columns
    not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
    if len(not_null_cols) > 0 and (
        len(
            [
                o
                for o in not_null_cols
                if not isinstance(o, NumericalColumn)
                or np.issubdtype(o.dtype, np.datetime64)
            ]
        )
        == 0
    ):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns to the common dtype
        for i in range(len(objs)):
            objs[i] = objs[i].astype(common_dtype)

    # Find the first non-null column:
    head = objs[0]
    for i, obj in enumerate(objs):
        if len(obj) != obj.null_count:
            head = obj
            break

    for i, obj in enumerate(objs):
        # Check that all columns are the same type:
        if not pd.api.types.is_dtype_equal(objs[i].dtype, head.dtype):
            # if all null, cast to appropriate dtype
            if len(obj) == obj.null_count:
                from cudf.core.column import column_empty_like

                objs[i] = column_empty_like(
                    head, dtype=head.dtype, masked=True, newsize=len(obj)
                )

    # Handle categories for categoricals
    if all(isinstance(o, CategoricalColumn) for o in objs):
        cats = (
            Series(ColumnBase._concat([o.categories for o in objs]))
            .drop_duplicates()
            ._column
        )
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]

    head = objs[0]
    for obj in objs:
        if not (obj.dtype == head.dtype):
            raise ValueError("All series must be of same type")

    newsize = sum(map(len, objs))
    if newsize > libcudfxx.MAX_COLUMN_SIZE:
        raise MemoryError(
            "Result of concat cannot have "
            "size > {}".format(libcudfxx.MAX_COLUMN_SIZE_STR)
        )

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        result_nbytes = sum(o._nbytes for o in objs)
        if result_nbytes > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Result of concat cannot have > {} bytes".format(
                    libcudfxx.MAX_STRING_COLUMN_BYTES_STR
                )
            )
        objs = [o.nvstrings for o in objs]
        return as_column(nvstrings.from_strings(*objs))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    nulls = any(col.nullable for col in objs)

    if is_categorical_dtype(head):
        data_dtype = head.codes.dtype
        data = None
        children = (column_empty(newsize, dtype=head.codes.dtype),)
    else:
        data_dtype = head.dtype
        data = Buffer.empty(size=newsize * data_dtype.itemsize)
        children = ()

    # Allocate the output mask only if there are nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = build_column(
        data=data, dtype=head.dtype, mask=mask, children=children
    )

    # Perform the actual concatenation
    if newsize > 0:
        col = libcudf.concat._column_concat(objs, col)
    return col
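# Standalone illustration of the numeric-promotion step above: NumPy picks
# the smallest dtype that can hold every input column's values, so mixed
# numeric columns concatenate into one widened column.
import numpy as np

print(np.find_common_type([np.dtype('int32'), np.dtype('float64')], []))
# float64
print(np.find_common_type([np.dtype('int8'), np.dtype('int64')], []))
# int64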
import nvstrings

s1 = nvstrings.to_device(
    ["defghi", None, "jkl", "dog and cat", "accénted", ""]
)
print("s1", s1)
print("s1,s1,s1", nvstrings.from_strings(s1, s1, s1))

s2 = nvstrings.to_device(["aaa", None, "", "bbb"])
print("s2", s2)
print("s1.add_strings(s2)", s1.add_strings(s2))
print("s1.copy()", s1.copy())
# NOTE: excerpt; assumes the surrounding cudf module provides pd, np, rmm,
# nvstrings, Buffer, Column, utils, is_categorical_dtype and _column_concat.
def _concat(cls, objs, dtype=None):
    from cudf.dataframe.series import Series
    from cudf.dataframe.string import StringColumn
    from cudf.dataframe.categorical import CategoricalColumn
    from cudf.dataframe.numerical import NumericalColumn

    if len(objs) == 0:
        dtype = pd.api.types.pandas_dtype(dtype)
        if dtype.type in (np.object_, np.str_):
            return StringColumn(data=nvstrings.to_device([]),
                                null_count=0)
        elif is_categorical_dtype(dtype):
            return CategoricalColumn(
                data=Column(Buffer.null(np.dtype("int8"))),
                null_count=0,
                ordered=False,
            )
        else:
            return Column(Buffer.null(dtype))

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Note that we can always cast pure-null columns
    not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
    if len(not_null_cols) > 0 and (len([
            o for o in not_null_cols
            if not isinstance(o, NumericalColumn)
            or np.issubdtype(o.dtype, np.datetime64)
    ]) == 0):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns to the common dtype
        for i in range(len(objs)):
            objs[i] = objs[i].astype(common_dtype)

    # Find the first non-null column:
    head = objs[0]
    for i, obj in enumerate(objs):
        if len(obj) != obj.null_count:
            head = obj
            break

    for i, obj in enumerate(objs):
        # Check that all columns are the same type:
        if not objs[i].is_type_equivalent(head):
            # if all null, cast to appropriate dtype
            if len(obj) == obj.null_count:
                from cudf.dataframe.columnops import column_empty_like
                objs[i] = column_empty_like(head,
                                            dtype=head.dtype,
                                            masked=True,
                                            newsize=len(obj))

    # Handle categories for categoricals
    if all(isinstance(o, CategoricalColumn) for o in objs):
        cats = (Series(Column._concat([o.categories for o in objs]))
                .drop_duplicates()._column)
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]

    head = objs[0]
    for obj in objs:
        if not (obj.is_type_equivalent(head)):
            raise ValueError("All series must be of same type")

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        objs = [o._data for o in objs]
        return StringColumn(data=nvstrings.from_strings(*objs))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    nulls = sum(o.null_count for o in objs)
    newsize = sum(map(len, objs))
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem, size=newsize)

    # Allocate the output mask only if there are nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = head.replace(data=data, mask=mask, null_count=nulls)

    # Perform the actual concatenation
    if newsize > 0:
        col = _column_concat(objs, col)
    return col
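# Conceptual sketch in pandas of the category-union step above: before
# concatenating categorical columns, both sides are re-encoded against the
# deduplicated union of their categories so the underlying codes agree.
# This uses pandas' public union_categoricals rather than cudf internals.
import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Categorical(['x', 'y'])
b = pd.Categorical(['y', 'z'])
merged = union_categoricals([a, b])
print(list(merged))             # ['x', 'y', 'y', 'z']
print(list(merged.categories))  # ['x', 'y', 'z']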