def __getitem__(self, arg):
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(
            arg,
            (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)):
        if len(arg) == 0:
            arg = Series(np.array([], dtype='int32'))
        else:
            arg = Series(arg)
    if isinstance(arg, Series):
        if arg.dtype in [np.bool, np.bool_]:
            return self._sr.iloc[arg]
        # To do this efficiently we need a solution to
        # https://github.com/rapidsai/cudf/issues/1087
        out = Series([], dtype=self._sr.dtype,
                     index=self._sr.index.__class__([]))
        for s in arg:
            out = out.append(self._sr.loc[s:s], ignore_index=False)
        return out
    elif is_single_value(arg):
        found_index = self._sr.index.find_label_range(arg, None)[0]
        return self._sr.iloc[found_index]
    elif isinstance(arg, slice):
        start_index, stop_index = self._sr.index.find_label_range(
            arg.start, arg.stop)
        return self._sr.iloc[start_index:stop_index:arg.step]
    else:
        raise NotImplementedError(
            ".loc not implemented for label type {}".format(
                type(arg).__name__))

def _loc_to_iloc(self, arg):
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(
        arg, (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)
    ):
        if len(arg) == 0:
            arg = Series(np.array([], dtype="int32"))
        else:
            arg = Series(arg)
    if isinstance(arg, Series):
        if arg.dtype in [np.bool, np.bool_]:
            return arg
        else:
            return indices_from_labels(self._sr, arg)
    elif is_scalar(arg):
        found_index = self._sr.index.find_label_range(arg, None)[0]
        return found_index
    elif isinstance(arg, slice):
        start_index, stop_index = self._sr.index.find_label_range(
            arg.start, arg.stop
        )
        return slice(start_index, stop_index, arg.step)
    else:
        raise NotImplementedError(
            ".loc not implemented for label type {}".format(
                type(arg).__name__
            )
        )

def _compute_validity_mask(self, index, row_tuple, max_length):
    """Computes the valid set of indices of values in the lookup."""
    from cudf import DataFrame
    from cudf import Series
    from cudf import concat
    from cudf.utils.cudautils import arange

    lookup = DataFrame()
    for idx, row in enumerate(row_tuple):
        if row == slice(None):
            continue
        lookup[index._source_data.columns[idx]] = Series(row)
    data_table = concat(
        [
            index._source_data,
            DataFrame({"idx": Series(arange(len(index._source_data)))}),
        ],
        axis=1,
    )
    result = lookup.merge(data_table)["idx"]
    # Avoid computing levels unless the result of the merge is empty,
    # which suggests that a KeyError should be raised.
    if len(result) == 0:
        for idx, row in enumerate(row_tuple):
            if row == slice(None):
                continue
            if row not in index.levels[idx]:
                raise KeyError(row)
    return result

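# A minimal host-side sketch of the merge-with-row-ids trick used in
# _compute_validity_mask above, written against pandas (an assumption,
# purely so the idea is runnable anywhere): attach a row-number column
# "idx" to the source data, inner-merge it with the lookup labels, and
# the surviving "idx" values are exactly the matching positions.
import pandas as pd

source = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "y", "x"]})
lookup = pd.DataFrame({"a": [1]})
data_table = source.assign(idx=range(len(source)))
print(lookup.merge(data_table)["idx"].tolist())  # [0, 1]
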
def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
    from cudf.utils.cudautils import arange
    from cudf import Series

    # Instructions for slicing:
    # - if tuple, get first and last elements of the tuple
    # - if open-beginning tuple, get 0 to highest valid_index
    # - if open-ending tuple, get highest valid_index to len()
    # - if neither end is open, get the range from the lowest
    #   beginning index to the highest ending index
    if isinstance(row_tuple, slice):
        if (
            isinstance(row_tuple.start, numbers.Number)
            or isinstance(row_tuple.stop, numbers.Number)
            or row_tuple == slice(None)
        ):
            stop = row_tuple.stop or max_length
            start, stop, step = row_tuple.indices(stop)
            return arange(start, stop, step)
        start_values = self._compute_validity_mask(
            index, row_tuple.start, max_length
        )
        stop_values = self._compute_validity_mask(
            index, row_tuple.stop, max_length
        )
        return Series(arange(start_values.min(), stop_values.max() + 1))
    elif isinstance(row_tuple, numbers.Number):
        return row_tuple
    return self._compute_validity_mask(index, row_tuple, max_length)

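# The numeric-slice branch above relies on Python's built-in
# slice.indices(), which fills in defaults and clamps out-of-range
# bounds. A quick illustration of that normalization:
print(slice(None).indices(5))           # (0, 5, 1) -> arange(0, 5, 1)
print(slice(2, 100).indices(5))         # (2, 5, 1), stop clamped to length
print(slice(None, None, 2).indices(5))  # (0, 5, 2)
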
def _get_column_major(self, df, row_tuple):
    from cudf import Series
    from cudf import DataFrame

    valid_indices = self._get_valid_indices_by_tuple(
        df.columns, row_tuple, len(df._cols)
    )
    result = df._take_columns(valid_indices)

    if isinstance(row_tuple, (numbers.Number, slice)):
        row_tuple = [row_tuple]
    if len(result) == 0 and len(result.columns) == 0:
        result_columns = df.columns.copy(deep=False)
        clear_codes = DataFrame()
        for name in df.columns.names:
            clear_codes[name] = Series([])
        result_columns._codes = clear_codes
        result_columns._source_data = clear_codes
        result.columns = result_columns
    elif len(row_tuple) < len(self.levels) and (
        slice(None) not in row_tuple
        and not isinstance(row_tuple[0], slice)
    ):
        columns = self._popn(len(row_tuple))
        result.columns = columns.take(valid_indices)
    else:
        result.columns = self.take(valid_indices)
    if len(result.columns.levels) == 1:
        columns = []
        for code in result.columns.codes[result.columns.codes.columns[0]]:
            columns.append(result.columns.levels[0][code])
        name = result.columns.names[0]
        result.columns = as_index(columns, name=name)
    return result

def _get_row_major(self, df, row_tuple):
    valid_indices = self._compute_validity_mask(df, row_tuple)
    from cudf import Series
    result = df.take(Series(valid_indices))

    # Build a new INDEX-based MultiIndex
    # ----------------------------------
    from cudf import DataFrame
    out_index = DataFrame()
    # Select the last n-k columns, where n is the number of source
    # levels and k is the length of the indexing tuple
    for k in range(len(row_tuple), len(df.index.levels)):
        out_index.add_column(
            df.index.names[k],
            df.index.codes[df.index.codes.columns[k]],
        )
    # If there's only one column remaining in the output index, convert
    # it into a StringIndex and name the final index values according
    # to the proper codes.
    if len(out_index.columns) == 1:
        out_index = []
        last = len(result.index.codes.columns) - 1
        for val in result.index.codes[result.index.codes.columns[last]]:
            out_index.append(result.index.levels[last][val])
        # TODO: Warning! The final index column could be arbitrarily
        # ordered integers, not strings, so we need to check for that
        # dtype and produce a GenericIndex instead of a StringIndex.
        out_index = StringIndex(out_index)
        out_index.name = result.index.names[len(result.index.names) - 1]
        result.index = out_index
    else:
        # Otherwise pop the leftmost levels, names, and codes from the
        # source index until it has the correct number of columns (n-k)
        if len(out_index.columns) > 0:
            result.reset_index(drop=True)
            result.index = result.index._popn(len(row_tuple))
    return result

def _apply_op(self, fn, other=None):
    from cudf.dataframe.series import Series

    idx_series = Series(self)
    op = getattr(idx_series, fn)
    if other is not None:
        return as_index(op(other))
    else:
        return as_index(op())

def normalize_chunks(self, size, chunks):
    from cudf.dataframe.series import Series

    if isinstance(chunks, six.integer_types):
        # *chunks* is the chunksize
        return cudautils.arange(0, size, chunks)
    else:
        # *chunks* is an array of chunk leading offsets
        chunks = Series(chunks)
        return chunks.to_gpu_array()

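# A rough host-side equivalent of the integer-chunksize branch above,
# using numpy in place of cudautils (an assumption, for illustration):
# a chunksize of 3 over 10 rows yields the leading offset of each chunk.
import numpy as np

size, chunksize = 10, 3
print(np.arange(0, size, chunksize))  # [0 3 6 9]
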
def _sortjoin(self, other, how='left', return_indexers=False):
    """Join with another column.

    When the column is an index, set *return_indexers* to obtain the
    indices for shuffling the remaining columns.
    """
    from cudf.dataframe.series import Series

    if not self.is_type_equivalent(other):
        raise TypeError('*other* is not compatible')

    lkey, largsort = self.sort_by_values(True)
    rkey, rargsort = other.sort_by_values(True)
    with _gdf.apply_join(
            [lkey], [rkey], how=how, method='sort') as (lidx, ridx):
        if lidx.size > 0:
            raw_index = cudautils.gather_joined_index(
                lkey.to_gpu_array(),
                rkey.to_gpu_array(),
                lidx,
                ridx,
            )
            buf_index = Buffer(raw_index)
        else:
            buf_index = Buffer.null(dtype=self.dtype)

        joined_index = lkey.replace(data=buf_index)

        if return_indexers:
            def gather(idxrange, idx):
                mask = (Series(idx) != -1).as_mask()
                return idxrange.take(idx).set_mask(mask).fillna(-1)

            if len(joined_index) > 0:
                indexers = (
                    gather(Series(largsort), lidx),
                    gather(Series(rargsort), ridx),
                )
            else:
                indexers = (
                    Series(Buffer.null(dtype=np.intp)),
                    Series(Buffer.null(dtype=np.intp)),
                )
            return joined_index, indexers
        else:
            return joined_index

def wrapper(*args, **kwargs):
    ret = getattr(self._parent._data, attr)(*args, **kwargs)
    if isinstance(ret, nvstrings.nvstrings):
        ret = Series(
            columnops.as_column(ret),
            index=self._index,
            name=self._parent.name,
        )
    return ret

def apply_multiindex_or_single_index(self, result):
    if len(result) == 0:
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(self._by) == 1 or len(final_result.columns) == 0:
            dtype = 'float64' if len(self._by) == 1 else 'object'
            name = self._by[0] if len(self._by) == 1 else None
            from cudf.dataframe.index import GenericIndex
            index = GenericIndex(Series([], dtype=dtype))
            index.name = name
            final_result.index = index
        else:
            mi = MultiIndex(source_data=result[self._by])
            mi.names = self._by
            final_result.index = mi
        if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
            final_series = Series([], name=final_result.columns[0])
            final_series.index = final_result.index
            return final_series
        return final_result
    if len(self._by) == 1:
        from cudf.dataframe import index
        idx = index.as_index(result[self._by[0]])
        idx.name = self._by[0]
        result = result.drop(idx.name)
        if idx.name == self._LEVEL_0_INDEX_NAME:
            idx.name = self._original_index_name
        result = result.set_index(idx)
        return result
    else:
        multi_index = MultiIndex(source_data=result[self._by])
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
            final_series = Series(final_result[final_result.columns[0]])
            final_series.name = final_result.columns[0]
            final_series.index = multi_index
            return final_series
        return final_result.set_index(multi_index)

def codes(self):
    from cudf.dataframe.series import Series

    data = self._parent.data
    if self._parent.has_null_mask:
        mask = self._parent.mask
        null_count = self._parent.null_count
        return Series.from_masked_array(
            data=data.mem, mask=mask.mem, null_count=null_count
        )
    else:
        return Series(data)

def _get_row_major(self, df, row_tuple):
    from cudf import Series

    valid_indices = self._get_valid_indices_by_tuple(
        df.index, row_tuple, len(df.index)
    )
    indices = Series(valid_indices)
    result = df.take(indices)
    final = self._index_and_downcast(result, result.index, row_tuple)
    return final

def lower(self):
    """
    Convert strings in the Series/Index to lowercase.

    Returns
    -------
    Series/Index of str dtype
        A copy of the object with all strings converted to lowercase.
    """
    from cudf.dataframe import Series

    return Series(self._parent.data.lower(), index=self._index)

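# A hedged usage sketch for lower(), assuming the method is exposed
# through the Series.str accessor as in pandas:
import cudf

sr = cudf.Series(["A", "Bb", "xyz"])
print(sr.str.lower())  # expected: ['a', 'bb', 'xyz']
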
def _group_dataframe(self, df, levels):
    """Group dataframe.

    The output dataframe has the same number of rows as the input
    dataframe. The rows are shuffled so that the groups are moved
    together in ascending order based on the multi-level index.

    Parameters
    ----------
    df : DataFrame
    levels : list[str]
        Column names for the multi-level index.

    Returns
    -------
    (df, segs) : namedtuple
        * df : DataFrame
            The grouped dataframe.
        * segs : Series
            Group starting indices.
    """
    if len(df) == 0:
        # Groupby on an empty dataframe
        return _dfsegs_pack(df=df, segs=Buffer(np.asarray([])))

    # Prepare dataframe
    orig_df = df.copy()
    df = df.loc[:, levels].reset_index(drop=True)
    df = df.to_frame() if isinstance(df, Series) else df
    rowid_column = '__cudf.groupby.rowid'
    df[rowid_column] = df.index.as_column()

    col_order = list(levels)

    # Perform grouping
    df, segs, markers = self._group_first_level(
        col_order[0], rowid_column, df
    )
    rowidcol = df[rowid_column]
    sorted_keys = [Series(df.index.as_column())]
    del df

    more_keys, reordering_indices, segs = self._group_inner_levels(
        col_order[1:], rowidcol, segs, markers=markers
    )
    sorted_keys.extend(more_keys)

    valcols = [k for k in orig_df.columns if k not in levels]
    # Prepare output
    # All key columns are already sorted
    out_df = DataFrame()
    for k, sr in zip(levels, sorted_keys):
        out_df[k] = sr
    # Shuffle the value columns
    self._group_shuffle(
        orig_df.loc[:, valcols], reordering_indices, out_df
    )
    return _dfsegs_pack(df=out_df, segs=segs)

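# What the (df, segs) pair means, sketched on host data (illustration
# only, no GPU involved): rows are shuffled so equal keys become
# contiguous, and segs records each group's starting row offset.
#
#   input keys : [2, 1, 2, 1]
#   output df  : rows reordered so the keys read [1, 1, 2, 2]
#   segs       : [0, 2]   (group "1" starts at row 0, group "2" at row 2)
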
def __init__(self, levels=None, codes=None, labels=None, names=None,
             **kwargs):
    self.names = names
    column_names = []
    if labels:
        warnings.warn("the 'labels' keyword is deprecated, use 'codes' "
                      "instead", FutureWarning)
    if labels and not codes:
        codes = labels

    # early termination enables lazy evaluation of codes
    if 'source_data' in kwargs:
        self._source_data = kwargs['source_data']
        self._codes = codes
        self._levels = levels
        self.names = self._source_data.columns
        return

    # name setup
    if isinstance(names, (Sequence,
                          pd.core.indexes.frozen.FrozenNDArray,
                          pd.core.indexes.frozen.FrozenList)):
        if sum(x is None for x in names) > 1:
            column_names = list(range(len(codes)))
        else:
            column_names = names
    elif names is None:
        column_names = list(range(len(codes)))
    else:
        column_names = names

    if len(levels) == 0:
        raise ValueError('Must pass non-zero number of levels/codes')

    import cudf
    if not isinstance(codes, cudf.dataframe.dataframe.DataFrame) and \
            not isinstance(codes[0],
                           (Sequence,
                            pd.core.indexes.frozen.FrozenNDArray)):
        raise TypeError('Codes is not a Sequence of sequences')

    if not isinstance(codes, cudf.dataframe.dataframe.DataFrame):
        self._codes = cudf.dataframe.dataframe.DataFrame()
        for idx, code in enumerate(codes):
            code = np.array(code)
            self._codes.add_column(column_names[idx],
                                   columnops.as_column(code))
    else:
        self._codes = codes

    # Converting levels to a numpy array produces a Float64Index
    # (on empty levels), mimicking the behavior of Pandas.
    self._levels = np.array(
        [Series(level).to_array() for level in levels]
    )
    self._validate_levels_and_codes(self._levels, self._codes)
    self.name = None
    self.names = names

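# A hedged construction sketch for the constructor above (assuming
# MultiIndex is importable from the cudf top level): two levels, with
# integer codes addressing into them. Passing the deprecated `labels`
# keyword instead of `codes` would warn and be treated as `codes`.
import cudf

mi = cudf.MultiIndex(
    levels=[['a', 'b'], [1, 2]],
    codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
    names=['letter', 'number'],
)
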
def _hashjoin(self, other, how='left', return_indexers=False):
    from cudf.dataframe.series import Series

    if not self.is_type_equivalent(other):
        raise TypeError('*other* is not compatible')

    with _gdf.apply_join(
            [self], [other], how=how, method='hash') as (lidx, ridx):
        if lidx.size > 0:
            raw_index = cudautils.gather_joined_index(
                self.to_gpu_array(),
                other.to_gpu_array(),
                lidx,
                ridx,
            )
            buf_index = Buffer(raw_index)
        else:
            buf_index = Buffer.null(dtype=self.dtype)

        joined_index = self.replace(data=buf_index)

        if return_indexers:
            def gather(idxrange, idx):
                mask = (Series(idx) != -1).as_mask()
                return idxrange.take(idx).set_mask(mask).fillna(-1)

            if len(joined_index) > 0:
                indexers = (
                    gather(Series(range(0, len(self))), lidx),
                    gather(Series(range(0, len(other))), ridx),
                )
            else:
                indexers = (
                    Series(Buffer.null(dtype=np.intp)),
                    Series(Buffer.null(dtype=np.intp)),
                )
            return joined_index, indexers
        else:
            return joined_index

def _group_inner_levels(self, columns, rowidcol, segs, markers):
    """Group the second and onwards levels.

    Parameters
    ----------
    columns : sequence[str]
        Group keys. The order is important.
    rowidcol : Series
        The special column holding the original rowids. It's internally
        used to determine the shuffling order.
    segs : Series
        First level group begin offsets.
    markers
        Segment markers carried over from the first grouping level.

    Returns
    -------
    (sorted_keys, reordering_indices, segments)
        - sorted_keys : list[Series]
            List of sorted key columns. Column order is the same as
            the *columns* argument.
        - reordering_indices : device array
            The indices to gather on to shuffle the dataframe into the
            grouped sequence.
        - segments : Series
            Group begin offsets.
    """
    dsegs = segs.astype(dtype=np.int32).data.mem
    sorted_keys = []
    plan_cache = {}
    for col in columns:
        # Shuffle the key column according to the previous groups
        srkeys = self._df[col].take(rowidcol.to_gpu_array(),
                                    ignore_index=True)
        # Segmented sort on the key
        shuf = Column(Buffer(cudautils.arange(len(srkeys))))

        cache_key = (len(srkeys), srkeys.dtype, shuf.dtype)
        plan = plan_cache.get(cache_key)
        plan = apply_segsort(srkeys._column, shuf, dsegs, plan=plan)
        plan_cache[cache_key] = plan

        sorted_keys.append(srkeys)  # keep sorted key columns
        # Determine segments
        dsegs, markers = cudautils.find_segments(srkeys.to_gpu_array(),
                                                 dsegs, markers=markers)
        # Shuffle
        rowidcol = rowidcol.take(shuf.to_gpu_array(), ignore_index=True)

    reordering_indices = rowidcol.to_gpu_array()
    return sorted_keys, reordering_indices, Series(dsegs)

def _to_frame(self):
    from cudf import DataFrame

    # For each column of codes, replace the column with a mapping from
    # integers to levels
    df = self.codes.copy(deep=False)
    for idx, column in enumerate(df.columns):
        # Use merge as a replace function
        level = DataFrame({
            'idx': Series(cudautils.arange(len(self.levels[idx]),
                                           dtype=df[column].dtype)),
            'level': self.levels[idx],
        })
        code = DataFrame({'idx': df[column]})
        df[column] = code.merge(level).level
    return df

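# A minimal pandas sketch (an assumption; the real code runs on cudf) of
# the merge-as-replace trick above: join a codes column against an
# (idx, level) mapping table to decode integer codes into level values.
import pandas as pd

codes = pd.DataFrame({'idx': [0, 1, 0]})
mapping = pd.DataFrame({'idx': [0, 1], 'level': ['a', 'b']})
print(codes.merge(mapping)['level'].tolist())  # ['a', 'b', 'a']
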
def apply(self, method):
    gpu_out = numba.cuda.device_array_like(self.gpu_in)
    kernel = get_ewm_kernel(method)
    kernel[(self.number_of_blocks,),
           (self.number_of_threads,),
           0,
           self.shared_buffer_size * 8](self.gpu_in,
                                        gpu_out,
                                        self.window,
                                        self.span,
                                        self.array_len,
                                        self.thread_tile,
                                        self.min_periods)
    return Series(gpu_out)

def _compute_levels_and_codes(self):
    levels = []
    from cudf import DataFrame
    codes = DataFrame()
    names = []
    # Note: This is an O(N^2) solution using gpu masking
    # to compute new codes for the MultiIndex. There may be
    # a faster solution that could be executed on gpu at the same
    # time the groupby is calculated.
    for by in self._source_data.columns:
        if len(self._source_data[by]) > 0:
            level = self._source_data[by].unique()
            replaced = self._source_data[by].replace(
                level, Series(range(len(level)))
            )
        else:
            level = np.array([])
            replaced = np.array([])
        levels.append(level)
        codes[by] = Series(replaced, dtype="int32")
        names.append(by)

    self._levels = levels
    self._codes = codes
    self.names = names

def cat(self, others=None, sep=None, na_rep=None):
    """
    Concatenate strings in the Series/Index with the given separator.

    If *others* is specified, this function concatenates the
    Series/Index and elements of *others* element-wise. If *others* is
    not passed, then all values in the Series/Index are concatenated
    into a single string with the given *sep*.

    Parameters
    ----------
    others : Series or list of str
        Strings to be appended. The number of strings must match size()
        of this instance. This must be either a Series of string dtype
        or a Python list of strings.
    sep : str
        If specified, this separator will be appended to each string
        before appending the others.
    na_rep : str
        This character will take the place of any null strings (not
        empty strings) in either list.

        - If `na_rep` is None, and `others` is None, missing values in
          the Series/Index are omitted from the result.
        - If `na_rep` is None, and `others` is not None, a row
          containing a missing value in any of the columns (before
          concatenation) will have a missing value in the result.

    Returns
    -------
    concat : str or Series/Index of str dtype
        If `others` is None, `str` is returned, otherwise a
        `Series/Index` (same type as caller) of str dtype is returned.
    """
    from cudf.dataframe import Series, Index

    if isinstance(others, (Series, Index)):
        assert others.dtype == np.dtype('object')
        others = others.data
    out = Series(
        self._parent.data.cat(others=others, sep=sep, na_rep=na_rep),
        index=self._index,
    )
    if len(out) == 1 and others is None:
        out = out[0]
    return out

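# A hedged usage sketch for cat(), assuming the method is exposed
# through the Series.str accessor as in pandas:
import cudf

sr = cudf.Series(['a', 'b'])
print(sr.str.cat(sep=','))              # no others: one string, 'a,b'
print(sr.str.cat(['1', '2'], sep='-'))  # element-wise: ['a-1', 'b-2']
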
def __getattr__(self, attr, *args, **kwargs):
    from cudf.dataframe.series import Series

    if hasattr(self._parent._data, attr):
        passed_attr = getattr(self._parent._data, attr)
        if callable(passed_attr):
            def wrapper(*args, **kwargs):
                ret = getattr(self._parent._data, attr)(*args, **kwargs)
                # Wrap raw nvstrings results back into a Series
                if isinstance(ret, nvstrings.nvstrings):
                    ret = Series(columnops.as_column(ret),
                                 index=self._index)
                return ret
            return wrapper
        else:
            return passed_attr
    else:
        raise AttributeError(attr)

def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function
    deep copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a
        PyCapsule object.

    Returns
    -------
    A cuDF DataFrame or Series depending on if the input DLPack tensor
    is 1D or 2D.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        if str(err) == "b'GDF_DATASET_EMPTY'":
            raise ValueError(
                "Cannot create a cuDF Object from a DLPack tensor of "
                "0 size"
            )
        else:
            raise err
    cols = []
    for idx in range(len(valids)):
        mask = None
        if valids[idx]:
            mask = Buffer(valids[idx])
        cols.append(
            columnops.build_column(
                Buffer(res[idx]), dtype=res[idx].dtype, mask=mask
            )
        )
    if len(cols) == 1:
        return Series(cols[0])
    else:
        df = DataFrame()
        for idx, col in enumerate(cols):
            df[idx] = col
        return df

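# A hedged usage sketch for from_dlpack(). Any library that can export
# a DLPack PyCapsule works as a producer; CuPy and its legacy
# toDlpack() API are used here as an assumption, for illustration only.
import cupy
import cudf

capsule = cupy.arange(5).toDlpack()  # 1D tensor -> expect a Series
sr = cudf.from_dlpack(capsule)
print(sr)
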
def extract(self, pat, flags=0, expand=True):
    """
    Extract capture groups in the regex `pat` as columns in a DataFrame.

    For each subject string in the Series, extract groups from the
    first match of regular expression `pat`.

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    expand : bool, default True
        If True, return a DataFrame with one column per capture group.
        If False, return a Series/Index if there is one capture group
        or a DataFrame if there are multiple capture groups.

    Returns
    -------
    DataFrame or Series/Index
        A DataFrame with one row for each subject string, and one
        column for each group. If `expand=False` and `pat` has only one
        capture group, then return a Series/Index.

    Notes
    -----
    The `flags` parameter is not yet supported and will raise a
    NotImplementedError if anything other than the default value is
    passed.
    """
    if flags != 0:
        raise NotImplementedError("`flags` parameter is not yet "
                                  "supported")

    from cudf.dataframe import DataFrame, Series

    out = self._parent.data.extract(pat)
    if len(out) == 1 and expand is False:
        return Series(out[0], index=self._index)
    else:
        out_df = DataFrame(index=self._index)
        for idx, val in enumerate(out):
            out_df[idx] = val
        return out_df

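# A hedged usage sketch for extract(), assuming the method is exposed
# through the Series.str accessor as in pandas:
import cudf

sr = cudf.Series(['a1', 'b2', 'c3'])
print(sr.str.extract(r'([ab])(\d)'))          # 2 groups -> DataFrame
print(sr.str.extract(r'(\d)', expand=False))  # 1 group  -> Series
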
def searchsorted(self, value, side="left"):
    """Find indices where elements should be inserted to maintain order.

    Parameters
    ----------
    value : Column
        Column of values to search for.
    side : str {'left', 'right'}, optional
        If 'left', the index of the first suitable location found is
        given. If 'right', return the last such index.

    Returns
    -------
    An index series of insertion points with the same shape as value.
    """
    from cudf.dataframe.series import Series

    idx_series = Series(self, name=self.name)
    result = idx_series.searchsorted(value, side)
    return as_index(result)

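# A hedged usage sketch for searchsorted(), assuming a sorted input
# (which is what makes insertion points meaningful) and that the values
# to search for are passed as a Series, per the "Column" note above:
import cudf

sr = cudf.Series([1, 3, 5, 7])
print(sr.searchsorted(cudf.Series([4, 6]), side='left'))  # [2, 3]
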
def len(self):
    """
    Computes the length of each element in the Series/Index.

    Returns
    -------
    Series or Index of int
        A Series or Index of integer values indicating the length of
        each element in the Series or Index.
    """
    from cudf.dataframe.series import Series

    out_dev_arr = rmm.device_array(len(self._parent), dtype='int32')
    ptr = get_ctype_ptr(out_dev_arr)
    self._parent.data.len(ptr)

    mask = None
    if self._parent.null_count > 0:
        mask = self._parent.mask

    column = columnops.build_column(Buffer(out_dev_arr),
                                    np.dtype('int32'), mask=mask)
    return Series(column, index=self._index)

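# A hedged usage sketch for len(), assuming the method is exposed
# through the Series.str accessor as in pandas:
import cudf

sr = cudf.Series(['a', 'abc', ''])
print(sr.str.len())  # expected: [1, 3, 0]
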
def _group_first_level(self, col, rowid_column, df):
    """Group the first level *col* of *df*.

    Parameters
    ----------
    col : str
        Name of the first group key column.
    df : DataFrame
        The dataframe being grouped.

    Returns
    -------
    (df, segs, markers)
        - df : DataFrame
            Sorted by *col*, which becomes the index.
        - segs : Series
            Group begin offsets.
        - markers
            Segment markers for the sorted index.
    """
    df = df.loc[:, [col, rowid_column]]
    df = df.set_index(col).sort_index()
    segs, markers = df.index._find_segments()
    return df, Series(segs), markers

def read_csv_strings(filepath_or_buffer, lineterminator='\n',
                     quotechar='"', quoting=True, doublequote=True,
                     sep=',', delimiter=None, delim_whitespace=False,
                     skipinitialspace=False, names=None, dtype=None,
                     skipfooter=0, skiprows=0, dayfirst=False,
                     compression='infer', thousands=None, decimal='.',
                     true_values=None, false_values=None, nrows=None):
    """
    **Experimental**: This function exists only as a beta way to use
    `nvstrings <https://nvstrings.readthedocs.io/en/latest/>`_ with
    cudf. Future versions of cuDF will provide cleaner integration.

    Uses mostly the same arguments as read_csv. Note: doesn't currently
    support auto-column detection, or the header, usecols and
    mangle_dupe_cols args.

    Returns
    -------
    columns : ordered list of cudf.dataframe.Series and nvstrings
        objects. Numeric or date dtyped columns will be Series; 'str'
        dtyped columns will be
        `nvstrings <https://nvstrings.readthedocs.io/en/latest/>`_.

    Examples
    --------
    .. code-block:: python

        import cudf

        # Create a test csv file
        filename = 'foo.csv'
        lines = [
            "num1,datetime,text",
            "123,2018-11-13T12:00:00,abc",
            "456,2018-11-14T12:35:01,def",
            "789,2018-11-15T18:02:59,ghi"
        ]
        with open(filename, 'w') as fp:
            fp.write('\\n'.join(lines)+'\\n')

        # Read the file with cudf
        names = ['num1', 'datetime', 'text']
        dtypes = ['int', 'date', 'str']
        columns = cudf.io.csv.read_csv_strings(filename, delimiter=',',
                                               names=names,
                                               dtype=dtypes, skiprows=1)
        # Display results
        print(columns[0])
        print(columns[2])

    Output:

    .. code-block:: python

        <cudf.Series nrows=3 >
        0 123
        1 456
        2 789

        <nvstrings count=3>
        ['abc', 'def', 'ghi']

    See Also
    --------
    .read_csv
    """
    import nvstrings
    from cudf.dataframe.series import Series

    if names is None or dtype is None:
        msg = '''Automatic dtype detection not implemented:
        Column names and dtypes must be specified.'''
        raise TypeError(msg)

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if isinstance(dtype, dict):
        dtype_dict = True
    elif isinstance(dtype, list):
        dtype_dict = False
        if len(dtype) != len(names):
            msg = '''All column dtypes must be specified.'''
            raise TypeError(msg)
    else:
        msg = '''dtype must be 'list' or 'dict' '''
        raise TypeError(msg)

    csv_reader = ffi.new('csv_read_arg*')

    # Populate csv_reader struct
    if is_file_like(filepath_or_buffer):
        buffer = filepath_or_buffer.read()
        # check if StringIO is used
        if hasattr(buffer, 'encode'):
            buffer_as_bytes = buffer.encode()
        else:
            buffer_as_bytes = buffer
        buffer_data_holder = ffi.new("char[]", buffer_as_bytes)

        csv_reader.input_data_form = libgdf.HOST_BUFFER
        csv_reader.filepath_or_buffer = buffer_data_holder
        csv_reader.buffer_size = len(buffer_as_bytes)
    else:
        file_path = _wrap_string(filepath_or_buffer)

        csv_reader.input_data_form = libgdf.FILE_PATH
        csv_reader.filepath_or_buffer = file_path

    arr_names = []
    arr_dtypes = []
    for col_name in names:
        arr_names.append(_wrap_string(col_name))
        if dtype_dict:
            arr_dtypes.append(_wrap_string(str(dtype[col_name])))
    names_ptr = ffi.new('char*[]', arr_names)
    csv_reader.names = names_ptr

    if not dtype_dict:
        for col_dtype in dtype:
            arr_dtypes.append(_wrap_string(str(col_dtype)))
    dtype_ptr = ffi.new('char*[]', arr_dtypes)
    csv_reader.dtype = dtype_ptr

    if decimal == delimiter:
        raise ValueError("decimal cannot be the same as delimiter")

    if thousands == delimiter:
        raise ValueError("thousands cannot be the same as delimiter")

    if nrows is not None and skipfooter != 0:
        raise ValueError("cannot use both nrows and skipfooter "
                         "parameters")

    # Start with default values recognized as boolean
    arr_true_values = [_wrap_string(str('True')),
                       _wrap_string(str('TRUE'))]
    arr_false_values = [_wrap_string(str('False')),
                        _wrap_string(str('FALSE'))]

    for value in true_values or []:
        arr_true_values.append(_wrap_string(str(value)))
    arr_true_values_ptr = ffi.new('char*[]', arr_true_values)
    csv_reader.true_values = arr_true_values_ptr
    csv_reader.num_true_values = len(arr_true_values)

    for value in false_values or []:
        arr_false_values.append(_wrap_string(str(value)))
    false_values_ptr = ffi.new('char*[]', arr_false_values)
    csv_reader.false_values = false_values_ptr
    csv_reader.num_false_values = len(arr_false_values)

    compression_bytes = _wrap_string(compression)

    csv_reader.delimiter = delimiter.encode()
    csv_reader.lineterminator = lineterminator.encode()
    csv_reader.quotechar = quotechar.encode()
    csv_reader.quoting = quoting
    csv_reader.doublequote = doublequote
    csv_reader.delim_whitespace = delim_whitespace
    csv_reader.skipinitialspace = skipinitialspace
    csv_reader.dayfirst = dayfirst
    csv_reader.num_cols = len(names)
    csv_reader.skiprows = skiprows
    csv_reader.skipfooter = skipfooter
    csv_reader.compression = compression_bytes
    csv_reader.decimal = decimal.encode()
    csv_reader.thousands = thousands.encode() if thousands else b'\0'
    csv_reader.nrows = nrows if nrows is not None else -1

    # Call read_csv
    libgdf.read_csv(csv_reader)

    out = csv_reader.data
    if out == ffi.NULL:
        raise ValueError("Failed to parse CSV")

    # Extract parsed columns
    outcols = []
    for i in range(csv_reader.num_cols_out):
        if out[i].dtype == libgdf.GDF_STRING:
            ptr = int(ffi.cast("uintptr_t", out[i].data))
            outcols.append(nvstrings.bind_cpointer(ptr))
        else:
            newcol = Column.from_cffi_view(out[i])
            if newcol.dtype == np.dtype('datetime64[ms]'):
                col = newcol.view(DatetimeColumn,
                                  dtype='datetime64[ms]')
            else:
                col = newcol.view(NumericalColumn, dtype=newcol.dtype)
            outcols.append(Series(col))

    return outcols

def cat(self, others=None, sep=None, na_rep=None):
    """
    Concatenate strings in the Series/Index with the given separator.

    If *others* is specified, this function concatenates the
    Series/Index and elements of *others* element-wise. If *others* is
    not passed, then all values in the Series/Index are concatenated
    into a single string with the given *sep*.

    Parameters
    ----------
    others : Series or list of str
        Strings to be appended. The number of strings must match size()
        of this instance. This must be either a Series of string dtype
        or a Python list of strings.
    sep : str
        If specified, this separator will be appended to each string
        before appending the others.
    na_rep : str
        This character will take the place of any null strings (not
        empty strings) in either list.

        - If `na_rep` is None, and `others` is None, missing values in
          the Series/Index are omitted from the result.
        - If `na_rep` is None, and `others` is not None, a row
          containing a missing value in any of the columns (before
          concatenation) will have a missing value in the result.

    Returns
    -------
    concat : str or Series/Index of str dtype
        If `others` is None, `str` is returned, otherwise a
        `Series/Index` (same type as caller) of str dtype is returned.
    """
    from cudf.dataframe import Series, Index

    if isinstance(others, (Series, Index)):
        # If others is just another Series/Index, go ahead with
        # concatenation.
        assert others.dtype == np.dtype('object')
        others = others.data
    elif utils.is_list_like(others) and others:
        # If others is a list-like object (in our case lists and
        # tuples), go ahead with concatenation. We check only the first
        # element to decide which case we are in; iterating over the
        # entire list could be expensive for very large inputs, so this
        # is just a sanity check.
        first = others[0]
        if utils.is_list_like(first) or \
                isinstance(first, (Series, Index, pd.Series, pd.Index)):
            # The inner elements of the others list are themselves
            # list-like, not plain strings/bytes.
            first = None
            for frame in others:
                if not isinstance(frame, (Series, Index)):
                    # Make sure all inputs to the .cat call are of type
                    # nvstrings, so create a Series object.
                    frame = Series(frame, dtype='str')
                if first is None:
                    # Extract the nvstrings pointer, since `frame` is
                    # of type Series/Index and `first` isn't yet
                    # initialized.
                    first = frame.data
                else:
                    assert frame.dtype == np.dtype('object')
                    frame = frame.data
                    first = first.cat(frame, sep=sep, na_rep=na_rep)
            others = first
        elif not utils.is_list_like(first):
            # The first element is not list-like, so treat the whole
            # input as a flat list of strings.
            others = Series(others)
            others = others.data
    elif isinstance(others, (pd.Series, pd.Index)):
        others = Series(others)
        others = others.data

    out = Series(
        self._parent.data.cat(others=others, sep=sep, na_rep=na_rep),
        index=self._index,
    )
    if len(out) == 1 and others is None:
        out = out[0]
    return out