def sort_index( self, axis=0, level=None, ascending=True, inplace=False, kind="quicksort", na_position="last", sort_remaining=True, ignore_index: bool = False, ): axis = self._get_axis_number(axis) if axis not in (0, ): raise err._unsupported_error("axis", axis) nlevels = self._raw_index.nlevels if nlevels == 1: # Pandas ignores level and sort_remaining for single-level indices, levels = [0] if level is None else util.to_list_if_scalar(level) # and it casts ascending to a boolean value... ascending = [bool(ascending)] else: if level is None: levels = list(range(nlevels)) # When level is None, Pandas crops the ascending list # to match its length to the number of levels... ascending = self._get_ascending(ascending, nlevels)[:nlevels] else: levels = util.to_list_if_scalar(level) levels = [ self._raw_index._get_level_number(lvl) for lvl in levels ] default_asc = bool(ascending) ascending = self._get_ascending(ascending, len(levels)) if len(ascending) != len(levels): raise ValueError( "level must have same length as ascending") # XXX: Pandas ignores sort_remaining for multi-level indices # (GH #24247), and always sorts the levels monotonically # before the actual sorting... # Here we do the right thing and hopefully Pandas fixes # its bug in the future. if sort_remaining: already_added = set(levels) for lvl in range(nlevels): if lvl not in already_added: levels.append(lvl) ascending.append(default_asc) new_frame = self._frame.sort_index( axis=axis, levels=levels, ascending=ascending, kind=kind, na_position=na_position, ignore_index=ignore_index, ) return self._create_or_update_frame(new_frame, inplace)
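# Usage sketch for the method above. It follows stock pandas semantics, so the
# illustration below runs against pandas itself; note that the sort_remaining
# handling described in the comment above is the "right thing", which stock
# pandas may not honor for multi-level indices (GH #24247).
import pandas as pd

_df = pd.DataFrame(
    {"x": range(4)},
    index=pd.MultiIndex.from_tuples(
        [("b", 2), ("a", 1), ("b", 1), ("a", 2)], names=["outer", "inner"]
    ),
)
# Sort by the "inner" level only, in descending order; the remaining level is
# appended as a secondary key when sort_remaining=True.
print(_df.sort_index(level="inner", ascending=False, sort_remaining=True))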
def set_index(
    self, keys, drop=True, append=False, inplace=False, verify_integrity=False,
):
    if inplace not in (
        True,
        False,
    ):
        raise err._invalid_value_error("inplace", inplace)

    keys = util.to_list_if_scalar(keys)
    keys = [
        Series(key) if not isinstance(key, (str, Series)) else key
        for key in keys
    ]

    frame = self._frame
    columns = self.columns

    missing = []
    to_drop = []
    to_set = []
    names = []

    if append:
        to_set = util.to_list_if_scalar(self._raw_index.column)
        names = util.to_list_if_scalar(self._raw_index.name)

    for key in keys:
        if not isinstance(key, Series):
            if key in columns:
                idxr = columns.get_indexer_for([key])
                to_drop.extend(idxr)
                to_set.extend(self._frame.select_columns(idxr))
                names.append(key)
            else:
                missing.append(key)
        else:
            new_len = len(key)
            old_len = len(self)
            if new_len != old_len:
                raise ValueError(
                    f"Length mismatch: Expected {old_len} rows, "
                    f"received array of length {new_len}")
            to_set.append(key._frame._columns[0])
            names.append(key.name)

    if missing:
        raise KeyError(f"None of {missing} are in the columns")

    if drop:
        columns = columns.delete(to_drop)
        frame = frame.drop_columns(to_drop)

    frame = frame.set_index(to_set, names)
    # Honor 'inplace' the same way the other mutating methods do
    return self._create_or_update_frame(frame, inplace, columns=columns)
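# Usage sketch (stock pandas semantics, which set_index above mirrors): keys
# may mix existing column labels and array-likes, and append=True keeps the
# current index as the leading level(s).
import pandas as pd

_df = pd.DataFrame({"k": ["a", "b", "c"], "v": [1, 2, 3]})
_extra = pd.Series([10, 20, 30], name="extra")
_out = _df.set_index(["k", _extra], append=True)
print(_out.index.names)   # [None, 'k', 'extra']; 'k' is dropped from the columns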
def _prepare_columns(self): left = self._left right = self._right # Copy the lists of columns as we may update them in place right below left_columns = left._columns.copy() right_columns = right._columns.copy() if self._left_index or self._right_index: left_columns += util.to_list_if_scalar(left._index.column) right_columns += util.to_list_if_scalar(right._index.column) return (left_columns, right_columns)
def to_csv( self, path=None, sep=",", na_rep="", columns=None, header=True, index=True, line_terminator=None, chunksize=None, partition=False, column_names=None, ): columns = self._columns.copy() if index: columns = util.to_list_if_scalar(self._index.column) + columns column_names = ( util.to_list_if_scalar(self._index.name) + column_names ) column_names = [ na_rep if name is None else name for name in column_names ] if not partition: columns = [column.repartition(1) for column in columns] plan = Map(self._runtime, OpCode.TO_CSV) num_pieces = columns[0].num_pieces plan.add_scalar_arg(num_pieces, ty.uint32) plan.add_scalar_arg(chunksize, ty.uint32) plan.add_scalar_arg(partition, ty.bool) plan.add_scalar_arg(header, ty.bool) plan.add_scalar_arg(path, ty.string) plan.add_scalar_arg(sep, ty.string) plan.add_scalar_arg(na_rep, ty.string) plan.add_scalar_arg(line_terminator, ty.string) plan.add_scalar_arg(len(columns), ty.uint32) for column_name in column_names: plan.add_scalar_arg(column_name, ty.string) for column in columns: column.add_to_plan(plan, True) fm = plan.execute(columns[0].launch_domain) # Since we don't have a dependence mechanism to chain up tasks based on # their IO requirements, we need to block on these IO tasks so that # the effects are visible to the user upon the return of this function. fm.wait()
def _prepare_columns(self): input_columns = self._frame._columns.copy() if not self._ignore_index or self._sort_index: input_columns += util.to_list_if_scalar(self._frame._index.column) return input_columns
def _shuffle(self, key_indices): partitioner = HashPartitioner(self._runtime) num_columns = len(self._columns) inputs = self._columns.copy() if self._index.materialized: inputs.extend(util.to_list_if_scalar(self._index.column)) outputs = partitioner._hash_partition(inputs, key_indices) if not self._runtime.use_nccl: outputs = to_dense_columns(self._runtime, outputs) if self._index.materialized: result_index = create_index_from_columns( outputs[num_columns:], self._index.volume, util.to_list_if_not_none(self._index.name), ) outputs = outputs[:num_columns] else: result_index = create_range_index( outputs[0].storage, self._index.volume ) result = self.replace_columns(outputs, index=result_index) result.set_partition_keys(key_indices) return result
def droplevel(self, level): levels = util.to_list_if_scalar(level) levels = [self._get_level_number(lvl) for lvl in levels] if len(levels) >= self.nlevels: raise ValueError( f"Cannot remove {len(levels)} levels from an index with " f"{self.nlevels} levels: at least one level must be left.") return self._droplevel(levels)
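# Usage sketch (stock pandas): dropping index levels by name; removing every
# level is rejected, matching the ValueError raised above.
import pandas as pd

_idx = pd.MultiIndex.from_arrays([[1, 1, 2], ["x", "y", "x"]], names=["a", "b"])
print(_idx.droplevel("a"))      # Index(['x', 'y', 'x'], ..., name='b')
try:
    _idx.droplevel(["a", "b"])  # cannot drop all levels
except ValueError as e:
    print(e)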
def launch_future_task(self, op_code, futures, dtype=None): task = Task(self.get_task_id(op_code)) futures = to_list_if_scalar(futures) for future in futures: task.add_future(future) result = self.dispatch(task) if dtype is not None: result = result.cast(dtype) return result
def _get_level_names(self, levels): names = util.to_list_if_scalar(self.name) names = [names[lvl] for lvl in levels] if any(name is None for name in names): if len(names) == 1: names[0] = "index" else: names = [ f"level_{lvl}" if name is None else name for lvl, name in zip(levels, names) ] return names
def _set_columns_by_labels(self, key, item): keys = util.to_list_if_scalar(key) columns = self.columns # Validate keys found = [] fresh = [] for key in keys: if key in columns: found.append(key) else: fresh.append(key) # TODO: for now we disallow insertions mixed with inplace updates if len(found) > 0 and len(fresh) > 0: raise err._unsupported_error( "In-place updates cannot be mixed with insertions. " "Please split them into multiple statements.") if not is_scalar(item): item = self._ensure_valid_frame(item) _, item = self._align_frame(item, join="left", axis=0) if item._is_series: if len(keys) > 1: raise err._unsupported_error( "Broadcasting a series to multiple columns is " "not yet supported") else: if len(keys) != len(item.columns): raise ValueError("Columns must be same length as key") if len(found) > 0: indexer = columns.get_indexer_for(found) if is_scalar(item): item = self._frame.create_column_from_scalar(item) item = item.broadcast(len(indexer)) else: item = item._frame self._frame.update_columns(indexer, item) else: if is_scalar(item): for _ in range(len(fresh)): idx = self._frame.num_columns() self._frame = self._frame.insert(idx, item) else: item = DataFrame(frame=item._frame, columns=fresh) self._frame = self._frame.concat(1, item._frame) self._replace_columns(columns.append(pandas.Index(fresh)))
def convert_agg_func(agg_func): if isinstance(agg_func, str): if agg_func not in _SUPPORTED_AGGS: raise err._unsupported_error( f"Unsupported aggregation method: {agg_func}") return (agg_func, _NUMERIC_ONLY[agg_func]) elif is_dict_like(agg_func): converted = {} for col, func in agg_func.items(): funcs = util.to_list_if_scalar(convert_agg_func(func)) converted[col] = funcs return converted elif is_list_like(agg_func): return [convert_agg_func(func) for func in agg_func] else: raise err._unsupported_error( f"Unsupported aggregation descriptor: {agg_func}")
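# The descriptors normalized above mirror pandas' aggregation API; the stock
# pandas calls below exercise the same three shapes (string, list, dict) that
# convert_agg_func accepts.
import pandas as pd

_df = pd.DataFrame({"g": ["a", "a", "b"], "x": [1.0, 2.0, 3.0]})
print(_df.groupby("g").agg("sum"))
print(_df.groupby("g").agg(["sum", "count"]))
print(_df.groupby("g").agg({"x": ["mean", "count"]}))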
def slice_rows_by_slice(self, sl, is_loc=True, bounds=None):
    if bounds is None:
        bounds = self._index.find_bounds(sl.start, sl.stop, is_loc)

    rt = self._runtime
    storage = rt.create_output_storage()

    inputs = self._columns.copy()
    if self._index.materialized:
        inputs += util.to_list_if_scalar(self._index.column)
    outputs = [storage.create_similar_column(input) for input in inputs]

    if len(outputs) > 0:
        plan = Map(rt, OpCode.SLICE_BY_RANGE)

        plan.add_future(bounds)
        plan.add_scalar_arg(len(inputs), ty.uint32)
        plan.add_future(self._index.volume)
        for input, output in zip(inputs, outputs):
            input.add_to_plan(plan, True)
            output.add_to_plan_output_only(plan)
        counts = plan.execute(inputs[0].launch_domain)

        storage = plan.promote_output_storage(storage)
        self._runtime.register_external_weighted_partition(
            storage.default_ipart, counts
        )
        volume = counts.cast(ty.int64).sum()

        if self._index.materialized:
            result_index = create_index_from_columns(
                outputs[len(self._columns) :], volume, self._index.names
            )
        else:
            result_index = self._index.slice_by_bounds(bounds, storage)

        return self.replace_columns(
            outputs[: len(self._columns)], index=result_index
        )
    else:
        result_index = self._index.slice_by_bounds(bounds)
        return self.replace_columns([], index=result_index)
def _get_columns_by_labels(self, key): key_scalar = is_scalar(key) or isinstance(key, tuple) keys = util.to_list_if_scalar(key) columns = self.columns # Validate keys for key in keys: if key not in columns: raise KeyError(key) indexer = columns.get_indexer_for(keys) new_self = self._slice_columns(indexer) if key_scalar: assert len(new_self.columns) == 1 return new_self.squeeze(axis=1) else: return new_self
def reset_index(self, level=None, drop=False, inplace=False, col_level=0, col_fill=""): if inplace not in ( True, False, ): raise err._invalid_value_error("inplace", inplace) if drop not in ( True, False, ): raise err._invalid_value_error("drop", drop) if level is None: levels = list(range(self._raw_index.nlevels)) else: levels = util.to_list_if_scalar(level) levels = [self._raw_index._get_level_number(lvl) for lvl in levels] # Pandas seems to ignore the order in which the levels are specified # but rather sorts them levels = sorted(levels) frame = self._frame.reset_index(levels, drop) columns = self.columns # FIXME: For now we will ignore the corner case where a column # named index or level_0 already exists. if not drop: names = self._raw_index._get_level_names(levels) lev_num = columns._get_level_number(col_level) if isinstance(columns, pandas.MultiIndex): arrays = [[col_fill] * len(names)] * columns.nlevels arrays[lev_num] = names names = pandas.MultiIndex.from_arrays(arrays) else: names = pandas.Index(names) columns = names.append(columns) return self._create_or_update_frame(frame, inplace, columns=columns)
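# Usage sketch (stock pandas): resetting a subset of index levels; with
# MultiIndex columns, col_level places the new labels and col_fill pads the
# other column levels, matching the handling above.
import pandas as pd

_df = pd.DataFrame(
    [[1], [2], [3]],
    columns=pd.MultiIndex.from_tuples([("vals", "x")]),
    index=pd.MultiIndex.from_arrays([[1, 1, 2], ["a", "b", "a"]], names=["n", "s"]),
)
_out = _df.reset_index(level="s", col_level=1, col_fill="meta")
print(_out.columns.tolist())   # [('meta', 's'), ('vals', 'x')]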
def sort_values(
    self,
    by,
    axis=0,
    ascending=True,
    inplace: bool = False,
    kind="quicksort",
    na_position="last",
    ignore_index: bool = False,
):
    axis = self._get_axis_number(axis)
    if axis not in (0,):
        raise err._unsupported_error("axis", axis)

    if na_position not in (
        "first",
        "last",
    ):
        raise err._invalid_value_error("na_position", na_position)

    by = util.to_list_if_scalar(by)
    ascending = self._get_ascending(ascending, len(by))

    if len(by) != len(ascending):
        raise ValueError(
            f"Length of ascending ({len(ascending)}) != "
            f"length of by ({len(by)})")

    idxr = self.columns.get_indexer_for(by)
    if len(idxr) != len(by):
        # A label that matches more than one column makes the indexer
        # longer than the list of keys
        for key in by:
            if len(self.columns.get_indexer_for([key])) > 1:
                raise ValueError(
                    f"The column label '{key}' is not unique.")

    new_frame = self._frame.sort_values(
        idxr,
        axis,
        ascending,
        kind,
        na_position,
        ignore_index,
    )
    return self._create_or_update_frame(new_frame, inplace)
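# Usage sketch (stock pandas semantics, mirrored above): one ascending flag
# per key, with NaNs kept at the end.
import pandas as pd

_df = pd.DataFrame({"a": [2, 1, 2, 1], "b": [3.0, None, 1.0, 2.0]})
print(_df.sort_values(by=["a", "b"], ascending=[True, False], na_position="last"))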
def drop_duplicates( self, subset=None, keep="first", inplace=False, ignore_index=False, ): if subset is None: subset = list(range(len(self.columns))) else: subset = util.to_list_if_scalar(subset) idxr = self.columns.get_indexer_for(subset) mask = idxr == -1 if mask.any(): raise KeyError(list(np.compress(mask, subset))) subset = idxr if keep not in ("first", "last", False): raise ValueError("keep must be either 'first', 'last' or False") frame = self._frame.drop_duplicates(subset, keep, ignore_index) return self._create_or_update_frame(frame, inplace)
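# Usage sketch (stock pandas): 'subset' restricts the comparison and 'keep'
# picks the surviving row; keep=False drops every duplicated row.
import pandas as pd

_df = pd.DataFrame({"k": ["a", "a", "b"], "v": [1, 2, 3]})
print(_df.drop_duplicates(subset="k", keep="last"))
print(_df.drop_duplicates(subset="k", keep=False))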
def to_csv( self, path_or_buf=None, sep=",", na_rep="", columns=None, header=True, index=True, line_terminator=None, chunksize=None, partition=False, ): if not isinstance(path_or_buf, str): raise err._unsupported_error("path must be a string for now") if len(sep) != 1: raise err._unsupported_error("separator must be a character") line_terminator = (os.linesep if line_terminator is None else line_terminator) # The default chunk size is 8 chunksize = 8 if chunksize is None else chunksize new_self = self if columns is not None: new_self = self[util.to_list_if_scalar(columns)] new_self._frame.to_csv( path=path_or_buf, sep=sep, na_rep=na_rep, header=header, index=index, line_terminator=line_terminator, chunksize=chunksize, partition=partition, column_names=new_self.columns.to_list(), )
def reset_index(self, level=None, drop=False, name=None, inplace=False): if inplace not in ( True, False, ): raise err._invalid_value_error("inplace", inplace) if drop not in ( True, False, ): raise err._invalid_value_error("drop", drop) if level is None: levels = list(range(self._raw_index.nlevels)) else: levels = util.to_list_if_scalar(level) levels = [self._raw_index._get_level_number(lvl) for lvl in levels] # Pandas seems to ignore the order in which the levels are specified # but rather sorts them levels = sorted(levels) frame = self._frame.reset_index(levels, drop) if inplace and len(frame._columns) > 1: raise TypeError( "Cannot reset_index inplace on a Series to create a DataFrame") if drop: return self._create_or_update_frame(frame, inplace) if name is None: name = 0 if self.name is None else self.name names = self._raw_index._get_level_names(levels) + [name] columns = pandas.Index(names) from .dataframe import DataFrame return DataFrame(columns=columns, frame=frame)
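# Usage sketch (stock pandas): 'name' labels the value column, and an in-place
# reset that would turn the Series into a DataFrame raises the same TypeError
# as above.
import pandas as pd

_s = pd.Series([1, 2], index=pd.Index(["a", "b"], name="k"), name="v")
print(_s.reset_index(name="vals"))   # DataFrame with columns ['k', 'vals']
try:
    _s.reset_index(inplace=True)     # drop=False would create a DataFrame
except TypeError as e:
    print(e)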
def to_parquet(
    self,
    path,
    column_names,
    engine="auto",
    compression="snappy",
    index=None,
    partition_cols=None,
    **kwargs,
):
    token = self._create_directory(path)

    def _generate_pandas_metadata(table, column_names, index, materialized):
        pandas_schema = table.to_pandas(schema_only=True)
        pandas_schema.columns = column_names

        index_descs = []
        if index is not False:
            if index is None and not materialized:
                index_descs = [
                    {
                        "kind": "range",
                        "name": table._index.name,
                        "start": table._index.start,
                        "stop": table._index.stop,
                        "step": table._index.step,
                    }
                ]
            else:
                index_descs = [
                    f"__index_level_{level}__" if name is None else name
                    for level, name in enumerate(
                        util.to_list_if_scalar(table._index.name)
                    )
                ]
            column_names = index_descs + column_names

        if isinstance(pandas_schema.index, pandas.MultiIndex):
            index_levels = pandas_schema.index.levels
        else:
            index_levels = util.to_list_if_scalar(pandas_schema.index)

        from pyarrow import pandas_compat

        metadata = pandas_compat.construct_metadata(
            pandas_schema,
            column_names,
            index_levels,
            index_descs,
            index is not False,
            [col.dtype.to_arrow() for col in table._columns],
        )
        return metadata[str.encode("pandas")].decode(), index_descs

    materialized = self._index.materialized
    metadata, index_descs = _generate_pandas_metadata(
        self, column_names, index, materialized
    )

    columns = self._columns
    if index or (index is not False and materialized):
        columns = util.to_list_if_scalar(self._index.column) + columns
        column_names = index_descs + column_names
    assert len(columns) == len(column_names)

    compression = self._runtime.get_compression_type(compression)

    plan = Map(self._runtime, OpCode.TO_PARQUET)

    num_pieces = self._columns[0].num_pieces

    plan.add_future(token)
    plan.add_scalar_arg(num_pieces, ty.uint32)
    plan.add_scalar_arg(compression, ty.uint32)
    plan.add_scalar_arg(path, ty.string)
    plan.add_scalar_arg(metadata, ty.string)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for column_name in column_names:
        plan.add_scalar_arg(column_name, ty.string)
    for column in columns:
        column.add_to_plan(plan, True)
    fm = plan.execute(self._columns[0].launch_domain)

    # TODO: Once we move the metadata generation to a Python task,
    #       we can avoid blocking here and instead chain the task
    #       to it.
    fm.wait()

    # TODO: We will move this post processing to a Python task and
    #       get rid of the use of shard id here.
    if self._runtime._this_is_first_node():
        import pyarrow.parquet as pq

        metadata = None
        num_digits = int(log10(num_pieces)) + 1
        for idx in range(num_pieces):
            part = f"part%0{num_digits}d.parquet" % idx
            md = pq.ParquetFile(os.path.sep.join([path, part])).metadata
            md.set_file_path(part)
            if metadata is None:
                metadata = md
            else:
                metadata.append_row_groups(md)
        metadata.write_metadata_file(os.path.sep.join([path, "_metadata"]))
def concat(self, axis, others, **kwargs):
    others = util.to_list_if_scalar(others)

    if axis == 1:
        columns = self._columns.copy()
        for other in others:
            columns.extend(other._columns)
        if len(self._columns) == 0:
            return Table(self._runtime, others[0]._index, columns)
        else:
            return Table(
                self._runtime,
                self._index,
                columns,
            )
    else:
        assert axis == 0

        dfs = [self] + others
        num_dfs = len(dfs)

        result_storage = self._runtime.create_output_storage()

        partition_keys = self.partition_keys

        # FIXME: Here we assume that the dataframes have the same set
        #        of columns. When an input dataframe is missing a column
        #        that the other inputs have, it should be implicitly
        #        extended with a column of nulls while being concatenated.
        index_dtypes = util.to_list_if_scalar(self._index.dtype)
        value_dtypes = util.get_dtypes(self._columns)

        num_levels = len(index_dtypes)
        num_values = len(value_dtypes)

        all_index_columns = []
        all_value_columns = []

        # Here the access to the internal _column member of self's index
        # is intentional, as we want to avoid materializing the index
        # unnecessarily
        num_pieces = self._index._column.num_pieces
        for df in dfs:
            index_columns = util.to_list_if_scalar(df._index.column)
            all_index_columns.append(
                [column.repartition(num_pieces) for column in index_columns]
            )
            all_value_columns.append(
                [
                    df._columns[i].repartition(num_pieces)
                    for i in range(num_values)
                ]
            )
            partition_keys = self.join_partition_keys(
                partition_keys, df.partition_keys
            )

        nullable_index = [
            any(columns[i].nullable for columns in all_index_columns)
            for i in range(num_levels)
        ]
        nullable_value = [
            any(columns[i].nullable for columns in all_value_columns)
            for i in range(num_values)
        ]

        result_index_columns = result_storage.create_columns(
            index_dtypes, nullable=nullable_index
        )
        result_value_columns = result_storage.create_columns(
            value_dtypes, nullable=nullable_value
        )

        plan = Map(self._runtime, OpCode.CONCATENATE)

        plan.add_scalar_arg(num_levels + num_values, ty.uint32)
        for column in result_index_columns:
            column.add_to_plan_output_only(plan)
        for column in result_value_columns:
            column.add_to_plan_output_only(plan)

        plan.add_scalar_arg(num_dfs, ty.uint32)
        for i in range(num_dfs):
            for column in all_index_columns[i]:
                column.add_to_plan(plan, True)
            for column in all_value_columns[i]:
                column.add_to_plan(plan, True)

        launch_domain = self._index._column.launch_domain
        counts = plan.execute(launch_domain)

        result_storage = plan.promote_output_storage(result_storage)
        self._runtime.register_external_weighted_partition(
            result_storage.default_ipart, counts
        )
        del plan

        index_names = util.to_list_if_scalar(self._index.name)
        total_count = counts.cast(ty.int64).sum()
        result_index = create_index_from_columns(
            result_index_columns, total_count, index_names
        )

        result = Table(
            self._runtime,
            result_index,
            result_value_columns,
        )
        result.set_partition_keys(partition_keys)
        return result
def _shuffle(self, keys): keys = util.to_list_if_scalar(keys) idxr = self.columns.get_indexer_for(keys) return self._create_or_update_frame(self._frame._shuffle(idxr), False)
def dropna(self, axis, idxr, thresh): assert axis == 0 assert idxr is not None result_storage = self._runtime.create_output_storage() result_columns = [] result_index_columns = [] plan = Map(self._runtime, OpCode.DROPNA) plan.add_scalar_arg(thresh, ty.uint32) plan.add_scalar_arg(len(idxr), ty.uint32) for idx in idxr: plan.add_scalar_arg(idx, ty.int32) num_columns = len(self._columns) plan.add_scalar_arg(num_columns, ty.uint32) for i in range(num_columns): input = self._columns[i] output = result_storage.create_similar_column(input) result_columns.append(output) input.add_to_plan(plan, True) output.add_to_plan_output_only(plan) index_dtypes = util.to_list_if_scalar(self._index.dtype) plan.add_scalar_arg(len(index_dtypes), ty.uint32) input_index_materialized = self._index.materialized plan.add_scalar_arg(input_index_materialized, ty.bool) if input_index_materialized: input_index_columns = util.to_list_if_scalar(self._index.column) for input, index_dtype in zip(input_index_columns, index_dtypes): output = result_storage.create_column( index_dtype, nullable=input.nullable ) result_index_columns.append(output) input.add_to_plan(plan, True) output.add_to_plan_output_only(plan) else: plan.add_future(self._index._start) plan.add_future(self._index._step) for index_dtype in index_dtypes: output = result_storage.create_column( index_dtype, nullable=False ) output.add_to_plan_output_only(plan) result_index_columns.append(output) counts = plan.execute(self._columns[0].launch_domain) volume = counts.cast(ty.int64).sum() result_storage = plan.promote_output_storage(result_storage) self._runtime.register_external_weighted_partition( result_storage.default_ipart, counts ) del plan result_index = create_index_from_columns( result_index_columns, volume, self._index.names ) return self.replace_columns(result_columns, index=result_index)
def read_parquet(path, columns, **kwargs): from legate.core import Rect from .runtime import _runtime as rt path = util.to_list_if_scalar(path) if len(path) == 1 and os.path.isdir(path[0]): from pyarrow.parquet import ParquetDataset ds = ParquetDataset(path) path = [piece.path for piece in ds.pieces] else: from pyarrow.parquet import ParquetFile ds = ParquetFile(path[0]) if rt.debug: assert all(ParquetFile(p).schema == ds.schema for p in path) dedup_names = set() for name in ds.schema.names: if name in dedup_names: raise ValueError( "Duplicate column names in schema are not supported.") dedup_names.add(name) schema = ds.schema.to_arrow_schema() index_descs = [] index_materialized = False if str.encode("pandas") in ds.metadata.metadata: import json pandas_metadata = json.loads( ds.metadata.metadata[str.encode("pandas")]) index_descs = pandas_metadata["index_columns"] index_materialized = len(index_descs) > 0 and all( isinstance(desc, str) for desc in index_descs) if columns is None: column_names = schema.names elif index_materialized: column_names = columns + index_descs else: column_names = columns for name in column_names: if name not in dedup_names: raise ValueError("Field named %s not found in the schema." % name) schema = [schema.field(name) for name in column_names] del columns storage = rt.create_output_storage() offsets_storage = None columns = [] for column_info in schema: dtype = ty.to_legate_dtype(column_info.type) column = storage.create_column(dtype) if ty.is_string_dtype(dtype): if offsets_storage is None: offsets_storage = rt.create_output_storage() offsets_column = offsets_storage.create_column(ty.int32, nullable=False) chars_storage = rt.create_output_storage() char_column = chars_storage.create_column(ty.int8, nullable=False) column.add_child(offsets_column) column.add_child(char_column) column = column.as_string_column() columns.append(column) plan = Map(rt, OpCode.READ_PARQUET) plan.add_scalar_arg(len(path), ty.uint32) for f in path: plan.add_scalar_arg(f, ty.string) plan.add_scalar_arg(len(column_names), ty.uint32) for name in column_names: plan.add_scalar_arg(name, ty.string) plan.add_scalar_arg(len(columns), ty.uint32) for column in columns: column.add_to_plan_output_only(plan) counts = plan.execute(Rect([rt.num_pieces])) storage = plan.promote_output_storage(storage) rt.register_external_weighted_partition(storage.default_ipart, counts) del plan size = counts.cast(ty.int64).sum() if index_materialized: to_filter = set(index_descs) index_columns = [] value_columns = [] value_column_names = [] for idx, name in enumerate(column_names): if name in to_filter: index_columns.append(columns[idx]) else: value_columns.append(columns[idx]) value_column_names.append(column_names[idx]) sanitized_names = [ None if name == f"__index_level_{level}__" else name for level, name in enumerate(index_descs) ] index = create_index_from_columns(index_columns, size, sanitized_names) else: value_columns = columns value_column_names = column_names if len(index_descs) > 0: assert len(index_descs) == 1 index_desc = index_descs[0] name = index_desc["name"] start = rt.create_future(index_desc["start"], ty.int64) stop = rt.create_future(index_desc["stop"], ty.int64) step = rt.create_future(index_desc["step"], ty.int64) index = create_range_index(storage, size, name, start, stop, step) else: index = create_range_index(storage, size) from pandas import Index return { "frame": Table(rt, index, value_columns), "columns": Index(value_column_names), }
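# Illustration of the "pandas" key-value metadata consumed above, produced
# here with stock pandas/pyarrow; the temporary paths exist only for this
# sketch. A RangeIndex is recorded as a {"kind": "range", ...} descriptor,
# while a materialized index shows up as a list of index column names.
import json
import os
import tempfile

import pandas as pd
import pyarrow.parquet as pq

_tmp = tempfile.mkdtemp()
_df = pd.DataFrame({"a": [1, 2, 3]})

_df.to_parquet(os.path.join(_tmp, "range.parquet"))
_md = pq.ParquetFile(os.path.join(_tmp, "range.parquet")).metadata.metadata
print(json.loads(_md[b"pandas"])["index_columns"])   # [{'kind': 'range', ...}]

_df.set_index(pd.Index([10, 20, 30], name="key")).to_parquet(
    os.path.join(_tmp, "named.parquet")
)
_md = pq.ParquetFile(os.path.join(_tmp, "named.parquet")).metadata.metadata
print(json.loads(_md[b"pandas"])["index_columns"])    # ['key']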
def read_csv(
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    prefix=None,
    mangle_dupe_cols=True,
    dtype=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    na_values=None,
    skip_blank_lines=True,
    parse_dates=False,
    compression="infer",
    quotechar='"',
    quoting=0,
    doublequote=True,
    verify_header=False,
    **kwargs,
    # TODO: Put back these options once we figure out how to support them
    #       with the Arrow CSV reader.
    # skipinitialspace=False,  # GPU only
    # keep_default_na=True,  # GPU only
    # na_filter=True,  # GPU only
    # dayfirst=False,  # GPU only
    # thousands=None,  # GPU only
    # decimal=".",  # GPU only
    # lineterminator=None,  # GPU only
    # comment=None,  # GPU only
    # delim_whitespace=False,  # GPU only
):
    # Checks on filepath_or_buffer
    paths = util.to_list_if_scalar(filepath_or_buffer)
    if any(not isinstance(path, str) for path in paths):
        raise err._unsupported_error(
            "'filepath_or_buffer' must be a string or a list of strings")
    if len(paths) == 0:
        raise ValueError("'filepath_or_buffer' must be a non-empty list")
    for path in paths:
        if not os.path.exists(path):
            raise ValueError(f"{path} does not exist")

    if not isinstance(compression, str):
        raise err._unsupported_error("compression", compression)
    compressions = [
        _parse_compression(infer_compression(path, compression))
        for path in paths
    ]

    # Checks on sep and delimiter
    if sep is None and delimiter is None:
        raise ValueError("at least one of 'sep' or 'delimiter' must be given")
    sep = delimiter if delimiter is not None else sep
    if len(sep) > 1:
        raise ValueError("'sep' must be a 1-character string")

    # Checks on header and names
    if header == "infer":
        header = 0 if names is None else None
    if header not in (
        0,
        None,
    ):
        raise err._unsupported_error("header", header)

    # Checks on skiprows, skipfooter, and nrows
    skiprows = 0 if skiprows is None else skiprows
    if not is_integer(skiprows):
        raise ValueError("'skiprows' must be an integer")
    if not is_integer(skipfooter):
        raise ValueError("'skipfooter' must be an integer")
    if not (nrows is None or is_integer(nrows)):
        raise ValueError("'nrows' must be None or an integer")

    # If either column names or dtype is missing, infer them by parsing
    # the first few lines using Pandas
    # FIXME: We should use cuDF for this
    if names is None or dtype is None:
        engine = "python" if skipfooter > 0 else "c"
        column_names, dtypes = _extract_header_using_pandas(
            paths[0],
            sep,
            header,
            names,
            dtype,
            true_values,
            false_values,
            skiprows,
            na_values,
            skip_blank_lines,
            parse_dates,
            compression,
            quotechar,
            quoting,
            doublequote,
            engine,
            peek_rows=3,
        )
        if verify_header:
            for path in paths[1:]:
                result = _extract_header_using_pandas(
                    path,
                    sep,
                    header,
                    names,
                    dtype,
                    true_values,
                    false_values,
                    skiprows,
                    na_values,
                    skip_blank_lines,
                    parse_dates,
                    compression,
                    quotechar,
                    quoting,
                    doublequote,
                    engine,
                    peek_rows=3,
                )
                if not column_names.equals(result[0]):
                    raise ValueError(
                        f"{paths[0]} and {path} have different headers")
    else:
        column_names = pandas.Index(names)
        if is_dict_like(dtype):
            dtypes = []
            for name in names:
                if name not in dtype:
                    raise ValueError(f"'dtype' has no entry for '{name}'")
                dtypes.append(_ensure_dtype(dtype[name]))
        elif is_list_like(dtype):
            raise err._unsupported_error(
                "'dtype' must be a string, a dtype, or a dictionary")
        else:
            dtype = _ensure_dtype(dtype)
            dtypes = [dtype] * len(names)

    if column_names.has_duplicates:
        raise ValueError("Header must not have any duplicates")

    # Checks on unsupported options
    if prefix is not None:
        raise err._unsupported_error("prefix", prefix)
    if mangle_dupe_cols not in (True,):
        raise err._unsupported_error("mangle_dupe_cols", mangle_dupe_cols)

    # If there was a header in the file, we should skip that line as well
    if header == 0:
        skiprows += 1

    # Checks on parse_dates
    _ERR_MSG_PARSE_DATES = (
        "'parse_dates' must be a list of integers or strings for now")

    if is_dict_like(parse_dates):
        raise err._unsupported_error(_ERR_MSG_PARSE_DATES)

    parse_dates = parse_dates if parse_dates is not False else []
    if not is_list_like(parse_dates):
        raise err._unsupported_error(_ERR_MSG_PARSE_DATES)

    date_cols = _get_indexer(column_names, parse_dates, "parse_dates")

    # Override dtypes for the datetime columns
    for idx in date_cols:
        dtypes[idx] = ty.ts_ns

    # If a column is given a datetime dtype but is not listed in
    # parse_dates, we should record it as a date column as well
    for idx, col_dtype in enumerate(dtypes):
        if col_dtype == ty.ts_ns and idx not in date_cols:
            date_cols.append(idx)

    # Checks on quoting
    if quoting != 0:
        raise err._unsupported_error("quoting", quoting)
    if len(quotechar) > 1:
        raise ValueError("'quotechar' must be a 1-character string")

    # Checks on index_col
    index_col = None if index_col is False else index_col
    if index_col is not None:
        if is_integer(index_col) or isinstance(index_col, str):
            index_col = [index_col]
        if not is_list_like(index_col):
            raise err._unsupported_error("index_col", index_col)
        index_col = _get_indexer(column_names, index_col, "index_col")

    # Checks on true_values, false_values, and na_values
    _check_string_list(true_values, "true_values")
    _check_string_list(false_values, "false_values")
    _check_string_list(na_values, "na_values")

    # Checks on nrows
    if skipfooter != 0 and nrows is not None:
        raise ValueError("'skipfooter' not supported with 'nrows'")

    df = DataFrame(
        frame=io.read_csv(
            paths,
            sep=sep,
            usecols=usecols,
            dtypes=dtypes,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            skipfooter=skipfooter,
            nrows=nrows,
            na_values=na_values,
            skip_blank_lines=skip_blank_lines,
            date_cols=date_cols,
            compressions=compressions,
            quotechar=quotechar,
            quoting=quoting,
            doublequote=doublequote,
        ),
        columns=column_names,
    )

    if index_col is not None:
        df = df.set_index(column_names[index_col])
        # Make sure we reset the names for unnamed indices
        names = df._raw_index.names
        names = [
            None if name is not None and name.startswith("Unnamed") else name
            for name in names
        ]
        df._raw_index.names = names

    return df
def __init__(
    self, df, by, axis, level, as_index, sort, method, is_series_groupby
):
    axis = df._get_axis_number(axis)
    if axis not in (0,):
        raise err._unsupported_error("axis", axis)

    if by is None and level is None:
        raise TypeError("You have to supply one of 'by' and 'level'")

    self._df = df
    self._axis = axis
    self._as_index = as_index
    self._sort = sort
    self._method = method
    self._is_series_groupby = is_series_groupby

    if level is not None:
        levels = util.to_list_if_scalar(level)
        self._keys = [
            df._raw_index._get_level_number(lvl) for lvl in levels
        ]
        # Reset the levels chosen as the groupby keys so that they
        # appear in the frame
        self._df = self._df.reset_index(self._keys)
        # The pushed-out index levels are now the first few columns
        # in the frame, so we should change the key indices to pick
        # them correctly as the groupby keys later
        # A technical note: reset_index internally sorts level
        # numbers before it pushes out the corresponding levels
        # to the dataframe. Therefore, we use argsort to compute
        # the positions of the columns that we later pick for indices.
        self._keys = [
            p[0] for p in sorted(enumerate(self._keys), key=lambda p: p[1])
        ]
        self._levels = self._keys
    else:
        if df._is_series:
            raise err._unsupported_error(
                f"{type(self._df).__name__} only supports level")

        keys = util.to_list_if_scalar(by)
        if any(not isinstance(key, str) for key in keys):
            raise err._unsupported_error(
                "groupby keys must be column names for now")

        idxr = []
        columns = df._get_columns()
        for key in keys:
            idx = columns.get_indexer_for([key])
            if len(idx) > 1:
                raise KeyError(f"ambiguous key name {key}")
            if idx[0] == -1:
                raise KeyError(key)
            idxr.extend(idx)
        self._keys = idxr
        self._levels = []
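# Worked example of the argsort computation above: for the requested level
# numbers, it yields, for each level in increasing order, the position at
# which that level was requested.
_levels = [2, 0]
print([p[0] for p in sorted(enumerate(_levels), key=lambda p: p[1])])   # [1, 0]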
def select(self, mask): if isinstance(mask, Table): assert len(mask._columns) == 1 mask = mask._columns[0] if self._runtime.debug: assert isinstance(mask, Column) assert mask.dtype == ty.bool result_storage = self._runtime.create_output_storage() result_columns = [] result_index_columns = [] plan_compact = Map(self._runtime, OpCode.COMPACT) mask.add_to_plan(plan_compact, True) num_columns = len(self._columns) plan_compact.add_scalar_arg(num_columns, ty.uint32) for i in range(num_columns): input = self._columns[i] output = result_storage.create_similar_column(input) result_columns.append(output) input.add_to_plan(plan_compact, True) output.add_to_plan_output_only(plan_compact) index_dtypes = util.to_list_if_scalar(self._index.dtype) plan_compact.add_scalar_arg(len(index_dtypes), ty.uint32) input_index_materialized = self._index.materialized plan_compact.add_scalar_arg(input_index_materialized, ty.bool) if input_index_materialized: input_index_columns = util.to_list_if_scalar(self._index.column) for input, index_dtype in zip(input_index_columns, index_dtypes): output = result_storage.create_column( index_dtype, nullable=input.nullable ) result_index_columns.append(output) input.add_to_plan(plan_compact, True) output.add_to_plan_output_only(plan_compact) else: plan_compact.add_future(self._index._start) plan_compact.add_future(self._index._step) for index_dtype in index_dtypes: output = result_storage.create_column( index_dtype, nullable=False ) output.add_to_plan_output_only(plan_compact) result_index_columns.append(output) counts = plan_compact.execute(mask.launch_domain) volume = counts.cast(ty.int64).sum() result_storage = plan_compact.promote_output_storage(result_storage) self._runtime.register_external_weighted_partition( result_storage.default_ipart, counts ) del plan_compact result_index = create_index_from_columns( result_index_columns, volume, self._index.names ) return self.replace_columns(result_columns, index=result_index)
def drop(
    self,
    labels=None,
    axis=0,
    index=None,
    columns=None,
    level=None,
    inplace=False,
    errors="raise",
):
    row_labels = []
    row_level = None
    col_labels = []
    col_level = None

    # If 'labels' is set, we use 'axis' to determine the lookup axis
    if labels is not None:
        if index is not None or columns is not None:
            raise ValueError(
                "Cannot specify both 'labels' and 'index'/'columns'")
        axis = self._get_axis_number(axis)
        if axis == 0:
            row_labels = util.to_list_if_scalar(labels)
            row_level = level
        else:
            col_labels = util.to_list_if_scalar(labels)
            col_level = level

    # Otherwise, we use 'columns' and 'index' as lookup labels
    else:
        if not self._is_series and columns is not None:
            col_labels = util.to_list_if_scalar(columns)
            col_level = level
        if index is not None:
            row_labels = util.to_list_if_scalar(index)
            row_level = level

    def _validate_labels(index, labels, level, membership=True):
        for label in labels:
            if not util.is_tuple(label):
                continue
            if len(label) > index.nlevels:
                raise KeyError(
                    f"Key length ({len(label)}) exceeds "
                    f"index depth ({index.nlevels})")

        if not membership:
            return

        if level is not None:
            level = index._get_level_number(level)
            index = index.get_level_values(level)

        for label in labels:
            if label not in index:
                raise KeyError(label)

    new_self = self.copy(deep=False)

    # Drop columns first as that's easier
    if len(col_labels) > 0:
        assert not new_self._is_series
        _validate_labels(new_self.columns, col_labels, col_level)

        columns = new_self.columns.drop(col_labels, col_level)
        idxr = new_self.columns.get_indexer_for(columns)
        new_self = new_self._slice_columns(idxr)

    # Then drop rows using selection
    if len(row_labels) > 0:
        _validate_labels(new_self._raw_index, row_labels, row_level, False)

        if len(row_labels) > 1:
            raise err._unsupported_error("Label must be a scalar for now")
        row_label = row_labels[0]

        if row_level is not None and not is_scalar(row_label):
            raise ValueError("label must be a scalar when 'level' is set")

        if util.is_tuple(row_label) and len(row_label) == 0:
            raise ValueError("label must not be empty")

        mask = new_self._raw_index._get_drop_mask_for(row_label, row_level)
        new_frame = new_self._frame.select(mask)
        new_self._frame = new_frame

    if inplace:
        if self._is_series:
            self._update_frame(new_self._frame)
        else:
            self._update_frame(new_self._frame, columns=new_self.columns)
    else:
        return new_self
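# Usage sketch (stock pandas): 'labels' + 'axis' is the positional spelling of
# the 'index'/'columns' keywords; mixing the two forms is rejected above.
import pandas as pd

_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["x", "y"])
print(_df.drop("a", axis=1))   # same as _df.drop(columns="a")
print(_df.drop(index="x"))     # drops the row labelled "x"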