Code Example #1
    def sort_index(
        self,
        axis=0,
        level=None,
        ascending=True,
        inplace=False,
        kind="quicksort",
        na_position="last",
        sort_remaining=True,
        ignore_index: bool = False,
    ):
        axis = self._get_axis_number(axis)
        if axis not in (0, ):
            raise err._unsupported_error("axis", axis)

        nlevels = self._raw_index.nlevels
        if nlevels == 1:
            # Pandas ignores level and sort_remaining for single-level indices,
            levels = [0] if level is None else util.to_list_if_scalar(level)
            # and it casts ascending to a boolean value...
            ascending = [bool(ascending)]
        else:
            if level is None:
                levels = list(range(nlevels))
                # When level is None, Pandas crops the ascending list
                # to match its length to the number of levels...
                ascending = self._get_ascending(ascending, nlevels)[:nlevels]
            else:
                levels = util.to_list_if_scalar(level)
                levels = [
                    self._raw_index._get_level_number(lvl) for lvl in levels
                ]
                default_asc = bool(ascending)
                ascending = self._get_ascending(ascending, len(levels))
                if len(ascending) != len(levels):
                    raise ValueError(
                        "level must have same length as ascending")
                # XXX: Pandas ignores sort_remaining for multi-level indices
                #      (GH #24247), and always sorts the levels monotonically
                #      before the actual sorting...
                #      Here we do the right thing and hopefully Pandas fixes
                #      its bug in the future.
                if sort_remaining:
                    already_added = set(levels)
                    for lvl in range(nlevels):
                        if lvl not in already_added:
                            levels.append(lvl)
                            ascending.append(default_asc)

        new_frame = self._frame.sort_index(
            axis=axis,
            levels=levels,
            ascending=ascending,
            kind=kind,
            na_position=na_position,
            ignore_index=ignore_index,
        )
        return self._create_or_update_frame(new_frame, inplace)
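Every snippet on this page funnels scalar-or-list arguments through util.to_list_if_scalar. The legate.pandas implementation of that helper is not shown here; the following is only a plausible sketch of what such a normalizer does (the real helper may treat tuples, Series, and other containers differently):

def to_list_if_scalar(value):
    # Plausible sketch only: pass lists through unchanged and wrap
    # everything else (scalars, tuples, Series, ...) in a one-element list.
    return value if isinstance(value, list) else [value]


assert to_list_if_scalar("a") == ["a"]
assert to_list_if_scalar(["a", "b"]) == ["a", "b"]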
Code Example #2
File: dataframe.py  Project: nv-legate/legate.pandas
    def set_index(
        self,
        keys,
        drop=True,
        append=False,
        inplace=False,
        verify_integrity=False,
    ):
        if inplace not in (
                True,
                False,
        ):
            raise err._invalid_value_error("inplace", inplace)
        keys = util.to_list_if_scalar(keys)
        keys = [
            Series(key) if not isinstance(key, (str, Series)) else key
            for key in keys
        ]

        frame = self._frame
        columns = self.columns

        missing = []
        to_drop = []
        to_set = []
        names = []
        if append:
            to_set = util.to_list_if_scalar(self._raw_index.column)
            names = util.to_list_if_scalar(self._raw_index.name)

        for key in keys:
            if not isinstance(key, Series):
                if key in columns:
                    idxr = columns.get_indexer_for([key])
                    to_drop.extend(idxr)
                    to_set.extend(self._frame.select_columns(idxr))
                    names.append(key)
                else:
                    missing.append(key)
            else:
                new_len = len(key)
                old_len = len(self)
                if new_len != old_len:
                    raise ValueError(
                        f"Length mismatch: Expected {old_len} rows, "
                        f"received array of length {new_len}")
                to_set.append(key._frame._columns[0])
                names.append(key.name)

        if missing:
            raise KeyError(f"None of {missing} are in the columns")

        if drop:
            columns = columns.delete(to_drop)
            frame = frame.drop_columns(to_drop)

        frame = frame.set_index(to_set, names)
        return DataFrame(frame=frame, columns=columns)
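Since legate.pandas aims to mirror the pandas API, the behavior implemented above can be illustrated with plain pandas (the data below is made up for illustration):

import pandas as pd

df = pd.DataFrame({"k": ["a", "b", "c"], "v": [1, 2, 3]})

# Setting an existing column as the index drops it from the columns by default.
out = df.set_index("k")
assert list(out.index) == ["a", "b", "c"] and list(out.columns) == ["v"]

# A Series of matching length may also be used; a length mismatch raises
# ValueError, just like the check in the snippet above.
out = df.set_index(pd.Series([10, 20, 30]))
assert list(out.index) == [10, 20, 30]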
Code Example #3
File: merge.py  Project: magnatelee/legate.pandas
    def _prepare_columns(self):
        left = self._left
        right = self._right

        # Copy the lists of columns as we may update them in place right below
        left_columns = left._columns.copy()
        right_columns = right._columns.copy()

        if self._left_index or self._right_index:
            left_columns += util.to_list_if_scalar(left._index.column)
            right_columns += util.to_list_if_scalar(right._index.column)

        return (left_columns, right_columns)
Code Example #4
File: table.py  Project: nv-legate/legate.pandas
    def to_csv(
        self,
        path=None,
        sep=",",
        na_rep="",
        columns=None,
        header=True,
        index=True,
        line_terminator=None,
        chunksize=None,
        partition=False,
        column_names=None,
    ):
        columns = self._columns.copy()

        if index:
            columns = util.to_list_if_scalar(self._index.column) + columns
            column_names = (
                util.to_list_if_scalar(self._index.name) + column_names
            )
            column_names = [
                na_rep if name is None else name for name in column_names
            ]

        if not partition:
            columns = [column.repartition(1) for column in columns]

        plan = Map(self._runtime, OpCode.TO_CSV)
        num_pieces = columns[0].num_pieces

        plan.add_scalar_arg(num_pieces, ty.uint32)
        plan.add_scalar_arg(chunksize, ty.uint32)
        plan.add_scalar_arg(partition, ty.bool)
        plan.add_scalar_arg(header, ty.bool)
        plan.add_scalar_arg(path, ty.string)
        plan.add_scalar_arg(sep, ty.string)
        plan.add_scalar_arg(na_rep, ty.string)
        plan.add_scalar_arg(line_terminator, ty.string)
        plan.add_scalar_arg(len(columns), ty.uint32)
        for column_name in column_names:
            plan.add_scalar_arg(column_name, ty.string)
        for column in columns:
            column.add_to_plan(plan, True)

        fm = plan.execute(columns[0].launch_domain)
        # Since we don't have a dependence mechanism to chain up tasks based on
        # their IO requirements, we need to block on these IO tasks so that
        # the effects are visible to the user upon the return of this function.
        fm.wait()
Code Example #5
File: sort.py  Project: nv-legate/legate.pandas
    def _prepare_columns(self):
        input_columns = self._frame._columns.copy()

        if not self._ignore_index or self._sort_index:
            input_columns += util.to_list_if_scalar(self._frame._index.column)

        return input_columns
Code Example #6
File: table.py  Project: nv-legate/legate.pandas
    def _shuffle(self, key_indices):
        partitioner = HashPartitioner(self._runtime)

        num_columns = len(self._columns)
        inputs = self._columns.copy()
        if self._index.materialized:
            inputs.extend(util.to_list_if_scalar(self._index.column))
        outputs = partitioner._hash_partition(inputs, key_indices)
        if not self._runtime.use_nccl:
            outputs = to_dense_columns(self._runtime, outputs)
        if self._index.materialized:
            result_index = create_index_from_columns(
                outputs[num_columns:],
                self._index.volume,
                util.to_list_if_not_none(self._index.name),
            )
            outputs = outputs[:num_columns]
        else:
            result_index = create_range_index(
                outputs[0].storage, self._index.volume
            )

        result = self.replace_columns(outputs, index=result_index)
        result.set_partition_keys(key_indices)
        return result
Code Example #7
 def droplevel(self, level):
     levels = util.to_list_if_scalar(level)
     levels = [self._get_level_number(lvl) for lvl in levels]
     if len(levels) >= self.nlevels:
         raise ValueError(
             f"Cannot remove {len(levels)} levels from an index with "
             f"{self.nlevels} levels: at least one level must be left.")
     return self._droplevel(levels)
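The guard above matches pandas' droplevel contract: at least one level must remain. In plain pandas, with made-up data:

import pandas as pd

idx = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["outer", "inner"])

# Dropping one of the two levels leaves a flat Index.
assert list(idx.droplevel("outer")) == [1, 2]

# Dropping every level is rejected, mirroring the ValueError raised above.
try:
    idx.droplevel(["outer", "inner"])
except ValueError as e:
    print(e)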
Code Example #8
 def launch_future_task(self, op_code, futures, dtype=None):
     task = Task(self.get_task_id(op_code))
     futures = to_list_if_scalar(futures)
     for future in futures:
         task.add_future(future)
     result = self.dispatch(task)
     if dtype is not None:
         result = result.cast(dtype)
     return result
Code Example #9
File: table.py  Project: nv-legate/legate.pandas
        def _generate_pandas_metadata(
            table, column_names, index, materialized
        ):
            pandas_schema = table.to_pandas(schema_only=True)
            pandas_schema.columns = column_names

            index_descs = []
            if index is not False:
                if index is None and not materialized:
                    index_descs = [
                        {
                            "kind": "range",
                            "name": table._index.name,
                            "start": table._index.start,
                            "stop": table._index.stop,
                            "step": table._index.step,
                        }
                    ]
                else:
                    index_descs = [
                        f"__index_level_{level}__" if name is None else name
                        for level, name in enumerate(
                            util.to_list_if_scalar(table._index.name)
                        )
                    ]
                    column_names = index_descs + column_names

            if isinstance(pandas_schema.index, pandas.MultiIndex):
                index_levels = pandas_schema.index.levels
            else:
                index_levels = util.to_list_if_scalar(pandas_schema.index)

            from pyarrow import pandas_compat

            metadata = pandas_compat.construct_metadata(
                pandas_schema,
                column_names,
                index_levels,
                index_descs,
                index is not False,
                [col.dtype.to_arrow() for col in table._columns],
            )
            return metadata[str.encode("pandas")].decode(), index_descs
Code Example #10
    def _get_level_names(self, levels):
        names = util.to_list_if_scalar(self.name)
        names = [names[lvl] for lvl in levels]

        if any(name is None for name in names):
            if len(names) == 1:
                names[0] = "index"
            else:
                names = [
                    f"level_{lvl}" if name is None else name
                    for lvl, name in zip(levels, names)
                ]
        return names
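The fallback names above follow pandas' reset_index convention: a single unnamed index becomes "index", while unnamed levels of a MultiIndex become "level_<n>". In plain pandas:

import pandas as pd

# A single unnamed index turns into a column named "index".
print(pd.DataFrame({"v": [1]}).reset_index().columns.tolist())
# ['index', 'v']

# Unnamed MultiIndex levels turn into "level_0", "level_1", ...
idx = pd.MultiIndex.from_tuples([("a", 1)])
print(pd.DataFrame({"v": [1]}, index=idx).reset_index().columns.tolist())
# ['level_0', 'level_1', 'v']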
Code Example #11
File: dataframe.py  Project: nv-legate/legate.pandas
    def _set_columns_by_labels(self, key, item):
        keys = util.to_list_if_scalar(key)
        columns = self.columns

        # Validate keys
        found = []
        fresh = []
        for key in keys:
            if key in columns:
                found.append(key)
            else:
                fresh.append(key)

        # TODO: for now we disallow insertions mixed with inplace updates
        if len(found) > 0 and len(fresh) > 0:
            raise err._unsupported_error(
                "In-place updates cannot be mixed with insertions. "
                "Please split them into multiple statements.")

        if not is_scalar(item):
            item = self._ensure_valid_frame(item)
            _, item = self._align_frame(item, join="left", axis=0)

            if item._is_series:
                if len(keys) > 1:
                    raise err._unsupported_error(
                        "Broadcasting a series to multiple columns is "
                        "not yet supported")
            else:
                if len(keys) != len(item.columns):
                    raise ValueError("Columns must be same length as key")

        if len(found) > 0:
            indexer = columns.get_indexer_for(found)
            if is_scalar(item):
                item = self._frame.create_column_from_scalar(item)
                item = item.broadcast(len(indexer))
            else:
                item = item._frame

            self._frame.update_columns(indexer, item)

        else:
            if is_scalar(item):
                for _ in range(len(fresh)):
                    idx = self._frame.num_columns()
                    self._frame = self._frame.insert(idx, item)
            else:
                item = DataFrame(frame=item._frame, columns=fresh)
                self._frame = self._frame.concat(1, item._frame)
            self._replace_columns(columns.append(pandas.Index(fresh)))
Code Example #12
File: reduction.py  Project: nv-legate/legate.pandas
def convert_agg_func(agg_func):
    if isinstance(agg_func, str):
        if agg_func not in _SUPPORTED_AGGS:
            raise err._unsupported_error(
                f"Unsupported aggregation method: {agg_func}")
        return (agg_func, _NUMERIC_ONLY[agg_func])
    elif is_dict_like(agg_func):
        converted = {}
        for col, func in agg_func.items():
            funcs = util.to_list_if_scalar(convert_agg_func(func))
            converted[col] = funcs
        return converted
    elif is_list_like(agg_func):
        return [convert_agg_func(func) for func in agg_func]
    else:
        raise err._unsupported_error(
            f"Unsupported aggregation descriptor: {agg_func}")
Code Example #13
File: table.py  Project: nv-legate/legate.pandas
    def slice_rows_by_slice(self, sl, is_loc=True, bounds=None):
        if bounds is None:
            bounds = self._index.find_bounds(sl.start, sl.stop, is_loc)

        rt = self._runtime
        storage = rt.create_output_storage()

        inputs = self._columns.copy()
        if self._index.materialized:
            inputs += util.to_list_if_scalar(self._index.column)

        outputs = [storage.create_similar_column(input) for input in inputs]

        if len(outputs) > 0:
            plan = Map(rt, OpCode.SLICE_BY_RANGE)

            plan.add_future(bounds)
            plan.add_scalar_arg(len(inputs), ty.uint32)
            plan.add_future(self._index.volume)
            for input, output in zip(inputs, outputs):
                input.add_to_plan(plan, True)
                output.add_to_plan_output_only(plan)

            counts = plan.execute(inputs[0].launch_domain)

            storage = plan.promote_output_storage(storage)
            self._runtime.register_external_weighted_partition(
                storage.default_ipart, counts
            )

            volume = counts.cast(ty.int64).sum()
            if self._index.materialized:
                result_index = create_index_from_columns(
                    outputs[len(self._columns) :], volume, self._index.names
                )
            else:
                result_index = self._index.slice_by_bounds(bounds, storage)

            return self.replace_columns(
                outputs[: len(self._columns)], index=result_index
            )
        else:
            result_index = self._index.slice_by_bounds(bounds)
            return self.replace_columns([], index=result_index)

Code Example #14
File: dataframe.py  Project: nv-legate/legate.pandas
    def _get_columns_by_labels(self, key):
        key_scalar = is_scalar(key) or isinstance(key, tuple)
        keys = util.to_list_if_scalar(key)
        columns = self.columns

        # Validate keys
        for key in keys:
            if key not in columns:
                raise KeyError(key)

        indexer = columns.get_indexer_for(keys)
        new_self = self._slice_columns(indexer)
        if key_scalar:
            assert len(new_self.columns) == 1
            return new_self.squeeze(axis=1)
        else:
            return new_self
Code Example #15
File: dataframe.py  Project: nv-legate/legate.pandas
    def reset_index(self,
                    level=None,
                    drop=False,
                    inplace=False,
                    col_level=0,
                    col_fill=""):
        if inplace not in (
                True,
                False,
        ):
            raise err._invalid_value_error("inplace", inplace)
        if drop not in (
                True,
                False,
        ):
            raise err._invalid_value_error("drop", drop)

        if level is None:
            levels = list(range(self._raw_index.nlevels))
        else:
            levels = util.to_list_if_scalar(level)
            levels = [self._raw_index._get_level_number(lvl) for lvl in levels]
        # Pandas seems to ignore the order in which the levels are specified
        # but rather sorts them
        levels = sorted(levels)

        frame = self._frame.reset_index(levels, drop)
        columns = self.columns
        # FIXME: For now we will ignore the corner case where a column
        #        named index or level_0 already exists.
        if not drop:
            names = self._raw_index._get_level_names(levels)

            lev_num = columns._get_level_number(col_level)
            if isinstance(columns, pandas.MultiIndex):
                arrays = [[col_fill] * len(names)] * columns.nlevels
                arrays[lev_num] = names
                names = pandas.MultiIndex.from_arrays(arrays)
            else:
                names = pandas.Index(names)

            columns = names.append(columns)

        return self._create_or_update_frame(frame, inplace, columns=columns)
Code Example #16
    def sort_values(
        self,
        by,
        axis=0,
        ascending=True,
        inplace: bool = False,
        kind="quicksort",
        na_position="last",
        ignore_index: bool = False,
    ):
        axis = self._get_axis_number(axis)
        if axis not in (0, ):
            raise err._unsupported_error("axis", axis)

        if na_position not in (
                "first",
                "last",
        ):
            raise err._invalid_value_error("na_position", na_position)

        by = util.to_list_if_scalar(by)
        ascending = self._get_ascending(ascending, len(by))
        if len(by) != len(ascending):
            raise ValueError(f"Length of ascending ({len(ascending)}) != "
                             f"length of by ({len(by)})")

        idxr = self.columns.get_indexer_for(by)
        if len(idxr) != len(by):
            for key in by:
                if len(self.columns.get_indexer_for([key])) > 1:
                    raise ValueError(
                        f"The column label '{key}' is not unique.")

        new_frame = self._frame.sort_values(
            idxr,
            axis,
            ascending,
            kind,
            na_position,
            ignore_index,
        )
        return self._create_or_update_frame(new_frame, inplace)
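As in pandas, the length of ascending must match the number of sort keys; illustrated with plain pandas on made-up data:

import pandas as pd

df = pd.DataFrame({"a": [2, 1, 2], "b": [3, 2, 1]})

# Per-key sort directions.
print(df.sort_values(by=["a", "b"], ascending=[True, False]))

# A mismatched length raises, like the check in the snippet above.
try:
    df.sort_values(by=["a", "b"], ascending=[True])
except ValueError as e:
    print(e)  # Length of ascending (1) != length of by (2)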
Code Example #17
File: dataframe.py  Project: nv-legate/legate.pandas
    def drop_duplicates(
        self,
        subset=None,
        keep="first",
        inplace=False,
        ignore_index=False,
    ):
        if subset is None:
            subset = list(range(len(self.columns)))
        else:
            subset = util.to_list_if_scalar(subset)
            idxr = self.columns.get_indexer_for(subset)
            mask = idxr == -1
            if mask.any():
                raise KeyError(list(np.compress(mask, subset)))
            subset = idxr

        if keep not in ("first", "last", False):
            raise ValueError("keep must be either 'first', 'last' or False")

        frame = self._frame.drop_duplicates(subset, keep, ignore_index)
        return self._create_or_update_frame(frame, inplace)
Code Example #18
    def to_csv(
        self,
        path_or_buf=None,
        sep=",",
        na_rep="",
        columns=None,
        header=True,
        index=True,
        line_terminator=None,
        chunksize=None,
        partition=False,
    ):
        if not isinstance(path_or_buf, str):
            raise err._unsupported_error("path must be a string for now")

        if len(sep) != 1:
            raise err._unsupported_error("separator must be a character")

        line_terminator = (os.linesep
                           if line_terminator is None else line_terminator)

        # The default chunk size is 8
        chunksize = 8 if chunksize is None else chunksize

        new_self = self
        if columns is not None:
            new_self = self[util.to_list_if_scalar(columns)]

        new_self._frame.to_csv(
            path=path_or_buf,
            sep=sep,
            na_rep=na_rep,
            header=header,
            index=index,
            line_terminator=line_terminator,
            chunksize=chunksize,
            partition=partition,
            column_names=new_self.columns.to_list(),
        )
Code Example #19
File: series.py  Project: nv-legate/legate.pandas
    def reset_index(self, level=None, drop=False, name=None, inplace=False):
        if inplace not in (
                True,
                False,
        ):
            raise err._invalid_value_error("inplace", inplace)
        if drop not in (
                True,
                False,
        ):
            raise err._invalid_value_error("drop", drop)

        if level is None:
            levels = list(range(self._raw_index.nlevels))
        else:
            levels = util.to_list_if_scalar(level)
            levels = [self._raw_index._get_level_number(lvl) for lvl in levels]
        # Pandas seems to ignore the order in which the levels are specified
        # but rather sorts them
        levels = sorted(levels)

        frame = self._frame.reset_index(levels, drop)
        if inplace and len(frame._columns) > 1:
            raise TypeError(
                "Cannot reset_index inplace on a Series to create a DataFrame")

        if drop:
            return self._create_or_update_frame(frame, inplace)

        if name is None:
            name = 0 if self.name is None else self.name
        names = self._raw_index._get_level_names(levels) + [name]
        columns = pandas.Index(names)

        from .dataframe import DataFrame

        return DataFrame(columns=columns, frame=frame)
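In pandas terms (made-up data), resetting a Series' index with drop=False produces a DataFrame, which is why the in-place path above has to be rejected:

import pandas as pd

s = pd.Series([1, 2], index=pd.Index(["x", "y"], name="k"), name="v")

# drop=False turns the index into a column and yields a DataFrame.
print(s.reset_index())

# Doing the same in place raises TypeError, as handled above.
try:
    s.reset_index(inplace=True)
except TypeError as e:
    print(e)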
Code Example #20
File: table.py  Project: nv-legate/legate.pandas
    def to_parquet(
        self,
        path,
        column_names,
        engine="auto",
        compression="snappy",
        index=None,
        partition_cols=None,
        **kwargs,
    ):
        token = self._create_directory(path)

        def _generate_pandas_metadata(
            table, column_names, index, materialized
        ):
            pandas_schema = table.to_pandas(schema_only=True)
            pandas_schema.columns = column_names

            index_descs = []
            if index is not False:
                if index is None and not materialized:
                    index_descs = [
                        {
                            "kind": "range",
                            "name": table._index.name,
                            "start": table._index.start,
                            "stop": table._index.stop,
                            "step": table._index.step,
                        }
                    ]
                else:
                    index_descs = [
                        f"__index_level_{level}__" if name is None else name
                        for level, name in enumerate(
                            util.to_list_if_scalar(table._index.name)
                        )
                    ]
                    column_names = index_descs + column_names

            if isinstance(pandas_schema.index, pandas.MultiIndex):
                index_levels = pandas_schema.index.levels
            else:
                index_levels = util.to_list_if_scalar(pandas_schema.index)

            from pyarrow import pandas_compat

            metadata = pandas_compat.construct_metadata(
                pandas_schema,
                column_names,
                index_levels,
                index_descs,
                index is not False,
                [col.dtype.to_arrow() for col in table._columns],
            )
            return metadata[str.encode("pandas")].decode(), index_descs

        materialized = self._index.materialized
        metadata, index_descs = _generate_pandas_metadata(
            self, column_names, index, materialized
        )

        columns = self._columns
        if index or (index is not False and materialized):
            columns = util.to_list_if_scalar(self._index.column) + columns
            column_names = index_descs + column_names
        assert len(columns) == len(column_names)

        compression = self._runtime.get_compression_type(compression)

        plan = Map(self._runtime, OpCode.TO_PARQUET)
        num_pieces = self._columns[0].num_pieces

        plan.add_future(token)
        plan.add_scalar_arg(num_pieces, ty.uint32)
        plan.add_scalar_arg(compression, ty.uint32)
        plan.add_scalar_arg(path, ty.string)
        plan.add_scalar_arg(metadata, ty.string)
        plan.add_scalar_arg(len(columns), ty.uint32)
        for column_name in column_names:
            plan.add_scalar_arg(column_name, ty.string)
        for column in columns:
            column.add_to_plan(plan, True)

        fm = plan.execute(self._columns[0].launch_domain)
        # TODO: Once we move the metadata generation to a Python task,
        #       we can avoid blocking here and instead chain the task
        #       to it.
        fm.wait()

        # TODO: We will move this post-processing to a Python task and
        #       get rid of the use of shard id here.
        if self._runtime._this_is_first_node():
            import pyarrow.parquet as pq

            metadata = None
            num_digits = int(log10(num_pieces)) + 1
            for idx in range(num_pieces):
                part = f"part%0{num_digits}d.parquet" % idx
                md = pq.ParquetFile(os.path.sep.join([path, part])).metadata
                md.set_file_path(part)
                if metadata is None:
                    metadata = md
                else:
                    metadata.append_row_groups(md)
            metadata.write_metadata_file(os.path.sep.join([path, "_metadata"]))
Code Example #21
File: table.py  Project: nv-legate/legate.pandas
    def concat(self, axis, others, **kwargs):
        others = util.to_list_if_scalar(others)
        if axis == 1:
            columns = self._columns.copy()
            for other in others:
                columns.extend(other._columns)
            if len(self._columns) == 0:
                return Table(self._runtime, others[0]._index, columns)
            else:
                return Table(
                    self._runtime,
                    self._index,
                    columns,
                )
        else:
            assert axis == 0

            dfs = [self] + others
            num_dfs = len(dfs)
            result_storage = self._runtime.create_output_storage()
            partition_keys = self.partition_keys

            # FIXME: Here we assume that the dataframes have the same
            #        set of columns. When an input dataframe does not
            #        have a column that any of the other inputs do,
            #        it is implicitly extended with a column of nulls
            #        while being concatenated.

            index_dtypes = util.to_list_if_scalar(self._index.dtype)
            value_dtypes = util.get_dtypes(self._columns)

            num_levels = len(index_dtypes)
            num_values = len(value_dtypes)

            all_index_columns = []
            all_value_columns = []

            # Here the access to the internal _column member
            # of self's index is intentional, as we want to
            # avoid materializing the index unnecessarily.
            num_pieces = self._index._column.num_pieces
            for df in dfs:
                index_columns = util.to_list_if_scalar(df._index.column)
                all_index_columns.append(
                    [
                        column.repartition(num_pieces)
                        for column in index_columns
                    ]
                )
                all_value_columns.append(
                    [
                        df._columns[i].repartition(num_pieces)
                        for i in range(num_values)
                    ]
                )
                partition_keys = self.join_partition_keys(
                    partition_keys, df.partition_keys
                )

            nullable_index = [
                any(columns[i].nullable for columns in all_index_columns)
                for i in range(num_levels)
            ]
            nullable_value = [
                any(columns[i].nullable for columns in all_value_columns)
                for i in range(num_values)
            ]

            result_index_columns = result_storage.create_columns(
                index_dtypes, nullable=nullable_index
            )
            result_value_columns = result_storage.create_columns(
                value_dtypes, nullable=nullable_value
            )

            plan = Map(self._runtime, OpCode.CONCATENATE)

            plan.add_scalar_arg(num_levels + num_values, ty.uint32)
            for column in result_index_columns:
                column.add_to_plan_output_only(plan)
            for column in result_value_columns:
                column.add_to_plan_output_only(plan)
            plan.add_scalar_arg(num_dfs, ty.uint32)
            for i in range(num_dfs):
                for column in all_index_columns[i]:
                    column.add_to_plan(plan, True)
                for column in all_value_columns[i]:
                    column.add_to_plan(plan, True)

            launch_domain = self._index._column.launch_domain
            counts = plan.execute(launch_domain)

            result_storage = plan.promote_output_storage(result_storage)
            self._runtime.register_external_weighted_partition(
                result_storage.default_ipart, counts
            )
            del plan

            index_names = util.to_list_if_scalar(self._index.name)

            total_count = counts.cast(ty.int64).sum()
            result_index = create_index_from_columns(
                result_index_columns, total_count, index_names
            )

            result = Table(
                self._runtime,
                result_index,
                result_value_columns,
            )
            result.set_partition_keys(partition_keys)
            return result
Code Example #22
File: dataframe.py  Project: nv-legate/legate.pandas
 def _shuffle(self, keys):
     keys = util.to_list_if_scalar(keys)
     idxr = self.columns.get_indexer_for(keys)
     return self._create_or_update_frame(self._frame._shuffle(idxr), False)
Code Example #23
File: table.py  Project: nv-legate/legate.pandas
    def dropna(self, axis, idxr, thresh):
        assert axis == 0
        assert idxr is not None

        result_storage = self._runtime.create_output_storage()

        result_columns = []
        result_index_columns = []

        plan = Map(self._runtime, OpCode.DROPNA)

        plan.add_scalar_arg(thresh, ty.uint32)

        plan.add_scalar_arg(len(idxr), ty.uint32)
        for idx in idxr:
            plan.add_scalar_arg(idx, ty.int32)

        num_columns = len(self._columns)
        plan.add_scalar_arg(num_columns, ty.uint32)
        for i in range(num_columns):
            input = self._columns[i]

            output = result_storage.create_similar_column(input)
            result_columns.append(output)

            input.add_to_plan(plan, True)
            output.add_to_plan_output_only(plan)

        index_dtypes = util.to_list_if_scalar(self._index.dtype)
        plan.add_scalar_arg(len(index_dtypes), ty.uint32)

        input_index_materialized = self._index.materialized
        plan.add_scalar_arg(input_index_materialized, ty.bool)

        if input_index_materialized:
            input_index_columns = util.to_list_if_scalar(self._index.column)
            for input, index_dtype in zip(input_index_columns, index_dtypes):
                output = result_storage.create_column(
                    index_dtype, nullable=input.nullable
                )
                result_index_columns.append(output)

                input.add_to_plan(plan, True)
                output.add_to_plan_output_only(plan)
        else:
            plan.add_future(self._index._start)
            plan.add_future(self._index._step)
            for index_dtype in index_dtypes:
                output = result_storage.create_column(
                    index_dtype, nullable=False
                )
                output.add_to_plan_output_only(plan)
                result_index_columns.append(output)

        counts = plan.execute(self._columns[0].launch_domain)
        volume = counts.cast(ty.int64).sum()

        result_storage = plan.promote_output_storage(result_storage)
        self._runtime.register_external_weighted_partition(
            result_storage.default_ipart, counts
        )
        del plan

        result_index = create_index_from_columns(
            result_index_columns, volume, self._index.names
        )

        return self.replace_columns(result_columns, index=result_index)
Code Example #24
def read_parquet(path, columns, **kwargs):
    from legate.core import Rect

    from .runtime import _runtime as rt

    path = util.to_list_if_scalar(path)

    if len(path) == 1 and os.path.isdir(path[0]):
        from pyarrow.parquet import ParquetDataset

        ds = ParquetDataset(path)
        path = [piece.path for piece in ds.pieces]
    else:
        from pyarrow.parquet import ParquetFile

        ds = ParquetFile(path[0])
        if rt.debug:
            assert all(ParquetFile(p).schema == ds.schema for p in path)

    dedup_names = set()
    for name in ds.schema.names:
        if name in dedup_names:
            raise ValueError(
                "Duplicate column names in schema are not supported.")
        dedup_names.add(name)

    schema = ds.schema.to_arrow_schema()
    index_descs = []
    index_materialized = False
    if str.encode("pandas") in ds.metadata.metadata:
        import json

        pandas_metadata = json.loads(
            ds.metadata.metadata[str.encode("pandas")])
        index_descs = pandas_metadata["index_columns"]
        index_materialized = len(index_descs) > 0 and all(
            isinstance(desc, str) for desc in index_descs)

    if columns is None:
        column_names = schema.names
    elif index_materialized:
        column_names = columns + index_descs
    else:
        column_names = columns

    for name in column_names:
        if name not in dedup_names:
            raise ValueError("Field named %s not found in the schema." % name)
    schema = [schema.field(name) for name in column_names]
    del columns

    storage = rt.create_output_storage()
    offsets_storage = None

    columns = []
    for column_info in schema:
        dtype = ty.to_legate_dtype(column_info.type)
        column = storage.create_column(dtype)
        if ty.is_string_dtype(dtype):
            if offsets_storage is None:
                offsets_storage = rt.create_output_storage()
            offsets_column = offsets_storage.create_column(ty.int32,
                                                           nullable=False)
            chars_storage = rt.create_output_storage()
            char_column = chars_storage.create_column(ty.int8, nullable=False)
            column.add_child(offsets_column)
            column.add_child(char_column)
            column = column.as_string_column()
        columns.append(column)

    plan = Map(rt, OpCode.READ_PARQUET)
    plan.add_scalar_arg(len(path), ty.uint32)
    for f in path:
        plan.add_scalar_arg(f, ty.string)
    plan.add_scalar_arg(len(column_names), ty.uint32)
    for name in column_names:
        plan.add_scalar_arg(name, ty.string)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for column in columns:
        column.add_to_plan_output_only(plan)
    counts = plan.execute(Rect([rt.num_pieces]))
    storage = plan.promote_output_storage(storage)
    rt.register_external_weighted_partition(storage.default_ipart, counts)
    del plan

    size = counts.cast(ty.int64).sum()

    if index_materialized:
        to_filter = set(index_descs)

        index_columns = []
        value_columns = []
        value_column_names = []
        for idx, name in enumerate(column_names):
            if name in to_filter:
                index_columns.append(columns[idx])
            else:
                value_columns.append(columns[idx])
                value_column_names.append(column_names[idx])

        sanitized_names = [
            None if name == f"__index_level_{level}__" else name
            for level, name in enumerate(index_descs)
        ]
        index = create_index_from_columns(index_columns, size, sanitized_names)
    else:
        value_columns = columns
        value_column_names = column_names
        if len(index_descs) > 0:
            assert len(index_descs) == 1
            index_desc = index_descs[0]
            name = index_desc["name"]
            start = rt.create_future(index_desc["start"], ty.int64)
            stop = rt.create_future(index_desc["stop"], ty.int64)
            step = rt.create_future(index_desc["step"], ty.int64)
            index = create_range_index(storage, size, name, start, stop, step)
        else:
            index = create_range_index(storage, size)

    from pandas import Index

    return {
        "frame": Table(rt, index, value_columns),
        "columns": Index(value_column_names),
    }
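A usage sketch for the index round trip described above, written against plain pandas (which legate.pandas aims to mirror); the file path and data are made up:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
df.to_parquet("/tmp/example.parquet", index=True)

# Reading back a column subset; the index is reconstructed either from a
# materialized "__index_level_0__" column or from RangeIndex metadata,
# as in the snippet above.
out = pd.read_parquet("/tmp/example.parquet", columns=["a"])
assert list(out.columns) == ["a"]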
Code Example #25
def read_csv(
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    prefix=None,
    mangle_dupe_cols=True,
    dtype=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    na_values=None,
    skip_blank_lines=True,
    parse_dates=False,
    compression="infer",
    quotechar='"',
    quoting=0,
    doublequote=True,
    verify_header=False,
    **kwargs,
    # TODO: Put back these options once we figure out how to support them
    #       with the Arrow CSV reader.
    # skipinitialspace=False,  # GPU only
    # keep_default_na=True,  # GPU only
    # na_filter=True,  # GPU only
    # dayfirst=False, # GPU only
    # thousands=None,  # GPU only
    # decimal=".",  # GPU only
    # lineterminator=None, # GPU only
    # comment=None,  # GPU only
    # delim_whitespace=False,  # GPU only
):

    # Checks on filepath_or_buffer
    paths = util.to_list_if_scalar(filepath_or_buffer)

    if any(not isinstance(path, str) for path in paths):
        raise err._unsupported_error(
            "'filepath_or_buffer' must be a string or a list of strings")
    if len(paths) == 0:
        raise ValueError("'filepath_or_buffer' must be a non-empty list")

    for path in paths:
        if not os.path.exists(path):
            raise ValueError(f"{path} does not exist")

    if not isinstance(compression, str):
        raise err._unsupported_error("compression", compression)
    compressions = [
        _parse_compression(infer_compression(path, compression))
        for path in paths
    ]

    # Checks on sep and delimiter
    if sep is None and delimiter is None:
        raise ValueError("at least one of 'sep' or 'delimiter' must be given")
    sep = delimiter if delimiter is not None else sep
    if len(sep) > 1:
        raise ValueError("'sep' must be a 1-character string")

    # Checks on header and names
    if header == "infer":
        header = 0 if names is None else None

    if header not in (
            0,
            None,
    ):
        raise err._unsupported_error("header", header)

    # Checks on skiprows, skipfooter, and nrows
    skiprows = 0 if skiprows is None else skiprows
    if not is_integer(skiprows):
        raise ValueError("'skiprows' must be an integer")
    if not is_integer(skipfooter):
        raise ValueError("'skipfooter' must be an integer")
    if not (nrows is None or is_integer(nrows)):
        raise ValueError("'nrows' must be None or an integer")

    # If either column names or dtype is missing, infer them by parsing
    # the first few lines using Pandas
    # FIXME: We should use cuDF for this
    if names is None or dtype is None:
        engine = ("python" if skipfooter > 0 else "c", )
        column_names, dtypes = _extract_header_using_pandas(
            paths[0],
            sep,
            header,
            names,
            dtype,
            true_values,
            false_values,
            skiprows,
            na_values,
            skip_blank_lines,
            parse_dates,
            compression,
            quotechar,
            quoting,
            doublequote,
            engine,
            peek_rows=3,
        )
        if verify_header:
            for path in paths[1:]:
                result = _extract_header_using_pandas(
                    path,
                    sep,
                    header,
                    names,
                    dtype,
                    true_values,
                    false_values,
                    skiprows,
                    na_values,
                    skip_blank_lines,
                    parse_dates,
                    compression,
                    quotechar,
                    quoting,
                    doublequote,
                    engine,
                    peek_rows=3,
                )
                if not column_names.equals(result[0]):
                    raise ValueError(
                        f"{paths[0]} and {path} have different headers")

    else:
        column_names = pandas.Index(names)

        if is_dict_like(dtype):
            dtypes = []
            for name in names:
                if name not in dtype:
                    raise ValueError(f"'dtype' has no entry for '{name}'")
                dtypes.append(_ensure_dtype(dtype[name]))
        elif is_list_like(dtype):
            raise err._unsupported_error(
                "'dtype' must be a string, a dtype, or a dictionary")
        else:
            dtype = _ensure_dtype(dtype)
            dtypes = [dtype] * len(names)

    if column_names.has_duplicates:
        raise ValueError("Header must not have any duplicates")

    # Checks on unsupported options
    if prefix is not None:
        raise err._unsupported_error("prefix", prefix)
    if mangle_dupe_cols not in (True, ):
        raise err._unsupported_error("mangle_dupe_cols", mangle_dupe_cols)

    # If there was a header in the file, we should skip that line as well
    if header == 0:
        skiprows += 1

    # Checks on parse_dates
    _ERR_MSG_PARSE_DATES = (
        "'parse_dates' must be a list of integers or strings for now")

    if is_dict_like(parse_dates):
        raise err._unsupported_error(_ERR_MSG_PARSE_DATES)

    parse_dates = parse_dates if parse_dates is not False else []
    if not is_list_like(parse_dates):
        raise err._unsupported_error(_ERR_MSG_PARSE_DATES)

    date_cols = _get_indexer(column_names, parse_dates, "parse_dates")

    # Override dtypes for the datetime columns
    for idx in date_cols:
        dtypes[idx] = ty.ts_ns

    # If a column is given a datetime dtype but was not added to parse_dates,
    # we should still record it as a datetime column
    for idx, dtype in enumerate(dtypes):
        if dtype == ty.ts_ns and idx not in date_cols:
            date_cols.append(idx)

    # Checks on quoting
    if quoting != 0:
        raise err._unsupported_error("quoting", quoting)
    if len(quotechar) > 1:
        raise ValueError("'quotechar' must be a 1-character string")

    # Checks on index_col
    index_col = None if index_col is False else index_col
    if index_col is not None:
        if is_integer(index_col) or isinstance(index_col, str):
            index_col = [index_col]
        if not is_list_like(index_col):
            raise err._unsupported_error("index_col", index_col)
        index_col = _get_indexer(column_names, index_col, "index_col")

    # Checks on true_values, false_values, and na_values
    _check_string_list(true_values, "true_values")
    _check_string_list(false_values, "false_values")
    _check_string_list(na_values, "na_values")

    # Checks on nrows
    if skipfooter != 0 and nrows is not None:
        raise ValueError("'skipfooter' not supported with 'nrows'")

    df = DataFrame(
        frame=io.read_csv(
            paths,
            sep=sep,
            usecols=usecols,
            dtypes=dtypes,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            skipfooter=skipfooter,
            nrows=nrows,
            na_values=na_values,
            skip_blank_lines=skip_blank_lines,
            date_cols=date_cols,
            compressions=compressions,
            quotechar=quotechar,
            quoting=quoting,
            doublequote=doublequote,
        ),
        columns=column_names,
    )

    if index_col is not None:
        df = df.set_index(column_names[index_col])
        # Make sure we reset the names for unnamed indices
        names = df._raw_index.names
        names = [
            None if name.startswith("Unnamed") else name for name in names
        ]
        df._raw_index.names = names

    return df
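End to end, the checks above mirror pandas.read_csv; a minimal usage sketch against plain pandas with made-up file contents:

import io

import pandas as pd

csv_text = "k,a,b\nx,1,4\ny,2,5\nz,3,6\n"

# Header row inferred, one column promoted to the index, explicit dtypes.
df = pd.read_csv(
    io.StringIO(csv_text),
    sep=",",
    index_col="k",
    dtype={"a": "int64", "b": "int64"},
)
assert list(df.columns) == ["a", "b"] and list(df.index) == ["x", "y", "z"]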
Code Example #26
File: groupby.py  Project: nv-legate/legate.pandas
    def __init__(self, df, by, axis, level, as_index, sort, method,
                 is_series_groupby):
        axis = df._get_axis_number(axis)
        if axis not in (0, ):
            raise err._unsupported_error("axis", axis)

        if by is None and level is None:
            raise TypeError("You have to supply one of 'by' and 'level'")

        self._df = df
        self._axis = axis
        self._as_index = as_index
        self._sort = sort
        self._method = method
        self._is_series_groupby = is_series_groupby

        if level is not None:
            levels = util.to_list_if_scalar(level)
            self._keys = [
                df._raw_index._get_level_number(lvl) for lvl in levels
            ]

            # Reset the levels chosen as the groupby keys so that they
            # appear in the frame
            self._df = self._df.reset_index(self._keys)

            # The pushed-out index levels are now the first few columns
            # in the frame, so we should change the key indices to pick
            # them correctly as the groupby keys later

            # A technical note: reset_index internally sorts level
            # numbers before it pushes out the corresponding levels
            # to the dataframe. Therefore, we use argsort to compute
            # the positions of the columns that we later pick for indices.
            self._keys = [
                p[0] for p in sorted(enumerate(self._keys), key=lambda p: p[1])
            ]
            self._levels = self._keys

        else:
            if df._is_series:
                raise err._unsupported_error(
                    f"{type(self._df).__name__} only supports level")

            keys = util.to_list_if_scalar(by)
            if all(not isinstance(key, str) for key in keys):
                raise err._unsupported_error(
                    "groupby keys must be column names for now")

            idxr = []
            columns = df._get_columns()
            for key in keys:
                idx = columns.get_indexer_for([key])
                if len(idx) > 1:
                    raise KeyError(f"ambiguous key name {key}")
                if idx[0] == -1:
                    raise KeyError(key)
                idxr.extend(idx)

            self._keys = idxr
            self._levels = []
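Grouping by index levels versus by column names, the two paths distinguished above, in plain pandas with made-up data:

import pandas as pd

df = pd.DataFrame(
    {"v": [1, 2, 3, 4]},
    index=pd.MultiIndex.from_product([["a", "b"], [0, 1]], names=["k", "i"]),
)

# Level-based grouping (the reset_index path in the snippet above).
print(df.groupby(level="k").sum())

# Column-name-based grouping (the get_indexer_for path).
print(df.reset_index().groupby("k")["v"].sum())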
Code Example #27
File: table.py  Project: nv-legate/legate.pandas
    def select(self, mask):
        if isinstance(mask, Table):
            assert len(mask._columns) == 1
            mask = mask._columns[0]

        if self._runtime.debug:
            assert isinstance(mask, Column)
            assert mask.dtype == ty.bool

        result_storage = self._runtime.create_output_storage()

        result_columns = []
        result_index_columns = []

        plan_compact = Map(self._runtime, OpCode.COMPACT)

        mask.add_to_plan(plan_compact, True)

        num_columns = len(self._columns)
        plan_compact.add_scalar_arg(num_columns, ty.uint32)
        for i in range(num_columns):
            input = self._columns[i]

            output = result_storage.create_similar_column(input)
            result_columns.append(output)

            input.add_to_plan(plan_compact, True)
            output.add_to_plan_output_only(plan_compact)

        index_dtypes = util.to_list_if_scalar(self._index.dtype)
        plan_compact.add_scalar_arg(len(index_dtypes), ty.uint32)

        input_index_materialized = self._index.materialized
        plan_compact.add_scalar_arg(input_index_materialized, ty.bool)

        if input_index_materialized:
            input_index_columns = util.to_list_if_scalar(self._index.column)
            for input, index_dtype in zip(input_index_columns, index_dtypes):
                output = result_storage.create_column(
                    index_dtype, nullable=input.nullable
                )
                result_index_columns.append(output)

                input.add_to_plan(plan_compact, True)
                output.add_to_plan_output_only(plan_compact)
        else:
            plan_compact.add_future(self._index._start)
            plan_compact.add_future(self._index._step)
            for index_dtype in index_dtypes:
                output = result_storage.create_column(
                    index_dtype, nullable=False
                )
                output.add_to_plan_output_only(plan_compact)
                result_index_columns.append(output)

        counts = plan_compact.execute(mask.launch_domain)
        volume = counts.cast(ty.int64).sum()

        result_storage = plan_compact.promote_output_storage(result_storage)
        self._runtime.register_external_weighted_partition(
            result_storage.default_ipart, counts
        )
        del plan_compact

        result_index = create_index_from_columns(
            result_index_columns, volume, self._index.names
        )

        return self.replace_columns(result_columns, index=result_index)
Code Example #28
    def drop(
        self,
        labels=None,
        axis=0,
        index=None,
        columns=None,
        level=None,
        inplace=False,
        errors="raise",
    ):
        # If 'labels' is set, we use 'axis' to determine the lookup axis
        if labels is not None:
            if index is not None or columns is not None:
                raise ValueError(
                    "Cannot specify both 'labels' and 'index'/'columns'")
            axis = self._get_axis_number(axis)

            if axis == 0:
                row_labels = util.to_list_if_scalar(labels)
                row_level = level
                col_labels = []
                col_level = None
            else:
                row_labels = []
                row_level = None
                col_labels = util.to_list_if_scalar(labels)
                col_level = level

        # Otherwise, we use 'columns' and 'index' as lookup labels
        else:
            col_labels = []
            col_level = None
            row_labels = []
            row_level = None
            if not self._is_series and columns is not None:
                col_labels = util.to_list_if_scalar(columns)
                col_level = level
            if index is not None:
                row_labels = util.to_list_if_scalar(index)
                row_level = level

        def _validate_labels(index, labels, level, membership=True):
            for label in labels:
                if not util.is_tuple(label):
                    continue
                if len(label) > index.nlevels:
                    raise KeyError(f"Key length ({len(label)}) exceeds "
                                   f"index depth ({index.nlevels})")

            if not membership:
                return

            if level is not None:
                level = index._get_level_number(level)
                index = index.get_level_values(level)

            for label in labels:
                if label not in index:
                    raise KeyError(label)

        new_self = self.copy(deep=False)

        # Drop columns first as that's easier
        if len(col_labels) > 0:
            assert not new_self._is_series
            _validate_labels(new_self.columns, col_labels, col_level)
            columns = new_self.columns.drop(col_labels, level)
            idxr = new_self.columns.get_indexer_for(columns)
            new_self = new_self._slice_columns(idxr)

        # Then drop rows using selection
        if len(row_labels) > 0:
            _validate_labels(new_self._raw_index, row_labels, row_level, False)

            if len(row_labels) > 1:
                raise err._unsupported_error("Label must be a scalar for now")
            row_label = row_labels[0]

            if level is not None and not is_scalar(row_label):
                raise ValueError("label must be a scalar when 'level' is set")

            if util.is_tuple(row_label) and len(row_label) == 0:
                raise ValueError("label must not be empty")

            mask = new_self._raw_index._get_drop_mask_for(row_label, level)
            new_frame = new_self._frame.select(mask)
            new_self._frame = new_frame

        if inplace:
            if self._is_series:
                self._update_frame(new_self._frame)
            else:
                self._update_frame(new_self._frame, columns=new_self.columns)

        else:
            return new_self
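The two lookup paths above ('labels' plus 'axis' versus explicit 'index'/'columns') follow pandas; with made-up data:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["r0", "r1"])

# Equivalent ways of dropping a column.
assert df.drop("b", axis=1).equals(df.drop(columns="b"))

# Dropping a row label; a missing label raises KeyError unless errors="ignore".
print(df.drop(index="r0"))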