Example #1
    def _create_index_space(self, rect):
        if not isinstance(rect, PandasFuture):
            if not isinstance(rect, Rect):
                rect = Rect([rect])
            handle = legion.legion_index_space_create_domain(
                self._runtime, self._context, rect.raw())
        else:
            domain = self.launch_future_task(OpCode.LIFT_TO_DOMAIN, rect)
            handle = legion.legion_index_space_create_future(
                self._runtime, self._context, 1, domain.handle, 0)
        return IndexSpace(self._context, self._runtime, handle=handle)
Example #2
def _drop_duplicates_one_step(runtime, inputs, subset, keep, radix=1):
    storage = runtime.create_output_storage()

    outputs = [storage.create_similar_column(column) for column in inputs]

    num_pieces = (inputs[0].num_pieces + radix - 1) // radix
    launch_domain = Rect([num_pieces])

    plan = Map(runtime, OpCode.DROP_DUPLICATES_TREE)
    plan.add_scalar_arg(keep.value, ty.int32)
    plan.add_scalar_arg(len(subset), ty.uint32)
    for idx in subset:
        plan.add_scalar_arg(idx, ty.int32)

    plan.add_scalar_arg(radix, ty.uint32)
    for r in range(radix):
        plan.add_scalar_arg(len(inputs), ty.uint32)
        proj_id = runtime.get_radix_functor_id(radix, r)
        for input in inputs:
            input.add_to_plan(plan, True, proj=proj_id)

    plan.add_scalar_arg(len(outputs), ty.uint32)
    for output in outputs:
        output.add_to_plan_output_only(plan)
    counts = plan.execute(launch_domain)

    storage = plan.promote_output_storage(storage)
    return (outputs, storage, counts, outputs[0].num_pieces)
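
A note on the radix arguments above: get_radix_functor_id(radix, r) selects a projection functor so that each point of the smaller launch domain reads the r-th of its radix children in the input partition. The exact child mapping is not spelled out here; the sketch below assumes the natural one (output piece p reads input pieces p*radix + r), purely for illustration.

# Hypothetical fan-in mapping, assuming output piece p reads input pieces
# p*radix + r (clamped to the number of input pieces); this is not the
# library's actual projection functor, just an illustration of the shape.
def radix_children(p, radix, num_input_pieces):
    return [p * radix + r
            for r in range(radix)
            if p * radix + r < num_input_pieces]

# With 10 input pieces and radix=4 there are (10 + 3) // 4 = 3 output pieces:
#   piece 0 -> [0, 1, 2, 3], piece 1 -> [4, 5, 6, 7], piece 2 -> [8, 9]
print([radix_children(p, 4, 10) for p in range(3)])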
Example #3
    def _construct_groupby_output(self):

        result = self._perform_reduction()

        if self._method == GroupbyVariantCode.HASH:
            # The input table is already partitioned so that chunks have
            # disjoint keys, so we only need a single round of reduction
            return result

        elif self._method == GroupbyVariantCode.TREE:
            # If we do tree-based reduction, we need to repeat reduction
            # rounds until we reach the root of the tree
            self._radix = self._runtime.radix
            while self._num_pieces > 1:
                (self._keys, self._values, total_count) = result

                self._num_pieces = (self._num_pieces + self._radix -
                                    1) // self._radix
                self._launch_domain = Rect([self._num_pieces])
                self._cspace = self._runtime.find_or_create_color_space(
                    self._num_pieces)
                result = self._perform_reduction()

            return result

        else:
            assert False
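
The loop above repeats the ceiling division num_pieces = (num_pieces + radix - 1) // radix until a single piece remains, so the number of rounds is the depth of a radix-ary reduction tree. A small standalone sketch of how the piece count shrinks per round:

# Piece count per round of the tree reduction: ceil(n / radix) each round,
# stopping once a single piece (the root of the tree) remains.
def reduction_rounds(num_pieces, radix):
    counts = [num_pieces]
    while num_pieces > 1:
        num_pieces = (num_pieces + radix - 1) // radix
        counts.append(num_pieces)
    return counts

# e.g. 100 initial pieces with radix 4 -> [100, 25, 7, 2, 1], four rounds
print(reduction_rounds(100, 4))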
Example #4
    def to_category_column_cpu(self, dtype):
        rt = self.runtime

        nullable = dtype is not None or self.nullable
        if dtype is None:
            # Local de-duplication
            storage = rt.create_output_storage()
            result_column = storage.create_similar_column(self, nullable=False)

            plan = Map(rt, OpCode.DROP_DUPLICATES_CATEGORIES)
            plan.add_scalar_arg(1, ty.uint32)
            result_column.add_to_plan_output_only(plan)
            self.add_to_plan(plan, True)
            plan.execute(self.launch_domain)
            del plan

            radix = rt.radix
            num_pieces = result_column.num_pieces
            while num_pieces > 1:
                # Global de-duplication
                num_pieces = (num_pieces + radix - 1) // radix
                local_dedup_column = result_column

                storage = rt.create_output_storage()
                result_column = storage.create_similar_column(self,
                                                              nullable=False)

                plan = Map(rt, OpCode.DROP_DUPLICATES_CATEGORIES)
                plan.add_scalar_arg(radix, ty.uint32)
                result_column.add_to_plan_output_only(plan)
                for r in range(radix):
                    proj_id = rt.get_radix_functor_id(radix, r)
                    local_dedup_column.add_to_plan(plan, True, proj=proj_id)
                launch_domain = Rect([num_pieces])
                plan.execute(launch_domain)
                del plan

            categories_column = result_column.as_replicated_column()
            dtype = ty.CategoricalDtype(categories_column)

        encode_result = self.storage.create_column(dtype,
                                                   ipart=self.primary_ipart,
                                                   nullable=nullable)
        encode_result.add_child(
            self.storage.create_column(
                ty.uint32,
                ipart=self.primary_ipart,
                nullable=False,
            ))

        plan = Map(rt, OpCode.ENCODE)
        encode_result.add_to_plan_output_only(plan)
        dtype.categories_column.add_to_plan(plan, True)
        self.add_to_plan(plan, True)
        plan.execute(self.launch_domain)
        del plan

        encode_result.add_child(dtype.categories_column)
        return encode_result.as_category_column()
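
The ENCODE step produces the same two pieces of state a pandas Categorical carries: a de-duplicated categories column plus a uint32 codes column whose entries index into it. A short pandas sketch of that relationship, purely as an analogy (not legate.pandas code):

import pandas as pd

# Categories plus integer codes per row; the codes index into the categories,
# which is what the uint32 child column added above represents.
cat = pd.Categorical(["b", "a", "b", "c", "a"])
print(list(cat.categories))  # ['a', 'b', 'c']
print(cat.codes.tolist())    # [1, 0, 1, 2, 0]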
Example #5
    def _shuffle_columns(self):
        (self._key_columns, self._sample_columns) = self._sample_keys()

        rt = self._runtime

        cspace = self._input_columns[0].cspace

        hist_ispace = rt.find_or_create_index_space(
            Rect([self._num_pieces, self._num_pieces]))
        hist_storage = rt.create_storage(hist_ispace)
        hist = hist_storage.create_new_field(ty.range64)
        hist_ipart = rt.create_row_partition(hist_ispace, cspace,
                                             self._num_pieces)

        # Build histogram using samples. Each point task
        # gets the whole set of samples and sorts them independently.
        plan = Map(rt, OpCode.BUILD_HISTOGRAM)

        plan.add_scalar_arg(self._num_pieces, ty.uint32)
        plan.add_scalar_arg(self._put_null_first, ty.bool)
        plan.add_scalar_arg(len(self._key_columns), ty.uint32)
        for asc in self._ascending:
            plan.add_scalar_arg(asc, ty.bool)
        # Need to broadcast the whole sample region
        samples = [sample.repartition(1) for sample in self._sample_columns]
        for column in samples:
            column.add_to_plan(plan, True, proj=None)
        for column in self._key_columns:
            column.add_to_plan(plan, True)

        plan.add_output(
            hist,
            Projection(hist_ipart),
            tag=PandasMappingTag.HISTOGRAM,
            flags=2,  # LEGION_NO_ACCESS_FLAG
        )

        plan.execute(self._launch_domain)
        del plan

        hist_ipart = rt.create_column_partition(hist_ispace, cspace,
                                                self._num_pieces)
        radix_ipart = rt.create_partition_by_image(
            self._input_columns[0].ispace,
            cspace,
            hist,
            hist_ipart,
            kind=legion.DISJOINT_COMPLETE_KIND,
            range=True,
        )

        # Change the primary partitions to shuffle the data
        input_columns = [
            column.all_to_ranges().clone() for column in self._input_columns
        ]
        for column in input_columns:
            column.set_primary_ipart(radix_ipart)
        input_columns = [column.all_to_offsets() for column in input_columns]
        return input_columns
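
Conceptually, BUILD_HISTOGRAM performs range partitioning from the broadcast samples: each point task sorts the samples, derives splitters, and records for every destination piece which of its local rows belong there. The single-key sketch below is an assumed simplification of that idea (counts instead of ranges, one key column, no nulls), not the task's actual implementation.

from bisect import bisect_right

# Hypothetical illustration: choose num_pieces - 1 splitters from the sorted
# samples, then count how many local keys fall into each destination bucket.
def histogram_row(local_keys, samples, num_pieces):
    samples = sorted(samples)
    step = max(len(samples) // num_pieces, 1)
    splitters = samples[step::step][:num_pieces - 1]
    counts = [0] * num_pieces
    for key in local_keys:
        counts[bisect_right(splitters, key)] += 1
    return counts

print(histogram_row([5, 1, 9, 3, 7], samples=[1, 2, 4, 6, 8], num_pieces=2))  # [2, 3]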
Example #6
    def _preload_libcudf(self):
        task = IndexTask(
            self.get_task_id(OpCode.LIBCUDF_INIT),
            Rect([self.num_pieces]),
            argmap=self.empty_argmap,
            mapper=self.mapper_id,
        )
        self.dispatch(task).wait()
Example #7
    def _finalize_nccl(self):
        task = IndexTask(
            self.get_task_id(OpCode.FINALIZE_NCCL),
            Rect([self.num_pieces]),
            argmap=self.empty_argmap,
            mapper=self.mapper_id,
        )
        nccl_comm = self._nccl_comm._future_map
        task.add_point_future(ArgumentMap(future_map=nccl_comm))
        self.dispatch(task).wait()
Example #8
    def _initialize_nccl(self):
        task = Task(
            self.get_task_id(OpCode.INIT_NCCL_ID),
            mapper=self.mapper_id,
        )
        self._nccl_id = self.dispatch(task)

        task = IndexTask(
            self.get_task_id(OpCode.INIT_NCCL),
            Rect([self.num_pieces]),
            argmap=self.empty_argmap,
            mapper=self.mapper_id,
        )
        task.add_future(self._nccl_id)
        self.issue_fence()
        self._nccl_comm = self.dispatch(task).cast(ty.uint64)
        self.issue_fence()
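
This is the standard NCCL bootstrap split across Legion tasks: one task creates the unique id, the id is attached to every point of the index launch as a future, and each point then joins the communicator. For comparison, the same handshake written per rank against CuPy's NCCL bindings (an analogy only; the Legion tasks do this work in C++):

from cupy.cuda import nccl

# Rank 0 would call nccl.get_unique_id() (the INIT_NCCL_ID task above); the id
# is shared with every rank (the future added to the index task), and each rank
# then blocks in ncclCommInitRank until all ranks have joined (INIT_NCCL).
def init_nccl_for_rank(rank, n_ranks, unique_id):
    return nccl.NcclCommunicator(n_ranks, unique_id, rank)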
Example #9
    def find_or_create_column_partition(self, cspace, num_columns):
        if cspace not in self._column_partitions:
            transform = Transform(2, 1)
            transform.trans[0, 0] = 1
            transform.trans[1, 0] = 0
            extent = Rect([1, num_columns])
            partitioner = PartitionByRestriction(transform, extent)
            part_id = self._next_column_partition_id
            self._next_column_partition_id = part_id + 1
            ipart = IndexPartition(
                self._legion_context,
                self._legion_runtime,
                self._ispace,
                cspace,
                partitioner,
                kind=legion.DISJOINT_COMPLETE_KIND,
                part_id=part_id,
            )
            self._column_partitions[cspace] = ipart
        return self._column_partitions[cspace]
Example #10
    def _hash_partition_cpu(self, columns, key_indices, needs_conversion):
        storage = self._runtime.create_storage(columns[0].ispace)
        out_columns = storage.create_isomorphic_columns(columns)

        _key_indices = list(key_indices)
        for idx in needs_conversion:
            _key_indices[key_indices.index(idx)] = len(columns)
            columns.append(columns[idx].astype(ty.string))
        key_indices = _key_indices

        num_pieces = columns[0].num_pieces
        launch_domain = columns[0].launch_domain
        cspace = columns[0].cspace

        hist_ispace = self._runtime.find_or_create_index_space(
            Rect([num_pieces, num_pieces]))
        hist_storage = self._runtime.create_storage(hist_ispace)
        hist = hist_storage.create_new_field(ty.range64)
        hist_ipart = self._runtime.create_row_partition(
            hist_ispace, cspace, num_pieces)

        plan = Map(self._runtime, OpCode.LOCAL_PARTITION)

        plan.add_output(
            hist,
            Projection(hist_ipart),
            tag=PandasMappingTag.HISTOGRAM,
            flags=2,  # LEGION_NO_ACCESS_FLAG
        )

        plan.add_scalar_arg(num_pieces, ty.uint32)

        plan.add_scalar_arg(len(key_indices), ty.uint32)
        for idx in key_indices:
            plan.add_scalar_arg(idx, ty.int32)

        plan.add_scalar_arg(len(columns), ty.uint32)
        for key in columns:
            key.add_to_plan(plan, True)
        plan.add_scalar_arg(len(out_columns), ty.uint32)
        for key in out_columns:
            key.add_to_plan_output_only(plan)

        plan.execute(launch_domain)
        del plan

        hist_ipart = self._runtime.create_column_partition(
            hist_ispace, cspace, num_pieces)
        radix_ipart = self._runtime.create_partition_by_image(
            columns[0].ispace,
            cspace,
            hist,
            hist_ipart,
            kind=legion.DISJOINT_COMPLETE_KIND,
            range=True,
        )

        out_columns = [
            out_column.all_to_ranges().clone() for out_column in out_columns
        ]
        for out_column in out_columns:
            out_column.set_primary_ipart(radix_ipart)
        out_columns = [
            out_column.all_to_offsets() for out_column in out_columns
        ]

        return out_columns
Example #11
    def launch_domain(self):
        return Rect([self.num_pieces])
Example #12
def read_parquet(path, columns, **kwargs):
    from legate.core import Rect

    from .runtime import _runtime as rt

    path = util.to_list_if_scalar(path)

    if len(path) == 1 and os.path.isdir(path[0]):
        from pyarrow.parquet import ParquetDataset

        ds = ParquetDataset(path)
        path = [piece.path for piece in ds.pieces]
    else:
        from pyarrow.parquet import ParquetFile

        ds = ParquetFile(path[0])
        if rt.debug:
            assert all(ParquetFile(p).schema == ds.schema for p in path)

    dedup_names = set()
    for name in ds.schema.names:
        if name in dedup_names:
            raise ValueError(
                "Duplicate column names in schema are not supported.")
        dedup_names.add(name)

    schema = ds.schema.to_arrow_schema()
    index_descs = []
    index_materialized = False
    if str.encode("pandas") in ds.metadata.metadata:
        import json

        pandas_metadata = json.loads(
            ds.metadata.metadata[str.encode("pandas")])
        index_descs = pandas_metadata["index_columns"]
        index_materialized = len(index_descs) > 0 and all(
            isinstance(desc, str) for desc in index_descs)

    if columns is None:
        column_names = schema.names
    elif index_materialized:
        column_names = columns + index_descs
    else:
        column_names = columns

    for name in column_names:
        if name not in dedup_names:
            raise ValueError("Field named %s not found in the schema." % name)
    schema = [schema.field(name) for name in column_names]
    del columns

    storage = rt.create_output_storage()
    offsets_storage = None

    columns = []
    for column_info in schema:
        dtype = ty.to_legate_dtype(column_info.type)
        column = storage.create_column(dtype)
        if ty.is_string_dtype(dtype):
            if offsets_storage is None:
                offsets_storage = rt.create_output_storage()
            offsets_column = offsets_storage.create_column(ty.int32,
                                                           nullable=False)
            chars_storage = rt.create_output_storage()
            char_column = chars_storage.create_column(ty.int8, nullable=False)
            column.add_child(offsets_column)
            column.add_child(char_column)
            column = column.as_string_column()
        columns.append(column)

    plan = Map(rt, OpCode.READ_PARQUET)
    plan.add_scalar_arg(len(path), ty.uint32)
    for f in path:
        plan.add_scalar_arg(f, ty.string)
    plan.add_scalar_arg(len(column_names), ty.uint32)
    for name in column_names:
        plan.add_scalar_arg(name, ty.string)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for column in columns:
        column.add_to_plan_output_only(plan)
    counts = plan.execute(Rect([rt.num_pieces]))
    storage = plan.promote_output_storage(storage)
    rt.register_external_weighted_partition(storage.default_ipart, counts)
    del plan

    size = counts.cast(ty.int64).sum()

    if index_materialized:
        to_filter = set(index_descs)

        index_columns = []
        value_columns = []
        value_column_names = []
        for idx, name in enumerate(column_names):
            if name in to_filter:
                index_columns.append(columns[idx])
            else:
                value_columns.append(columns[idx])
                value_column_names.append(column_names[idx])

        sanitized_names = [
            None if name == f"__index_level_{level}__" else name
            for level, name in enumerate(index_descs)
        ]
        index = create_index_from_columns(index_columns, size, sanitized_names)
    else:
        value_columns = columns
        value_column_names = column_names
        if len(index_descs) > 0:
            assert len(index_descs) == 1
            index_desc = index_descs[0]
            name = index_desc["name"]
            start = rt.create_future(index_desc["start"], ty.int64)
            stop = rt.create_future(index_desc["stop"], ty.int64)
            step = rt.create_future(index_desc["step"], ty.int64)
            index = create_range_index(storage, size, name, start, stop, step)
        else:
            index = create_range_index(storage, size)

    from pandas import Index

    return {
        "frame": Table(rt, index, value_columns),
        "columns": Index(value_column_names),
    }
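
The index_materialized test above distinguishes the two shapes index_columns can take in the "pandas" metadata written by pyarrow: a materialized index is listed by column name (strings) and has to be read back from the file, whereas a non-materialized RangeIndex is stored as a descriptor dict and can be rebuilt from start/stop/step. Roughly (illustrative values):

# Materialized index: names of index columns stored as regular data columns.
materialized = {"index_columns": ["__index_level_0__"]}
# RangeIndex: a descriptor, handled by the create_range_index branch above.
range_index = {
    "index_columns": [
        {"kind": "range", "name": None, "start": 0, "stop": 1000, "step": 1}
    ]
}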
Example #13
def read_csv(
    paths,
    sep=None,
    usecols=None,
    dtypes=None,
    true_values=None,
    false_values=None,
    skiprows=0,
    skipfooter=0,
    nrows=None,
    na_values=None,
    skip_blank_lines=True,
    date_cols=False,
    compressions=None,
    quotechar='"',
    quoting=0,
    doublequote=True,
):
    from legate.core import Rect

    from .runtime import _runtime as rt

    storage = rt.create_output_storage()
    offsets_storage = None

    # Override the dtype for category columns, as they are not directly
    # handled by the CSV reader
    storage_dtypes = [
        ty.string if dtype == "category" else dtype for dtype in dtypes
    ]
    columns = [storage.create_column(dtype) for dtype in storage_dtypes]
    for column in columns:
        if ty.is_string_dtype(column.dtype):
            if offsets_storage is None:
                offsets_storage = rt.create_output_storage()
            offsets_column = offsets_storage.create_column(ty.int32,
                                                           nullable=False)
            chars_storage = rt.create_output_storage()
            char_column = chars_storage.create_column(ty.int8, nullable=False)
            column.add_child(offsets_column)
            column.add_child(char_column)
    columns = [
        column.as_string_column()
        if ty.is_string_dtype(column.dtype) else column for column in columns
    ]

    # TODO: Since Arrow doesn't support in-flight decompression, we decompress
    #       any compressed files before tossing them to the reader.
    to_remove = []
    if not rt.has_gpus:
        paths, compressions, to_remove = _uncompress_files(paths, compressions)

    plan = Map(rt, OpCode.READ_CSV)
    plan.add_scalar_arg(len(paths), ty.uint32)
    for path in paths:
        plan.add_scalar_arg(path, ty.string)
    plan.add_scalar_arg(len(compressions), ty.uint32)
    for compression in compressions:
        plan.add_scalar_arg(compression.value, ty.int32)
    plan.add_scalar_arg(sep, ty.string)
    plan.add_scalar_arg(skiprows, ty.int32)
    plan.add_scalar_arg(skipfooter, ty.int32)
    _may_add_to_plan(plan, nrows, ty.int32)
    plan.add_scalar_arg(quotechar, ty.string)
    plan.add_scalar_arg(doublequote, ty.bool)
    plan.add_scalar_arg(skip_blank_lines, ty.bool)
    _may_add_to_plan(plan, true_values, ty.string)
    _may_add_to_plan(plan, false_values, ty.string)
    _may_add_to_plan(plan, na_values, ty.string)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for column in columns:
        column.add_to_plan_output_only(plan)
    plan.add_scalar_arg(len(date_cols), ty.uint32)
    for idx in date_cols:
        plan.add_scalar_arg(idx, ty.int32)
    counts = plan.execute(Rect([rt.num_pieces]))
    storage = plan.promote_output_storage(storage)
    rt.register_external_weighted_partition(storage.default_ipart, counts)
    del plan

    columns = [
        column.to_category_column() if dtype == "category" else column
        for column, dtype in zip(columns, dtypes)
    ]

    size = counts.cast(ty.int64).sum()
    index = create_range_index(storage, size)

    if len(to_remove) > 0:
        counts.wait()
        for path in to_remove:
            os.remove(path)

    return Table(rt, index, columns)