Beispiel #1
0
    def _sortjoin(self, other, how='left', return_indexers=False):
        """Join with another column.

        When the column is a index, set *return_indexers* to obtain
        the indices for shuffling the remaining columns.
        """
        from cudf.dataframe.series import Series

        if not self.is_type_equivalent(other):
            raise TypeError('*other* is not compatible')

        lkey, largsort = self.sort_by_values(True)
        rkey, rargsort = other.sort_by_values(True)
        with _gdf.apply_join(
                [lkey], [rkey], how=how, method='sort') as (lidx, ridx):
            if lidx.size > 0:
                raw_index = cudautils.gather_joined_index(
                        lkey.to_gpu_array(),
                        rkey.to_gpu_array(),
                        lidx,
                        ridx,
                        )
                buf_index = Buffer(raw_index)
            else:
                buf_index = Buffer.null(dtype=self.dtype)

            joined_index = lkey.replace(data=buf_index)

            if return_indexers:
                def gather(idxrange, idx):
                    mask = (Series(idx) != -1).as_mask()
                    return idxrange.take(idx).set_mask(mask).fillna(-1)

                if len(joined_index) > 0:
                    indexers = (
                            gather(Series(largsort), lidx),
                            gather(Series(rargsort), ridx),
                            )
                else:
                    indexers = (
                            Series(Buffer.null(dtype=np.intp)),
                            Series(Buffer.null(dtype=np.intp))
                            )
                return joined_index, indexers
            else:
                return joined_index
Beispiel #2
0
    def _hashjoin(self, other, how='left', return_indexers=False):

        from cudf.dataframe.series import Series

        if not self.is_type_equivalent(other):
            raise TypeError('*other* is not compatible')

        with _gdf.apply_join(
                [self], [other], how=how, method='hash') as (lidx, ridx):
            if lidx.size > 0:
                raw_index = cudautils.gather_joined_index(
                        self.to_gpu_array(),
                        other.to_gpu_array(),
                        lidx,
                        ridx,
                        )
                buf_index = Buffer(raw_index)
            else:
                buf_index = Buffer.null(dtype=self.dtype)

            joined_index = self.replace(data=buf_index)

            if return_indexers:
                def gather(idxrange, idx):
                    mask = (Series(idx) != -1).as_mask()
                    return idxrange.take(idx).set_mask(mask).fillna(-1)

                if len(joined_index) > 0:
                    indexers = (
                            gather(Series(range(0, len(self))), lidx),
                            gather(Series(range(0, len(other))), ridx),
                            )
                else:
                    indexers = (
                            Series(Buffer.null(dtype=np.intp)),
                            Series(Buffer.null(dtype=np.intp))
                            )
                return joined_index, indexers
            else:
                return joined_index