Esempio n. 1
0
 def normalize_binop_value(
     self, other: ScalarLike
 ) -> Union[ColumnBase, ScalarLike]:
     """Coerce the right-hand operand of a binary operation.

     Parameters
     ----------
     other : ScalarLike
         ``None``, a ``cudf.Scalar``, a zero-dimensional ``np.ndarray``
         or any value ``np.min_scalar_type`` classifies as boolean,
         integer, unsigned integer or float.

     Returns
     -------
     ``None`` when *other* is ``None``; *other* itself when it is a
     ``cudf.Scalar`` whose dtype equals ``self.dtype``; a NumPy scalar
     of the promoted dtype when ``np.isscalar(other)`` holds; otherwise
     a column built by broadcasting *other* to ``len(self)`` that reuses
     ``self.mask``.

     Raises
     ------
     TypeError
         If the minimal NumPy dtype of *other* is not of kind
         ``b``, ``i``, ``u`` or ``f``.
     """
     if other is None:
         return other
     if isinstance(other, cudf.Scalar):
         if self.dtype == other.dtype:
             return other
         # expensive device-host transfer just to
         # adjust the dtype
         other = other.value
     elif isinstance(other, np.ndarray) and other.ndim == 0:
         other = other.item()

     # From here on ``other`` can no longer be a cudf.Scalar: it was
     # either returned or unwrapped to its host value above.
     other_dtype = np.min_scalar_type(other)
     if other_dtype.kind not in {"b", "i", "u", "f"}:
         raise TypeError(f"cannot broadcast {type(other)}")

     other_dtype = np.promote_types(self.dtype, other_dtype)
     if other_dtype == np.dtype("float16"):
         # A float16 result is widened to float32.
         other_dtype = np.dtype("float32")
         other = other_dtype.type(other)
     if self.dtype.kind == "b":
         # For boolean columns the dtype is chosen by min_signed_type
         # rather than by the promotion above.
         other_dtype = min_signed_type(other)
     if np.isscalar(other):
         return np.dtype(other_dtype).type(other)

     ary = utils.scalar_broadcast_to(
         other, size=len(self), dtype=other_dtype
     )
     return column.build_column(
         data=Buffer(ary), dtype=ary.dtype, mask=self.mask,
     )
Esempio n. 2
0
 def find_and_replace(self, to_replace, replacement, all_nan):
     """
     Build a column in which every value listed in *to_replace* is
     substituted by the paired entry of *replacement*.

     When *all_nan* is true the replacement is converted with
     ``column.as_column`` using ``self.dtype``; otherwise it goes through
     the same normalisation helper as *to_replace*.
     """
     old_values = _normalize_find_and_replace_input(self.dtype, to_replace)
     if not all_nan:
         new_values = _normalize_find_and_replace_input(
             self.dtype, replacement
         )
     else:
         new_values = column.as_column(replacement, dtype=self.dtype)

     # A lone replacement is repeated once per value being replaced.
     old_count = len(old_values)
     if len(new_values) == 1 and old_count > 1:
         repeated = utils.scalar_broadcast_to(
             replacement[0], (old_count,), self.dtype
         )
         new_values = column.as_column(repeated)

     result = self.copy()
     old_values, new_values, result = numeric_normalize_types(
         old_values, new_values, result
     )
     return libcudf.replace.replace(result, old_values, new_values)
Esempio n. 3
0
    def __setitem__(self, key, value):
        """
        Assign *value* to the positions of this column selected by *key*.

        *key* is either a slice (only a stride of 1 is accepted) or
        anything ``columnops.as_column`` can convert; a boolean key must
        be as long as the column and is turned into the positions it
        selects. A scalar *value* is expanded to the number of selected
        elements, and whatever is assigned is cast to ``self.dtype``.
        """
        import cudf.bindings.copying as cpp_copying
        from cudf.dataframe import columnops

        slice_key = isinstance(key, slice)
        if slice_key:
            start, stop, stride = key.indices(len(self))
            if stride != 1:
                raise NotImplementedError("Stride not supported in slice")
            nelem = abs(stop - start)
        else:
            key = columnops.as_column(key)
            if pd.api.types.is_bool_dtype(key.dtype):
                if len(key) != len(self):
                    raise ValueError(
                        "Boolean mask must be of same length as column")
                # Convert the mask into the row positions it selects.
                positions = columnops.as_column(cudautils.arange(len(self)))
                key = positions[key]
            nelem = len(key)

        if utils.is_scalar(value):
            if is_categorical_dtype(self.dtype):
                from cudf.dataframe.categorical import CategoricalColumn
                from cudf.dataframe.buffer import Buffer
                from cudf.utils.cudautils import fill_value

                # Fill a device array with the encoded scalar.
                codes = rmm.device_array(nelem, dtype="int8")
                fill_value(codes, self._encode(value))
                value = CategoricalColumn(
                    data=Buffer(codes),
                    categories=self._categories,
                    ordered=False,
                )
            elif value is None:
                value = columnops.column_empty(nelem, self.dtype, masked=True)
            else:
                value = utils.scalar_broadcast_to(
                    value, nelem, pd.api.types.pandas_dtype(self.dtype)
                )

        value = columnops.as_column(value).astype(self.dtype)

        if len(value) != nelem:
            raise ValueError(
                f"Size mismatch: cannot set value of size {len(value)} "
                f"to indexing result of size {nelem}"
            )

        if slice_key:
            out = cpp_copying.apply_copy_range(self, value, start, stop, 0)
        else:
            out = cpp_copying.apply_scatter(value, key, self)

        # Adopt the buffers of the result and refresh the null count.
        self._data = out.data
        self._mask = out.mask
        self._update_null_count()
Esempio n. 4
0
    def find_and_replace(
        self,
        to_replace: ColumnLike,
        replacement: ColumnLike,
        all_nan: bool = False,
    ) -> NumericalColumn:
        """
        Produce a column where each *to_replace* value is swapped for the
        corresponding *replacement* value.
        """

        # An input made up solely of `None` has its dtype inferred as
        # `string`; it is equally valid as a column of self.dtype, so it
        # is cast to self.dtype before the type comparison below.
        old_col = column.as_column(to_replace)
        if old_col.null_count == len(old_col):
            old_col = old_col.astype(self.dtype)

        new_col = column.as_column(replacement)
        if new_col.null_count == len(new_col):
            new_col = new_col.astype(self.dtype)

        if type(old_col) != type(new_col):
            message = (
                "to_replace and value should be of same types,"
                f"got to_replace dtype: {old_col.dtype} and "
                f"value dtype: {new_col.dtype}"
            )
            raise TypeError(message)

        # Nothing to do when neither side is numeric.
        if not (
            isinstance(old_col, NumericalColumn)
            or isinstance(new_col, NumericalColumn)
        ):
            return self.copy()

        old_col = _normalize_find_and_replace_input(self.dtype, to_replace)
        if all_nan:
            new_col = column.as_column(replacement, dtype=self.dtype)
        else:
            new_col = _normalize_find_and_replace_input(
                self.dtype, replacement
            )

        if len(new_col) == 1:
            old_count = len(old_col)
            if old_count > 1:
                # Repeat the single replacement for every value replaced.
                new_col = column.as_column(
                    utils.scalar_broadcast_to(
                        replacement[0], (old_count,), self.dtype
                    )
                )
            elif old_count == 0:
                return self.copy()

        old_col, new_col, replaced = numeric_normalize_types(
            old_col, new_col, self
        )
        pairs = cudf.DataFrame._from_data({"old": old_col, "new": new_col})
        # When a value to replace is listed several times, the last wins.
        pairs = pairs.drop_duplicates(
            subset=["old"], keep="last", ignore_index=True
        )
        if pairs._data["old"].null_count == 1:
            # A null "old" entry is applied through fillna and then
            # dropped from the table before the replace call.
            null_rows = pairs._data["old"].isnull()
            replaced = replaced.fillna(pairs._data["new"][null_rows][0])
            pairs = pairs.dropna(subset=["old"])

        return libcudf.replace.replace(
            replaced, pairs._data["old"], pairs._data["new"]
        )
Esempio n. 5
0
 def normalize_binop_value(self, other):
     """
     Coerce the right-hand operand of a binary operation.

     ``None`` is passed through. Values whose minimal NumPy dtype is
     boolean, integer, unsigned or float are converted to the dtype
     promoted against ``self.dtype`` (float16 is widened to float32):
     scalars come back as NumPy scalars, anything else as a column
     broadcast to ``len(self)`` that reuses ``self.mask``. Every other
     kind raises ``TypeError``.
     """
     if other is None:
         return other

     operand_dtype = np.min_scalar_type(other)
     if operand_dtype.kind not in {"b", "i", "u", "f"}:
         raise TypeError("cannot broadcast {}".format(type(other)))

     operand_dtype = np.promote_types(self.dtype, operand_dtype)
     if operand_dtype == np.dtype("float16"):
         other = np.dtype("float32").type(other)
         operand_dtype = other.dtype
     if self.dtype.kind == "b":
         # Boolean columns take their dtype from min_signed_type instead.
         operand_dtype = min_signed_type(other)

     if np.isscalar(other):
         return np.dtype(operand_dtype).type(other)

     broadcast = utils.scalar_broadcast_to(
         other, size=len(self), dtype=operand_dtype
     )
     return column.build_column(
         data=Buffer.from_array_lik(broadcast),
         dtype=broadcast.dtype,
         mask=self.mask,
     )
Esempio n. 6
0
    def replace(self, to_replace, value):
        """
        Replace values given in *to_replace* with *value*.

        Parameters
        ----------
        to_replace : numeric, str or list-like
            Value(s) to replace.

            * numeric or str:

                - values equal to *to_replace* will be replaced with *value*

            * list of numeric or str:

                - If *value* is also list-like, *to_replace* and *value* must
                  be of same length.
        value : numeric, str or list-like
            Value(s) to replace `to_replace` with. A scalar *value* paired
            with a list-like *to_replace* is repeated for every entry.
            Dict-like arguments are not supported.

        See also
        --------
        Series.fillna

        Returns
        -------
        result : Series
            Series after replacement. The mask and index are preserved.

        Raises
        ------
        TypeError
            If *to_replace* is a scalar while *value* is not, or if either
            argument is dict-like.
        ValueError
            If *to_replace* and *value* end up with different lengths.
        """
        if not is_scalar(to_replace):
            if is_scalar(value):
                # One replacement for many targets: repeat it per target.
                value = utils.scalar_broadcast_to(
                    value, (len(to_replace),), np.dtype(type(value))
                )
        else:
            if not is_scalar(value):
                raise TypeError(
                    "Incompatible types '{}' and '{}' "
                    "for *to_replace* and *value*.".format(
                        type(to_replace).__name__, type(value).__name__
                    )
                )
            # Scalars are wrapped so the column-level call sees sequences.
            to_replace = [to_replace]
            value = [value]

        if len(to_replace) != len(value):
            raise ValueError(
                "Replacement lists must be "
                "of same length. "
                "Expected {}, got {}.".format(len(to_replace), len(value))
            )

        if is_dict_like(to_replace) or is_dict_like(value):
            raise TypeError("Dict-like args not supported in Series.replace()")

        result = self._column.find_and_replace(to_replace, value)

        return self._copy_construct(data=result)
Esempio n. 7
0
 def normalize_binop_value(self, other):
     """
     Encode *other* with ``self._encode`` and broadcast the result to a
     column as long as this one, keeping this column's dtype, categories
     and ordering.
     """
     code = self._encode(other)
     codes = utils.scalar_broadcast_to(
         code, shape=len(self), dtype=self.data.dtype
     )
     return self.replace(
         data=Buffer(codes),
         dtype=self.dtype,
         categories=self._categories,
         ordered=self._ordered,
     )
Esempio n. 8
0
def get_str_replacement_series(replacement, bool_mask):
    """
    Build a series as long as *bool_mask* that is filled with empty
    strings and has *replacement* assigned at the positions selected by
    the mask.
    """
    blanks = scalar_broadcast_to("", size=len(bool_mask))
    result = cudf.Series(blanks)
    result.iloc[bool_mask] = replacement
    return result
Esempio n. 9
0
    def normalize_binop_value(self, other):
        """
        Broadcast a datetime-like scalar to a column shaped like this one.

        Parameters
        ----------
        other : datetime.datetime, pd.Timestamp or np.datetime64
            Scalar operand of the binary operation.

        Returns
        -------
        A column of ``self.dtype`` built from *other* repeated
        ``len(self)`` times.

        Raises
        ------
        TypeError
            If *other* is none of the accepted types.
        """
        if isinstance(other, dt.datetime):
            other = np.datetime64(other)

        # NOTE(review): pd.Timestamp subclasses datetime.datetime, so if
        # `dt` is the standard datetime module a Timestamp has already
        # been converted above and this branch is never taken - confirm.
        if isinstance(other, pd.Timestamp):
            m = _numpy_to_pandas_conversion[self.time_unit]
            # Use the same `size=` keyword as the np.datetime64 branch;
            # the two calls previously disagreed (`shape=` vs `size=`).
            ary = utils.scalar_broadcast_to(other.value * m,
                                            size=len(self),
                                            dtype=self.dtype)
        elif isinstance(other, np.datetime64):
            other = other.astype(self.dtype)
            ary = utils.scalar_broadcast_to(other,
                                            size=len(self),
                                            dtype=self.dtype)
        else:
            raise TypeError("cannot broadcast {}".format(type(other)))

        return column.build_column(data=Buffer(ary), dtype=self.dtype)
Esempio n. 10
0
 def normalize_binop_value(self, other):
     """
     Prepare the right-hand operand of a binary operation.

     A column is cast to ``self.dtype``; a ``str`` or ``None`` is
     broadcast to ``len(self)`` with dtype ``"object"`` and its data is
     swapped into a copy made with ``self.replace``. Anything else raises
     ``TypeError``.
     """
     if isinstance(other, column.Column):
         return other.astype(self.dtype)
     if other is None or isinstance(other, str):
         broadcast = utils.scalar_broadcast_to(
             other, shape=len(self), dtype="object"
         )
         return self.replace(data=broadcast.data)
     raise TypeError('cannot broadcast {}'.format(type(other)))
Esempio n. 11
0
 def normalize_binop_value(self, other):
     """
     Broadcast a numeric scalar to a column as long as this one.

     The dtype of the result is the NumPy promotion of ``self.dtype``
     with the minimal dtype of *other*. Operands whose minimal dtype is
     not boolean, integer, unsigned or float raise ``TypeError``.
     """
     scalar_dtype = np.min_scalar_type(other)
     if scalar_dtype.kind not in 'biuf':
         raise TypeError('cannot broadcast {}'.format(type(other)))

     result_dtype = np.promote_types(self.dtype, scalar_dtype)
     values = utils.scalar_broadcast_to(
         other, shape=len(self), dtype=result_dtype
     )
     return self.replace(data=Buffer(values), dtype=values.dtype)
Esempio n. 12
0
 def normalize_binop_value(self, other):
     """
     Encode *other* with ``self._encode`` and return a categorical
     column of ``len(self)`` repeated codes that shares this column's
     categories, ordering and mask.
     """
     code = self._encode(other)
     codes = utils.scalar_broadcast_to(
         code, size=len(self), dtype=self.codes.dtype
     )
     return column.build_categorical_column(
         categories=self.dtype.categories,
         codes=column.as_column(codes),
         mask=self.mask,
         ordered=self.dtype.ordered,
     )
Esempio n. 13
0
    def normalize_binop_value(self, other):
        """
        Broadcast a datetime-like scalar to a column shaped like this one.

        ``datetime.datetime`` operands are first converted to
        ``np.datetime64``. A ``pd.Timestamp`` contributes its ``value``
        scaled by ``self._pandas_conversion_factor``; an ``np.datetime64``
        is cast to ``self._npdatetime64_dtype``. Any other operand raises
        ``TypeError``.
        """
        if isinstance(other, dt.datetime):
            other = np.datetime64(other)

        if isinstance(other, pd.Timestamp):
            scalar = other.value * self._pandas_conversion_factor
        elif isinstance(other, np.datetime64):
            scalar = other.astype(self._npdatetime64_dtype)
        else:
            raise TypeError('cannot broadcast {}'.format(type(other)))

        ary = utils.scalar_broadcast_to(scalar,
                                        shape=len(self),
                                        dtype=self._npdatetime64_dtype)
        return self.replace(data=Buffer(ary), dtype=self.dtype)
Esempio n. 14
0
 def normalize_binop_value(self, other):
     """
     Broadcast a numeric scalar to a column as long as this one.

     The target dtype is the NumPy promotion of ``self.dtype`` with the
     minimal dtype of *other*, except that int16 is widened to int32.
     Operands whose minimal dtype is not boolean, integer, unsigned or
     float raise ``TypeError``.
     """
     scalar_dtype = np.min_scalar_type(other)
     if scalar_dtype.kind not in 'biuf':
         raise TypeError('cannot broadcast {}'.format(type(other)))

     result_dtype = np.promote_types(self.dtype, scalar_dtype)
     # Temporary workaround: libcudf has no int16 ops, so use int32.
     if result_dtype == np.dtype('int16'):
         result_dtype = np.dtype('int32')
     values = utils.scalar_broadcast_to(
         other, shape=len(self), dtype=result_dtype
     )
     return self.replace(data=Buffer(values), dtype=values.dtype)
Esempio n. 15
0
    def find_and_replace(
        self,
        to_replace: ColumnLike,
        replacement: ColumnLike,
        all_nan: bool = False,
    ) -> NumericalColumn:
        """
        Give back a column in which the values listed in *to_replace*
        have been substituted by the paired entries of *replacement*.
        """
        targets = as_column(to_replace)
        substitutes = as_column(replacement)

        if type(targets) != type(substitutes):
            message = (
                "to_replace and value should be of same types,"
                f"got to_replace dtype: {targets.dtype} and "
                f"value dtype: {substitutes.dtype}"
            )
            raise TypeError(message)

        # Nothing to do when neither side is numeric.
        neither_numeric = not (
            isinstance(targets, NumericalColumn)
            or isinstance(substitutes, NumericalColumn)
        )
        if neither_numeric:
            return self.copy()

        targets = _normalize_find_and_replace_input(self.dtype, to_replace)
        if all_nan:
            substitutes = column.as_column(replacement, dtype=self.dtype)
        else:
            substitutes = _normalize_find_and_replace_input(
                self.dtype, replacement
            )

        replaced = self.copy()
        if len(substitutes) == 1:
            target_count = len(targets)
            if target_count > 1:
                # Repeat the single replacement for every target value.
                repeated = utils.scalar_broadcast_to(
                    replacement[0], (target_count,), self.dtype
                )
                substitutes = column.as_column(repeated)
            elif target_count == 0:
                return replaced

        targets, substitutes, replaced = numeric_normalize_types(
            targets, substitutes, replaced
        )
        return libcudf.replace.replace(replaced, targets, substitutes)
Esempio n. 16
0
    def find_and_replace(
        self,
        to_replace: ColumnLike,
        replacement: ColumnLike,
        all_nan: bool = False,
    ) -> NumericalColumn:
        """
        Return a column where every *to_replace* value has been exchanged
        for its counterpart in *replacement*.
        """
        search_col = column.as_column(to_replace)
        value_col = column.as_column(replacement)

        if type(search_col) != type(value_col):
            raise TypeError(
                "to_replace and value should be of same types,"
                f"got to_replace dtype: {search_col.dtype} and "
                f"value dtype: {value_col.dtype}"
            )

        # Nothing to do when neither side is numeric.
        if not (
            isinstance(search_col, NumericalColumn)
            or isinstance(value_col, NumericalColumn)
        ):
            return self.copy()

        search_col = _normalize_find_and_replace_input(
            self.dtype, to_replace
        )
        if all_nan:
            value_col = column.as_column(replacement, dtype=self.dtype)
        else:
            value_col = _normalize_find_and_replace_input(
                self.dtype, replacement
            )

        replaced = self.copy()
        if len(value_col) == 1:
            search_count = len(search_col)
            if search_count > 1:
                # Repeat the single replacement for every searched value.
                value_col = column.as_column(
                    utils.scalar_broadcast_to(
                        replacement[0], (search_count,), self.dtype
                    )
                )
            elif search_count == 0:
                return replaced

        search_col, value_col, replaced = numeric_normalize_types(
            search_col, value_col, replaced
        )
        mapping = cudf.DataFrame({"old": search_col, "new": value_col})
        # When a searched value is listed several times, the last wins.
        mapping = mapping.drop_duplicates(
            subset=["old"], keep="last", ignore_index=True
        )
        if mapping._data["old"].null_count == 1:
            # A null "old" entry is applied through fillna and then
            # dropped from the table before the replace call.
            null_rows = mapping._data["old"].isna()
            replaced = replaced.fillna(mapping._data["new"][null_rows][0])
            mapping = mapping.dropna(subset=["old"])

        return libcudf.replace.replace(
            replaced, mapping["old"]._column, mapping["new"]._column
        )
Esempio n. 17
0
 def normalize_binop_value(self, other):
     """
     Coerce the right-hand operand of a binary operation.

     Operands whose minimal NumPy dtype is boolean, integer, unsigned
     or float are converted to the dtype promoted against ``self.dtype``
     (float16 is widened to float32, unsigned results are re-typed with
     ``min_signed_type``). Scalars come back as NumPy scalars, anything
     else as a column broadcast to ``len(self)``. Every other kind
     raises ``TypeError``.
     """
     operand_dtype = np.min_scalar_type(other)
     if operand_dtype.kind not in "biuf":
         raise TypeError("cannot broadcast {}".format(type(other)))

     operand_dtype = np.promote_types(self.dtype, operand_dtype)
     if operand_dtype == np.dtype("float16"):
         other = np.dtype("float32").type(other)
         operand_dtype = other.dtype
     if operand_dtype.kind == "u":
         operand_dtype = min_signed_type(other)

     if np.isscalar(other):
         return np.dtype(operand_dtype).type(other)

     values = utils.scalar_broadcast_to(
         other, shape=len(self), dtype=operand_dtype
     )
     return self.replace(data=Buffer(values), dtype=values.dtype)
Esempio n. 18
0
def as_column(arbitrary, nan_as_null=True, dtype=None):
    """Create a Column from an arbitrary object

    The input is dispatched on its type; most branches convert it to an
    intermediate representation (pyarrow array, numpy array, device
    array) and call ``as_column`` again on the result.

    Parameters
    ----------
    arbitrary : object
        Object to convert. Currently supported inputs are:

        * ``Column``
        * ``Buffer``
        * ``Series``
        * ``Index``
        * ``nvstrings`` objects
        * numba device array
        * cuda array interface
        * numpy array
        * pyarrow array (including chunked arrays)
        * pandas.Series / pandas.Categorical / pandas.Timestamp
        * scalars, memoryviews and other objects that pyarrow or numpy
          can turn into an array
    nan_as_null : bool, default True
        Forwarded to several of the conversions below (e.g. as
        ``from_pandas=`` for pyarrow); for non-empty float device arrays
        it controls whether a mask is attached.
    dtype : optional
        Only consulted by some branches (pyarrow null/chunked arrays,
        memoryviews and the generic fallback).

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input
        - StringColumn for ``nvstrings`` input
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    # Existing column: rebuild it around the same data and mask, carrying
    # the categories along when the column has them.
    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(arbitrary.data,
                            arbitrary.dtype,
                            mask=arbitrary.mask,
                            categories=categories)

    # Series / Index: unwrap the column they already hold.
    elif isinstance(arbitrary, Series):
        data = arbitrary._column

    elif isinstance(arbitrary, Index):
        data = arbitrary._values

    # Raw buffer: wrap it as a numerical column of the buffer's dtype.
    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    # Numba device array: wrap it in a Buffer; non-empty float data gets a
    # mask from cudautils.mask_from_devary when nan_as_null is set.
    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif cuda.is_cuda_array(arbitrary):
        # Use cuda array interface to do create a numba device array by
        # reference
        new_dev_array = cuda.as_cuda_array(arbitrary)

        # Allocate new output array using rmm and copy the numba device array
        # to an rmm owned device array
        out_dev_array = rmm.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)

        data = as_column(out_dev_array)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags['C_CONTIGUOUS']:
            arbitrary = np.ascontiguousarray(arbitrary)
        # Datetime arrays have a dedicated constructor; object/unicode
        # arrays go through pyarrow; everything else is copied to device.
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ('O', 'U'):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype='int8')
            else:
                sbuf = np.empty(0, dtype='int8')
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype='int32')
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype='int8')

            data = as_column(
                nvstrings.from_offsets(sbuf,
                                       obuf,
                                       count,
                                       nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            # With no dtype (or the string 'empty'), fall back to the
            # pandas dtype that corresponds to the Arrow type.
            new_dtype = dtype
            if (type(dtype) == str and dtype == 'empty') or dtype is None:
                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())

            if pd.api.types.is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary), ), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary), ), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            # Timestamps are cast to millisecond resolution first.
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            # NOTE(review): `np.bool` is a deprecated alias that newer
            # NumPy releases removed - confirm the supported NumPy version.
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            # Any other Arrow array becomes a numerical column whose dtype
            # is the pandas dtype of the Arrow type.
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, pa.ChunkedArray):
        # Convert every chunk on its own, then concatenate the results.
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != 'empty':
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = 'category'
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        # pandas objects are routed through pyarrow.
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        # Scalars become one-element pyarrow arrays; numpy scalars keep
        # their dtype through the explicit Arrow type.
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary),
                         dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        # Last resort: try the buffer protocol first, then pyarrow, and
        # finally pandas (categorical dtype) or numpy.
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            try:
                pa_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        # Deliberately jump to the except handler below,
                        # which builds categorical data through pandas.
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = _gdf.np_to_pa_dtype(np.dtype(dtype).type)
                data = as_column(pa.array(arbitrary,
                                          type=pa_type,
                                          from_pandas=nan_as_null),
                                 nan_as_null=nan_as_null)
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                np_type = None
                if pd.api.types.is_categorical_dtype(dtype):
                    data = as_column(pd.Series(arbitrary, dtype='category'),
                                     nan_as_null=nan_as_null)
                else:
                    if dtype is None:
                        np_type = None
                    else:
                        np_type = np.dtype(dtype)
                    data = as_column(np.array(arbitrary, dtype=np_type),
                                     nan_as_null=nan_as_null)

    return data
Esempio n. 19
0
def as_column(arbitrary, nan_as_null=True, dtype=None, length=None):
    """Create a Column from an arbitrary object

    Parameters
    ----------
    arbitrary : object
        Object to construct the Column from. See *Notes*.
    nan_as_null : bool, optional
        If True (default), treat NaN values in arbitrary as null.
    dtype : optional
        Optionally typecast the constructed Column to the given
        dtype. Not every input kind honours it: the device-array and
        ``__cuda_array_interface__`` branches take the dtype from the
        input itself.
    length : int, optional
        If `arbitrary` is a scalar, broadcast into a Column of
        the given length. A missing (or otherwise falsy) value is
        treated as 1.

    Returns
    -------
    A Column of the appropriate type and size.

    Raises
    ------
    MemoryError
        If an ``nvstrings`` input holds more than
        ``libcudfxx.MAX_STRING_COLUMN_BYTES`` bytes.
    TypeError
        If `arbitrary` is a ``Buffer`` and `dtype` is None.
    NotImplementedError
        If `arbitrary` is a ``pyarrow.Date64Array``.

    Notes
    -----
    Currently supported inputs are:

    * ``Column``
    * ``Series``
    * ``Index``
    * Scalars (can be broadcasted to a specified `length`)
    * Objects exposing ``__cuda_array_interface__`` (e.g., numba device arrays)
    * Objects exposing ``__array_interface__`` (e.g., numpy arrays)
    * pyarrow array
    * pandas.Categorical objects
    """

    from cudf.core.column import numerical, categorical, datetime, string
    from cudf.core.series import Series
    from cudf.core.index import Index

    if isinstance(arbitrary, ColumnBase):
        if dtype is not None:
            return arbitrary.astype(dtype)
        else:
            return arbitrary

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, nvstrings.nvstrings):
        byte_count = arbitrary.byte_count()
        if byte_count > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Cannot construct string columns "
                "containing > {} bytes. "
                "Consider using dask_cudf to partition "
                "your data.".format(libcudfxx.MAX_STRING_COLUMN_BYTES_STR)
            )
        # Character buffer plus (size + 1) int32 offsets
        sbuf = Buffer.empty(byte_count)
        obuf = Buffer.empty(
            (arbitrary.size() + 1) * np.dtype("int32").itemsize
        )

        # A null mask is only allocated when there is at least one null
        nbuf = None
        if arbitrary.null_count() > 0:
            mask_size = calc_chunk_size(arbitrary.size(), mask_bitsize)
            nbuf = Buffer.empty(mask_size)
            arbitrary.set_null_bitmask(nbuf.ptr, bdevmem=True)
        arbitrary.to_offsets(sbuf.ptr, obuf.ptr, None, bdevmem=True)
        children = (
            build_column(obuf, dtype="int32"),
            build_column(sbuf, dtype="int8"),
        )
        data = build_column(
            data=None, dtype="object", mask=nbuf, children=children
        )
        # Keep the source nvstrings object attached to the new column
        data._nvstrings = arbitrary

    elif isinstance(arbitrary, Buffer):
        if dtype is None:
            raise TypeError("dtype cannot be None if 'arbitrary' is a Buffer")
        data = build_column(arbitrary, dtype=dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        # The column dtype comes from the device array; `dtype` is not used
        data = as_column(Buffer(arbitrary), dtype=arbitrary.dtype)
        if (
            data.dtype in [np.float16, np.float32, np.float64]
            and arbitrary.size > 0
        ):
            if nan_as_null:
                mask = libcudf.unaryops.nans_to_nulls(data)
                data = data.set_mask(mask)

        elif data.dtype.kind == "M":
            # Replace NaT entries with a null so that the result carries a
            # mask, then rebuild the datetime column over the original
            # buffer using that mask.
            null = column_empty_like(data, masked=True, newsize=1)
            col = libcudf.replace.replace(
                as_column(Buffer(arbitrary), dtype=arbitrary.dtype),
                as_column(
                    Buffer(np.array([np.datetime64("NaT")], dtype=data.dtype)),
                    dtype=arbitrary.dtype,
                ),
                null,
            )
            data = datetime.DatetimeColumn(
                data=Buffer(arbitrary), dtype=data.dtype, mask=col.mask
            )

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(arbitrary)
        mask = _mask_from_cuda_array_interface_desc(arbitrary)
        # The dtype is taken from the interface descriptor, replacing any
        # `dtype` argument
        dtype = np.dtype(desc["typestr"])
        col = build_column(data, dtype=dtype, mask=mask)
        return col

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)

        elif arbitrary.dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow(
                arbitrary
            )
            children = (
                build_column(data=obuf, dtype="int32"),
                build_column(data=sbuf, dtype="int8"),
            )

            data = string.StringColumn(
                mask=nbuf, children=children, size=pa_size, offset=pa_offset
            )

        elif isinstance(arbitrary, pa.NullArray):
            # "empty" is a sentinel, not a real dtype string, so it (like
            # None) has to be resolved from the Arrow type *before* anything
            # is handed to pandas_dtype(), which rejects unknown strings.
            if dtype is None or (isinstance(dtype, str) and dtype == "empty"):
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype()
                )
            else:
                new_dtype = pd.api.types.pandas_dtype(dtype)

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype
                        )
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype
                        )
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            codes = as_column(arbitrary.indices)
            if isinstance(arbitrary.dictionary, pa.NullArray):
                categories = as_column([], dtype="object")
            else:
                categories = as_column(arbitrary.dictionary)
            dtype = CategoricalDtype(
                categories=categories, ordered=arbitrary.type.ordered
            )
            data = categorical.CategoricalColumn(
                dtype=dtype,
                mask=codes.base_mask,
                children=(codes,),
                size=codes.size,
                offset=codes.offset,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )

            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            # Date64 input is rejected outright; the conversion code that
            # used to follow this raise could never run and was removed.
            raise NotImplementedError
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            # (np.bool_ rather than the deprecated np.bool alias)
            dtype = np.dtype(np.bool_)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())

            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        else:
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary
            )
            data = numerical.NumericalColumn(
                data=padata,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
                mask=pamask,
                size=pa_size,
                offset=pa_offset,
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        # Convert every chunk on its own, then concatenate the pieces
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = ColumnBase._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool_:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.asarray(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        # NOTE(review): `length or 1` also turns an explicit length of 0
        # into 1 -- confirm that is the intended contract.
        length = length or 1
        data = as_column(
            utils.scalar_broadcast_to(arbitrary, length, dtype=dtype)
        )
        if not nan_as_null:
            data = data.fillna(np.nan)

    elif isinstance(arbitrary, memoryview):
        data = as_column(
            np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
        )

    else:
        # Generic fallback: try the buffer protocol first, then pyarrow,
        # then pandas / numpy.
        try:
            data = as_column(
                memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
            )
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        # Jump straight to the pandas fallback below
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    sr = pd.Series(arbitrary, dtype="category")
                    data = as_column(sr, nan_as_null=nan_as_null)
                elif np_type == np.str_:
                    sr = pd.Series(arbitrary, dtype="str")
                    data = as_column(sr, nan_as_null=nan_as_null)
                else:
                    # NOTE(review): np.dtype(None) is float64, so when no
                    # dtype was requested this forces a float64 conversion
                    # instead of letting NumPy infer one -- confirm intent.
                    data = as_column(
                        np.asarray(arbitrary, dtype=np.dtype(dtype)),
                        nan_as_null=nan_as_null,
                    )
    return data
Esempio n. 20
0
    def __setitem__(self, key, value):
        """
        Set the value of self[key] to value.

        If value and self are of different types,
        value is coerced to self.dtype

        Parameters
        ----------
        key : slice or column-like
            Either a slice with a step of 1, a boolean mask with the same
            length as this column, or anything else ``column.as_column``
            accepts, used as the positions to overwrite.
        value : scalar or column-like
            A scalar is broadcast to the number of selected elements;
            anything else must already have exactly that many elements.

        Raises
        ------
        NotImplementedError
            If `key` is a slice whose step is not 1.
        ValueError
            If a boolean mask differs in length from the column, or if
            `value` does not match the number of selected elements.
        IndexError
            If the scatter reports an out-of-bounds position.
        """
        from cudf.core import column

        if isinstance(key, slice):
            key_start, key_stop, key_stride = key.indices(len(self))
            if key_stride != 1:
                raise NotImplementedError("Stride not supported in slice")
            # slice.indices() may return stop < start (e.g. col[5:2]); such
            # a slice selects zero elements, not abs(stop - start) of them.
            nelem = max(key_stop - key_start, 0)
        else:
            key = column.as_column(key)
            if pd.api.types.is_bool_dtype(key.dtype):
                if len(key) != len(self):
                    raise ValueError(
                        "Boolean mask must be of same length as column"
                    )
                # Turn the mask into the positions it selects
                key = column.as_column(cudautils.arange(len(self)))[key]
            nelem = len(key)

        if is_scalar(value):
            if is_categorical_dtype(self.dtype):
                from cudf.utils.cudautils import fill_value

                # Broadcast the category code of the scalar, then wrap the
                # codes with this column's categories
                data = rmm.device_array(nelem, dtype=self.codes.dtype)
                fill_value(data, self._encode(value))
                value = build_categorical_column(
                    categories=self.dtype.categories,
                    codes=as_column(data),
                    ordered=self.dtype.ordered,
                )
            elif value is None:
                value = column.column_empty(nelem, self.dtype, masked=True)
            else:
                to_dtype = pd.api.types.pandas_dtype(self.dtype)
                value = utils.scalar_broadcast_to(value, nelem, to_dtype)

        value = column.as_column(value).astype(self.dtype)

        if len(value) != nelem:
            msg = (
                "Size mismatch: cannot set value "
                f"of size {len(value)} to indexing result of size "
                f"{nelem}"
            )
            raise ValueError(msg)

        if is_categorical_dtype(value.dtype):
            value = value.cat().set_categories(self.categories)
            assert self.dtype == value.dtype

        if isinstance(key, slice):
            if key_stop < key_start:
                # Inverted bounds select nothing, so there is nothing to
                # copy and the column is left untouched.
                return
            out = libcudf.copying.copy_range(
                self, value, key_start, key_stop, 0
            )
        else:
            try:
                out = libcudf.copying.scatter(value, key, self)
            except RuntimeError as e:
                if "out of bounds" in str(e):
                    # Keep the low-level error as the cause for debugging
                    raise IndexError(
                        f"index out of bounds for column of size {len(self)}"
                    ) from e
                raise

        self._mimic_inplace(out, inplace=True)
Esempio n. 21
0
def as_column(arbitrary, nan_as_null=True, dtype=None, name=None):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical
    * Object exposing ``__cuda_array_interface__``

    Parameters
    ----------
    arbitrary : object
        Object to construct the Column from.
    nan_as_null : bool, optional
        If True (default), NaN values are treated as null for the input
        kinds that look at this flag (e.g. float device arrays and data
        routed through ``pyarrow.array``).
    dtype : optional
        Requested dtype. Not every input kind honours it; for example the
        ``Column`` and ``Buffer`` branches do not use it.
    name : optional
        Name to set on the result. Defaults to ``arbitrary.name`` when the
        input has such an attribute.

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input.
        - StringColumn for string input.
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index
    from cudf.bindings.cudf_cpp import np_to_pa_dtype

    if name is None and hasattr(arbitrary, "name"):
        name = arbitrary.name

    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(
            arbitrary.data,
            arbitrary.dtype,
            mask=arbitrary.mask,
            categories=categories,
        )

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudf.bindings.utils.mask_from_devary(data)
                data = data.set_mask(mask)

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        from cudf.bindings.cudf_cpp import count_nonzero_mask

        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(desc)
        mask = _mask_from_cuda_array_interface_desc(desc)

        # The null count is the number of elements not set in the mask
        if mask is not None:
            nelem = len(data.mem)
            nnz = count_nonzero_mask(mask.mem, size=nelem)
            null_count = nelem - nnz
        else:
            null_count = 0

        # This branch returns directly, so the name is passed in here
        return build_column(data,
                            dtype=data.dtype,
                            mask=mask,
                            name=name,
                            null_count=null_count)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype="int8")
            else:
                sbuf = np.empty(0, dtype="int8")
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype="int32")
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype="int8")

            data = as_column(
                nvstrings.from_offsets(sbuf,
                                       obuf,
                                       count,
                                       nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            # "empty" is a sentinel, not a real dtype string, so it (like
            # None) has to be resolved from the Arrow type *before* anything
            # is handed to pandas_dtype(), which rejects unknown strings.
            if dtype is None or (isinstance(dtype, str) and dtype == "empty"):
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype())
            else:
                new_dtype = pd.api.types.pandas_dtype(dtype)

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary), ), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary), ), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary,
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=dtype,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype="M8[ms]")
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype("M8[ms]"),
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            # (np.bool_ rather than the deprecated np.bool alias)
            dtype = np.dtype(np.bool_)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=dtype,
            )
        else:
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        # Convert every chunk on its own, then concatenate the pieces
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool_:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, "dtype"):
            data_type = np_to_pa_dtype(arbitrary.dtype)
            # PyArrow can't construct date64 or date32 arrays from np
            # datetime types
            if pa.types.is_date64(data_type) or pa.types.is_date32(data_type):
                arbitrary = arbitrary.astype("int64")
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary),
                         dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        # Generic fallback: try the buffer protocol first, then pyarrow,
        # then pandas / numpy.
        try:
            data = as_column(memoryview(arbitrary),
                             dtype=dtype,
                             nan_as_null=nan_as_null)
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        # Jump straight to the pandas fallback below
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    data = as_column(
                        pd.Series(arbitrary, dtype="category"),
                        nan_as_null=nan_as_null,
                    )
                else:
                    data = as_column(
                        np.array(arbitrary, dtype=np_type),
                        nan_as_null=nan_as_null,
                    )
    # Only set the name when the result has a name attribute and one is known
    if hasattr(data, "name") and (name is not None):
        data.name = name
    return data