Ejemplo n.º 1
0
def test_from_offsets():
    """Check ``nvstrings.from_offsets`` on host arrays with no null mask.

    Each case supplies the raw character bytes, the offsets array and the
    list of strings the resulting object is expected to compare equal to.
    The string count handed to ``from_offsets`` equals the number of
    expected strings.
    """
    cases = (
        # Five one-byte strings: every offset advances by a single byte.
        ([97, 112, 112, 108, 101],
         [0, 1, 2, 3, 4, 5],
         ['a', 'p', 'p', 'l', 'e']),
        # The repeated offset (5, 5) yields an empty middle string.
        ([97, 112, 112, 108, 101, 112, 101, 97, 114],
         [0, 5, 5, 9],
         ['apple', '', 'pear']),
    )

    for char_codes, boundaries, expected in cases:
        values = np.array(char_codes, dtype=np.int8)
        offsets = np.array(boundaries, dtype=np.int32)
        s = nvstrings.from_offsets(values, offsets, len(expected))
        assert_eq(s, expected)
Ejemplo n.º 2
0
def test_from_offsets():
    """Verify ``nvstrings.from_offsets`` for two null-free host inputs."""
    # Case 1: five one-byte strings, one per offset step.
    letter_bytes = np.array([97, 112, 112, 108, 101], dtype=np.int8)
    letter_offsets = np.array([0, 1, 2, 3, 4, 5], dtype=np.int32)
    assert_eq(
        nvstrings.from_offsets(letter_bytes, letter_offsets, 5),
        ["a", "p", "p", "l", "e"],
    )

    # Case 2: three strings; the repeated offset 5 makes the middle one empty.
    word_bytes = np.array(
        [97, 112, 112, 108, 101, 112, 101, 97, 114], dtype=np.int8
    )
    word_offsets = np.array([0, 5, 5, 9], dtype=np.int32)
    assert_eq(
        nvstrings.from_offsets(word_bytes, word_offsets, 3),
        ["apple", "", "pear"],
    )
Ejemplo n.º 3
0
def test_from_offsets_with_bitmask():
    """Check ``nvstrings.from_offsets`` when a null bitmask is supplied.

    The mask byte is 29 (binary 11101): of the low five bits only bit 1 is
    clear, and the test expects exactly the entry at index 1 to be ``None``.
    """
    char_bytes = np.array([97, 112, 112, 108, 101], dtype=np.int8)
    boundaries = np.array([0, 1, 2, 3, 4, 5], dtype=np.int32)
    validity = np.array([29], dtype=np.int8)

    # Positional arguments: values, offsets, string count, bitmask, null count.
    result = nvstrings.from_offsets(char_bytes, boundaries, 5, validity, 1)

    assert_eq(result, ['a', None, 'p', 'l', 'e'])
Ejemplo n.º 4
0
    def deserialize(cls, header, frames):
        """Rebuild an object from a serialization header and its frames.

        ``frames`` is expected to hold, in order, the null-mask buffer, the
        string-bytes buffer and the offsets buffer. ``memoryview`` frames are
        reinterpreted with the dtype recorded in the matching subheader and
        passed through ``cudautils.to_device``; other frames are used as-is.

        Returns ``typ(data)``, where ``typ`` is unpickled from
        ``header["type"]`` and ``data`` comes from ``nvstrings.from_offsets``.
        """
        pointers = []
        for index, buf in enumerate(frames):
            if isinstance(buf, memoryview):
                sub = header["subheaders"][index]
                host_array = np.frombuffer(buf, dtype=sub["dtype"])
                buf = cudautils.to_device(host_array)
            pointers.append(libcudf.cudf.get_ctype_ptr(buf))

        # Frame order is [nbuf, sbuf, obuf]; from_offsets takes the string
        # bytes first, then the offsets, with the null mask by keyword.
        string_count = header["nvstrings"]
        strings = nvstrings.from_offsets(
            pointers[1],
            pointers[2],
            string_count,
            nbuf=pointers[0],
            ncount=header["null_count"],
            bdevmem=True,
        )

        # NOTE(review): pickle.loads runs on header content; confirm headers
        # only ever come from trusted peers.
        wrapper_type = pickle.loads(header["type"])
        return wrapper_type(strings)
Ejemplo n.º 5
0
    def deserialize(cls, header, frames):
        """Rebuild a string column from a serialization header and frames.

        ``frames`` is expected to hold, in order, the null-mask buffer, the
        string-bytes buffer and the offsets buffer. The result of
        ``nvstrings.from_offsets`` is wrapped with ``column.as_column``.
        """

        def _normalize(buf):
            # Objects exposing the CUDA array interface are wrapped in place;
            # memoryviews go through numpy and cudautils.to_device; anything
            # else is passed along untouched.
            if hasattr(buf, "__cuda_array_interface__"):
                return cuda.as_cuda_array(buf)
            if isinstance(buf, memoryview):
                return cudautils.to_device(np.asarray(buf))
            return buf

        pointers = [
            libcudf.cudf.get_ctype_ptr(_normalize(buf)) for buf in frames
        ]

        # Frame order is [nbuf, sbuf, obuf]; from_offsets takes the string
        # bytes first, then the offsets, with the null mask by keyword.
        string_count = header["nvstrings"]
        strings = nvstrings.from_offsets(
            pointers[1],
            pointers[2],
            string_count,
            nbuf=pointers[0],
            ncount=header["null_count"],
            bdevmem=True,
        )
        return column.as_column(strings)
Ejemplo n.º 6
0
def test_from_offsets_ctypes_data():
    """Check ``nvstrings.from_offsets`` when given raw host addresses.

    Instead of the numpy arrays themselves, each buffer is passed as the
    integer address exposed by ``ndarray.ctypes.data``.
    """
    char_bytes = np.array(
        [97, 112, 112, 108, 101, 112, 101, 97, 114], dtype=np.int8
    )
    boundaries = np.array([0, 5, 5, 9], dtype=np.int32)
    # 5 is binary 101: bit 1 is clear and index 1 is expected to be None.
    validity = np.array([5], dtype=np.int8)

    result = nvstrings.from_offsets(
        char_bytes.ctypes.data,
        boundaries.ctypes.data,
        3,
        validity.ctypes.data,
        1,
    )

    assert_eq(result, ['apple', None, 'pear'])
Ejemplo n.º 7
0
def array_to_series(array):
    """Convert a pyarrow array-like object into a ``Series``.

    ``pa.ChunkedArray`` and ``pa.Column`` inputs are converted chunk by chunk
    and concatenated. For a single array the device buffers come from
    ``make_device_arrays``; dictionary arrays become a ``CategoricalColumn``,
    string arrays go through ``nvstrings.from_offsets``, and any other array
    with a data buffer is sliced to the array's offset/length window.
    """
    # Multi-chunk containers: recurse per chunk, then concatenate.
    if isinstance(array, pa.ChunkedArray):
        pieces = [array_to_series(chunk) for chunk in array.chunks]
        return Series._concat(pieces)
    if isinstance(array, pa.Column):
        pieces = [array_to_series(chunk) for chunk in array.data.chunks]
        return Series._concat(pieces)

    n_rows = len(array)
    n_nulls = array.null_count
    device_bufs = make_device_arrays(array)
    mask = device_bufs[0]
    data = device_bufs[1]
    dtype = arrow_to_pandas_dtype(array.type)

    if pa.types.is_dictionary(array.type):
        from cudf.core.column import CategoricalColumn

        code_series = array_to_series(array.indices)
        category_series = array_to_series(array.dictionary)
        data = CategoricalColumn(
            data=code_series.data,
            mask=mask,
            null_count=n_nulls,
            categories=category_series,
            ordered=array.type.ordered,
        )
    elif pa.types.is_string(array.type):
        import nvstrings

        # For strings, buffer 1 holds the offsets and buffer 2 the bytes.
        offsets_buf = device_bufs[1]
        chars_buf = device_bufs[2]
        first = array.offset
        # n_rows strings need n_rows + 1 offsets.
        offsets_buf = offsets_buf[first : first + n_rows + 1]

        chars_ptr = None
        if chars_buf is not None:
            chars_ptr = chars_buf.device_ctypes_pointer.value
        # From here on ``mask`` is a raw pointer value (or None).
        if mask is not None:
            mask = mask.device_ctypes_pointer.value

        data = nvstrings.from_offsets(
            chars_ptr,
            offsets_buf.device_ctypes_pointer.value,
            n_rows,
            mask,
            n_nulls,
            True,
        )
    elif data is not None:
        first = array.offset
        data = data[first : first + n_rows]

    result = Series(data, dtype=dtype)

    # Attach the mask only when there are nulls, a mask exists, and the
    # freshly built series does not already carry one.
    needs_mask = (
        n_nulls > 0 and mask is not None and not result.has_null_mask
    )
    return result.set_mask(mask, n_nulls) if needs_mask else result
Ejemplo n.º 8
0
def test_from_offsets_dev_data():
    """Check ``nvstrings.from_offsets`` when given device pointers.

    The host arrays are copied with ``rmm.to_device`` and the pointer values
    of the resulting device arrays are passed, with the final positional
    argument set to ``True``.
    """
    host_arrays = (
        np.array([97, 112, 112, 108, 101, 112, 101, 97, 114], dtype=np.int8),
        np.array([0, 5, 5, 9], dtype=np.int32),
        # 5 is binary 101: bit 1 is clear and index 1 is expected to be None.
        np.array([5], dtype=np.int8),
    )
    d_values, d_offsets, d_bitmask = (
        rmm.to_device(host) for host in host_arrays
    )

    result = nvstrings.from_offsets(
        d_values.device_ctypes_pointer.value,
        d_offsets.device_ctypes_pointer.value,
        3,
        d_bitmask.device_ctypes_pointer.value,
        1,
        True,
    )

    assert_eq(result, ['apple', None, 'pear'])
Ejemplo n.º 9
0
 def nvstrings(self):
     """Return the nvstrings object for this column, building it on first use.

     The result is stored in ``self._nvstrings`` and returned unchanged on
     later calls. An empty column yields ``nvstrings.to_device([])``;
     otherwise the object is built with ``nvstrings.from_offsets`` from the
     data pointers of ``children[1]`` and ``children[0]`` plus the optional
     mask pointer.
     """
     cached = self._nvstrings
     if cached is not None:
         return cached

     # Only nullable columns contribute a mask pointer.
     mask_ptr = self.mask.ptr if self.nullable else None

     if self.size == 0:
         built = nvstrings.to_device([])
     else:
         built = nvstrings.from_offsets(
             self.children[1].data.ptr,
             self.children[0].data.ptr,
             self.size,
             mask_ptr,
             ncount=self.null_count,
             bdevmem=True,
         )

     self._nvstrings = built
     return self._nvstrings
Ejemplo n.º 10
0
    def deserialize(cls, header, frames):
        """Rebuild nvstrings data from a serialization header and its frames.

        ``frames`` is expected to hold, in order, the null-mask buffer, the
        string-bytes buffer and the offsets buffer. Each frame is turned into
        a device array where needed, and its pointer is collected for
        ``nvstrings.from_offsets``.

        Parameters
        ----------
        header : dict
            Must provide ``"nvstrings"`` (string count) and ``"null_count"``.
            ``"subheaders"`` is read for every frame that needs conversion.
        frames : sequence
            ``memoryview`` objects, numpy arrays, device memory objects, or
            other buffer-like objects described by their subheader.

        Returns
        -------
        The object returned by ``nvstrings.from_offsets``.
        """
        # Deserialize the mask, value, and offset frames
        arrays = []

        for i, frame in enumerate(frames):
            if isinstance(frame, memoryview):
                sheader = header["subheaders"][i]
                dtype = sheader["dtype"]
                frame = np.frombuffer(frame, dtype=dtype)
                frame = cudautils.to_device(frame)
            elif not (isinstance(frame, np.ndarray)
                      or numba.cuda.driver.is_device_memory(frame)):
                # this is probably a ucp_py.BufferRegion memory object
                # check the header for info -- this should be encoded from
                # serialization process.  Lastly, `typestr` and `shape` *must*
                # be set manually *before* consuming the buffer as a
                # DeviceNDArray
                sheader = header["subheaders"][i]
                # Bind dtype from this frame's own subheader. Previously this
                # branch used a `dtype` that was either unbound or left over
                # from an earlier memoryview frame.
                dtype = sheader.get("dtype", "B")
                frame.typestr = dtype
                frame.shape = sheader.get("shape", len(frame))
                frame = np.frombuffer(frame, dtype=dtype)
                frame = cudautils.to_device(frame)

            arrays.append(get_ctype_ptr(frame))

        # Use from_offsets to get nvstring data.
        # Note: array items = [nbuf, sbuf, obuf]
        scount = header["nvstrings"]
        data = nvstrings.from_offsets(
            arrays[1],
            arrays[2],
            scount,
            nbuf=arrays[0],
            ncount=header["null_count"],
            bdevmem=True,
        )
        return data
Ejemplo n.º 11
0
def as_column(arbitrary, nan_as_null=True, dtype=None):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical

    The dispatch chain below additionally has branches for ``nvstrings``
    objects, ``pyarrow.ChunkedArray``, ``pandas.Series``,
    ``pandas.Timestamp``, scalars and ``memoryview``, plus a fallback that
    tries ``memoryview``, ``pyarrow.array`` and ``numpy.array`` in turn.
    Most branches normalize the input and call ``as_column`` again.

    Parameters
    ----------
    arbitrary : object
        The value to convert.
    nan_as_null : bool, default True
        For floating-point device arrays, when true a mask obtained from
        ``cudautils.mask_from_devary`` is set on the column (presumably
        marking NaN entries as null -- confirm against that helper). It is
        also forwarded as ``from_pandas`` to ``pyarrow.array`` in several
        branches.
    dtype : optional
        Consulted only by the ``pa.NullArray``, ``pa.ChunkedArray``,
        ``memoryview`` and fallback branches; the other branches ignore the
        value passed in.

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input
        - StringColumn for nvstrings input.
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    # Existing column: rebuild it from its data, dtype, mask and (if present)
    # categories.
    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(arbitrary.data,
                            arbitrary.dtype,
                            mask=arbitrary.mask,
                            categories=categories)

    # Series / Index: hand back the column they already wrap.
    elif isinstance(arbitrary, Series):
        data = arbitrary._column

    elif isinstance(arbitrary, Index):
        data = arbitrary._values

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    # Numba device array: wrap in a Buffer; float data may also get a mask.
    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif cuda.is_cuda_array(arbitrary):
        # Use cuda array interface to do create a numba device array by
        # reference
        new_dev_array = cuda.as_cuda_array(arbitrary)

        # Allocate new output array using rmm and copy the numba device array
        # to an rmm owned device array
        out_dev_array = rmm.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)

        data = as_column(out_dev_array)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags['C_CONTIGUOUS']:
            arbitrary = np.ascontiguousarray(arbitrary)
        # dtype kind 'M' is datetime64; 'O' / 'U' are object / unicode and
        # are routed through pyarrow.
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ('O', 'U'):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    # pyarrow arrays: dispatch on the concrete array class.
    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            # buffers() is indexed as [0] null bitmask, [1] offsets,
            # [2] string bytes in the code below.
            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype='int8')
            else:
                sbuf = np.empty(0, dtype='int8')
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype='int32')
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype='int8')

            data = as_column(
                nvstrings.from_offsets(sbuf,
                                       obuf,
                                       count,
                                       nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            # With no usable dtype requested, fall back to the pandas dtype
            # pyarrow reports for the null array.
            new_dtype = dtype
            if (type(dtype) == str and dtype == 'empty') or dtype is None:
                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())

            if pd.api.types.is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary), ), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary), ), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            # Timestamps are cast to millisecond resolution before use.
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            # (this rebinds the local ``dtype``, discarding the argument)
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            # Any other pyarrow array is treated as numerical.
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    # Chunked pyarrow array: convert each chunk, then concatenate.
    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != 'empty':
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = 'category'
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    # pandas containers are converted through pyarrow.
    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    # Scalars become a one-element pyarrow array.
    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary),
                         dtype=dtype,
                         nan_as_null=nan_as_null)

    # Fallback: try the buffer protocol, then pyarrow, then numpy / pandas.
    else:
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            try:
                pa_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        # Jump straight to the handler below, which builds
                        # the categorical via pandas.
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = _gdf.np_to_pa_dtype(np.dtype(dtype).type)
                data = as_column(pa.array(arbitrary,
                                          type=pa_type,
                                          from_pandas=nan_as_null),
                                 nan_as_null=nan_as_null)
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                np_type = None
                if pd.api.types.is_categorical_dtype(dtype):
                    data = as_column(pd.Series(arbitrary, dtype='category'),
                                     nan_as_null=nan_as_null)
                else:
                    if dtype is None:
                        np_type = None
                    else:
                        np_type = np.dtype(dtype)
                    data = as_column(np.array(arbitrary, dtype=np_type),
                                     nan_as_null=nan_as_null)

    return data
Ejemplo n.º 12
0
import nvstrings
import numpy as np
# Demo script: builds nvstrings objects from raw byte / offset buffers with
# nvstrings.from_offsets and prints the inputs and results.

# Bytes 97, 112, 112, 108, 101 are the ASCII codes for "apple".
values = np.array([97, 112, 112, 108, 101], dtype=np.int8)
print("values", values.tobytes())
# Six offsets delimit five one-byte strings.
offsets = np.array([0, 1, 2, 3, 4, 5], dtype=np.int32)
print("offsets", offsets)
s = nvstrings.from_offsets(values, offsets, 5)
print(s)

# Same data with a bitmask: 29 is binary 11101, so of the low five bits only
# bit 1 is clear; the trailing 1 is the null count passed alongside it.
bitmask = np.array([29], dtype=np.int8)
print("bitmask", bitmask.tobytes())
s = nvstrings.from_offsets(values, offsets, 5, bitmask, 1)
print(s)

print("------------------")
# Second data set: nine bytes split by offsets [0, 5, 5, 9] into three
# strings, the middle one having zero length.
values = np.array([97, 112, 112, 108, 101, 112, 101, 97, 114], dtype=np.int8)
print("values", values.tobytes())
offsets = np.array([0, 5, 5, 9], dtype=np.int32)
print("offsets", offsets)
s = nvstrings.from_offsets(values, offsets, 3)
print(s)

# 5 is binary 101: bit 1 is clear, again with a null count of 1.
bitmask = np.array([5], dtype=np.int8)
print("bitmask", bitmask.tobytes())
s = nvstrings.from_offsets(values, offsets, 3, bitmask, 1)
print(s)

# Show the raw host addresses of the three numpy buffers.
print("values.ctypes.data", hex(values.ctypes.data))
print("offsets.ctypes.data", hex(offsets.ctypes.data))
print("bitmask.ctypes.data", hex(bitmask.ctypes.data))
s = nvstrings.from_offsets(values.ctypes.data, offsets.ctypes.data, 3,
Ejemplo n.º 13
0
def as_column(arbitrary, nan_as_null=True, dtype=None, name=None):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical
    * Object exposing ``__cuda_array_interface__``

    The dispatch chain below additionally has branches for ``nvstrings``
    objects, ``pyarrow.ChunkedArray``, ``pandas.Series``,
    ``pandas.Timestamp``, scalars and ``memoryview``, plus a fallback that
    tries ``memoryview``, ``pyarrow.array`` and ``numpy.array`` in turn.
    Most branches normalize the input and call ``as_column`` again.

    Parameters
    ----------
    arbitrary : object
        The value to convert.
    nan_as_null : bool, default True
        For floating-point device arrays, when true a mask obtained from
        ``cudf.bindings.utils.mask_from_devary`` is set on the column
        (presumably marking NaN entries as null -- confirm against that
        helper). It is also forwarded as ``from_pandas`` to
        ``pyarrow.array`` in several branches.
    dtype : optional
        Applied via ``astype`` for ``Series``, ``Index`` and numpy inputs,
        and consulted by the ``pa.NullArray``, ``pa.ChunkedArray``,
        ``memoryview`` and fallback branches.
    name : optional
        When ``None`` and ``arbitrary`` has a ``name`` attribute, that
        attribute is used. The name is assigned to the result if the result
        has a ``name`` attribute.

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input.
        - StringColumn for string input.
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index
    from cudf.bindings.cudf_cpp import np_to_pa_dtype

    # Inherit the input's name unless the caller supplied one.
    if name is None and hasattr(arbitrary, "name"):
        name = arbitrary.name

    # Existing column: rebuild it from its data, dtype, mask and (if present)
    # categories.
    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(
            arbitrary.data,
            arbitrary.dtype,
            mask=arbitrary.mask,
            categories=categories,
        )

    # Series / Index: take the wrapped column, casting if a dtype was given.
    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    # Numba device array: wrap in a Buffer; float data may also get a mask.
    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudf.bindings.utils.mask_from_devary(data)
                data = data.set_mask(mask)

    # Generic __cuda_array_interface__ object: build data and mask from the
    # interface description and return immediately (the name is passed to
    # build_column here rather than assigned at the end of the function).
    elif hasattr(arbitrary, "__cuda_array_interface__"):
        from cudf.bindings.cudf_cpp import count_nonzero_mask

        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(desc)
        mask = _mask_from_cuda_array_interface_desc(desc)

        if mask is not None:
            # Null count = element count minus the count reported by
            # count_nonzero_mask.
            nelem = len(data.mem)
            nnz = count_nonzero_mask(mask.mem, size=nelem)
            null_count = nelem - nnz
        else:
            null_count = 0

        return build_column(data,
                            dtype=data.dtype,
                            mask=mask,
                            name=name,
                            null_count=null_count)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        # dtype kind "M" is datetime64; "O" / "U" are object / unicode and
        # are routed through pyarrow.
        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    # pyarrow arrays: dispatch on the concrete array class.
    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            # buffers() is indexed as [0] null bitmask, [1] offsets,
            # [2] string bytes in the code below.
            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype="int8")
            else:
                sbuf = np.empty(0, dtype="int8")
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype="int32")
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype="int8")

            data = as_column(
                nvstrings.from_offsets(sbuf,
                                       obuf,
                                       count,
                                       nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            # NOTE(review): pandas_dtype(dtype) runs before the check for
            # dtype being None or "empty" -- confirm it accepts both values
            # without raising.
            new_dtype = pd.api.types.pandas_dtype(dtype)
            if (type(dtype) == str and dtype == "empty") or dtype is None:
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype())

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary), ), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary), ), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary,
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            # Keep the array's own time unit in the numpy datetime dtype
            # (this rebinds the local ``dtype``).
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=dtype,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype="M8[ms]")
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype("M8[ms]"),
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            # The array is cast to int32 and the resulting column is then
            # converted with astype("M8[ms]").
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            # (this rebinds the local ``dtype``, discarding the argument)
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=dtype,
            )
        else:
            # Any other pyarrow array is treated as numerical.
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
            )

    # Chunked pyarrow array: convert each chunk, then concatenate.
    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    # pandas containers are converted through pyarrow.
    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    # Scalars become a one-element pyarrow array.
    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, "dtype"):
            data_type = np_to_pa_dtype(arbitrary.dtype)
            # PyArrow can't construct date64 or date32 arrays from np
            # datetime types
            if pa.types.is_date64(data_type) or pa.types.is_date32(data_type):
                arbitrary = arbitrary.astype("int64")
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary),
                         dtype=dtype,
                         nan_as_null=nan_as_null)

    # Fallback: try the buffer protocol, then pyarrow, then numpy / pandas.
    else:
        try:
            data = as_column(memoryview(arbitrary),
                             dtype=dtype,
                             nan_as_null=nan_as_null)
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        # Jump straight to the handler below, which builds
                        # the categorical via pandas.
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    data = as_column(
                        pd.Series(arbitrary, dtype="category"),
                        nan_as_null=nan_as_null,
                    )
                else:
                    data = as_column(
                        np.array(arbitrary, dtype=np_type),
                        nan_as_null=nan_as_null,
                    )
    # Propagate the resolved name onto the result when it supports one.
    if hasattr(data, "name") and (name is not None):
        data.name = name
    return data