Example 1
File: _gdf.py Project: cuulee/cudf
def quantile(column, quant, method, exact):
    """ Calculate the `quant` quantile for the column
    Returns value with the quantile specified by quant
    """
    gdf_context = ffi.new('gdf_context*')
    method_api = _join_method_api['sort']
    libgdf.gdf_context_view(gdf_context, 0, method_api, 0, 0, 0)
    # libgdf.gdf_context_view(gdf_context, 0, method_api, 0)
    # px = ffi.new("double *")
    res = []
    for q in quant:
        px = ffi.new("double *")
        if exact:
            libgdf.gdf_quantile_exact(column.cffi_view,
                                      get_quantile_method(method),
                                      q,
                                      ffi.cast('void *', px),
                                      gdf_context)
        else:
            libgdf.gdf_quantile_aprrox(column.cffi_view,
                                       q,
                                       ffi.cast('void *', px),
                                       gdf_context)
        res.append(px[0])
    return res
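
The quantile call above uses cffi's out-parameter idiom: a one-element C double is allocated with ffi.new and the library writes the result into it before Python reads it back. A minimal stand-alone sketch of that idiom, assuming only the cffi package (no libgdf required):

import cffi

ffi = cffi.FFI()
px = ffi.new("double *")   # allocate one C double, zero-initialised
px[0] = 0.25               # a C callee such as gdf_quantile_exact would write here
print(px[0])               # read the result back into Python
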
Example 2
def apply_join(col_lhs, col_rhs, how, method='hash'):
    """Returns a tuple of the left and right joined indices as gpu arrays.
    """
    if (len(col_lhs) != len(col_rhs)):
        msg = "Unequal #columns in list 'col_lhs' and list 'col_rhs'"
        raise ValueError(msg)

    joiner = _join_how_api[how]
    method_api = _join_method_api[method]
    gdf_context = ffi.new('gdf_context*')

    if method == 'hash':
        libgdf.gdf_context_view(gdf_context, 0, method_api, 0)
    elif method == 'sort':
        libgdf.gdf_context_view(gdf_context, 1, method_api, 0)
    else:
        msg = "method not supported"
        raise ValueError(msg)

    col_result_l = columnview(0, None, dtype=np.int32)
    col_result_r = columnview(0, None, dtype=np.int32)

    if (how in ['left', 'inner']):
        list_lhs = []
        list_rhs = []
        for i in range(len(col_lhs)):
            list_lhs.append(col_lhs[i].cffi_view)
            list_rhs.append(col_rhs[i].cffi_view)

        # Call libgdf

        joiner(len(col_lhs), list_lhs, list_rhs, col_result_l, col_result_r,
               gdf_context)
    else:
        joiner(col_lhs[0].cffi_view, col_rhs[0].cffi_view, col_result_l,
               col_result_r)

    # Extract result

    # yield ((ary[0], ary[1]) if datasize > 0 else (ary, ary))

    left = _as_numba_devarray(intaddr=int(
        ffi.cast("uintptr_t", col_result_l.data)),
                              nelem=col_result_l.size,
                              dtype=np.int32)

    right = _as_numba_devarray(intaddr=int(
        ffi.cast("uintptr_t", col_result_r.data)),
                               nelem=col_result_r.size,
                               dtype=np.int32)

    yield (left, right)

    libgdf.gdf_column_free(col_result_l)
    libgdf.gdf_column_free(col_result_r)
Example 3
    def _parse_metdata(self):
        "Parse the metadata in the IPC handle"
        from libgdf_cffi import ffi, libgdf

        @contextmanager
        def open_parser(schema_ptr, schema_len):
            "context to destroy the parser"
            _logger.debug('open IPCParser')
            ipcparser = libgdf.gdf_ipc_parser_open(schema_ptr, schema_len)
            yield ipcparser
            _logger.debug('close IPCParser')
            libgdf.gdf_ipc_parser_close(ipcparser)

        def check_error(ipcparser):
            if libgdf.gdf_ipc_parser_failed(ipcparser):
                raw_error = libgdf.gdf_ipc_parser_get_error(ipcparser)
                error = ffi.string(raw_error).decode()
                _logger.error('IPCParser failed: %s', error)
                raise MetadataParsingError(error)

        def load_json(jsonraw):
            jsontext = ffi.string(jsonraw).decode()
            return json.loads(jsontext)

        # get void* from the gpu array
        schema_ptr = ffi.cast("void*", self._schema_data.ctypes.data)

        # parse schema
        with open_parser(schema_ptr, len(self._schema_data)) as ipcparser:
            # check for failure in parsing the schema
            check_error(ipcparser)

            gpu_addr = self._gpu_data.device_ctypes_pointer.value
            gpu_ptr = ffi.cast("void*", gpu_addr)
            libgdf.gdf_ipc_parser_open_recordbatches(ipcparser, gpu_ptr,
                                                     self._gpu_data.size)
            # check for failure in parsing the recordbatches
            check_error(ipcparser)
            # get schema as json
            _logger.debug('IPCParser get metadata as json')
            schemadct = load_json(
                libgdf.gdf_ipc_parser_get_schema_json(ipcparser))
            layoutdct = load_json(
                libgdf.gdf_ipc_parser_get_layout_json(ipcparser))

            # get data offset
            _logger.debug('IPCParser data region offset')
            dataoffset = libgdf.gdf_ipc_parser_get_data_offset(ipcparser)
            dataoffset = int(ffi.cast('uint64_t', dataoffset))
            dataptr = self._gpu_data[dataoffset:]

        return schemadct, layoutdct, dataptr
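
open_parser above is the usual open/close pairing expressed with contextlib.contextmanager. A stand-alone sketch of the same pattern, standard library only, with a dummy object standing in for the libgdf IPC parser:

from contextlib import contextmanager

@contextmanager
def open_resource(blob):
    parser = {'data': blob}        # stands in for gdf_ipc_parser_open
    try:
        yield parser               # the body of the with-block runs here
    finally:
        parser.clear()             # stands in for gdf_ipc_parser_close

with open_resource(b'schema-bytes') as p:
    print(len(p['data']))
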
Example 4
    def _parse_metdata(self):
        "Parse the metadata in the IPC handle"
        from libgdf_cffi import ffi, libgdf

        @contextmanager
        def open_parser(schema_ptr, schema_len):
            "context to destroy the parser"
            _logger.debug('open IPCParser')
            ipcparser = libgdf.gdf_ipc_parser_open(schema_ptr, schema_len)
            yield ipcparser
            _logger.debug('close IPCParser')
            libgdf.gdf_ipc_parser_close(ipcparser)

        def check_error(ipcparser):
            if libgdf.gdf_ipc_parser_failed(ipcparser):
                raw_error = libgdf.gdf_ipc_parser_get_error(ipcparser)
                error = ffi.string(raw_error).decode()
                _logger.error('IPCParser failed: %s', error)
                raise MetadataParsingError(error)

        def load_json(jsonraw):
            jsontext = ffi.string(jsonraw).decode()
            return json.loads(jsontext)

        # get void* from the gpu array
        schema_ptr = ffi.cast("void*", self._schema_data.ctypes.data)

        # parse schema
        with open_parser(schema_ptr, len(self._schema_data)) as ipcparser:
            # check for failure in parsing the schema
            check_error(ipcparser)

            gpu_addr = self._gpu_data.device_ctypes_pointer.value
            gpu_ptr = ffi.cast("void*", gpu_addr)
            libgdf.gdf_ipc_parser_open_recordbatches(ipcparser, gpu_ptr,
                                                     self._gpu_data.size)
            # check for failure in parsing the recordbatches
            check_error(ipcparser)
            # get schema as json
            _logger.debug('IPCParser get metadata as json')
            schemadct = load_json(
                libgdf.gdf_ipc_parser_get_schema_json(ipcparser))
            layoutdct = load_json(
                libgdf.gdf_ipc_parser_get_layout_json(ipcparser))

            # get data offset
            _logger.debug('IPCParser data region offset')
            dataoffset = libgdf.gdf_ipc_parser_get_data_offset(ipcparser)
            dataoffset = int(ffi.cast('uint64_t', dataoffset))
            dataptr = self._gpu_data[dataoffset:]

        return schemadct, layoutdct, dataptr
Example 5
def _call_hash_multi(api, ncols, col_input, magic, nrows):
    out_ary = np.zeros(nrows, dtype=np.int32)
    d_out = cuda.to_device(out_ary)
    col_out = new_column()
    libgdf.gdf_column_view(col_out, unwrap_devary(d_out), ffi.NULL,
                           out_ary.size, get_dtype(d_out.dtype))

    api(ncols, col_input, magic, col_out)

    dataptr = col_out.data
    print(dataptr)
    datasize = col_out.size
    print(datasize)

    addr = ctypes.c_uint64(int(ffi.cast("uintptr_t", dataptr)))
    print(hex(addr.value))
    memptr = cuda.driver.MemoryPointer(context=cuda.current_context(),
                                       pointer=addr,
                                       size=4 * datasize)
    print(memptr)
    ary = cuda.devicearray.DeviceNDArray(shape=(datasize, ),
                                         strides=(4, ),
                                         dtype=np.dtype(np.int32),
                                         gpu_data=memptr)

    hashed_result = ary.copy_to_host()
    print(hashed_result)

    return hashed_result
Example 6
def apply_join(col_lhs, col_rhs, how):
    """Returns a tuple of the left and right joined indices as gpu arrays.
    """
    if (len(col_lhs) != len(col_rhs)):
        msg = "Unequal #columns in list 'col_lhs' and list 'col_rhs'"
        raise ValueError(msg)

    joiner = _join_how_api[how]
    join_result_ptr = ffi.new("gdf_join_result_type**", None)

    if (how == 'left'):
        list_lhs = []
        list_rhs = []
        for i in range(len(col_lhs)):
            list_lhs.append(col_lhs[i].cffi_view)
            list_rhs.append(col_rhs[i].cffi_view)

        # Call libgdf
        joiner(len(col_lhs), list_lhs, list_rhs, join_result_ptr)
    else:
        joiner(col_lhs[0].cffi_view, col_rhs[0].cffi_view, join_result_ptr)

    # Extract result
    join_result = join_result_ptr[0]
    dataptr = libgdf.gdf_join_result_data(join_result)
    datasize = libgdf.gdf_join_result_size(join_result)
    ary = _as_numba_devarray(intaddr=int(ffi.cast("uintptr_t", dataptr)),
                             nelem=datasize,
                             dtype=np.int32)
    ary = ary.reshape(2, datasize // 2)
    yield ((ary[0], ary[1]) if datasize > 0 else (ary, ary))
    libgdf.gdf_join_result_free(join_result)
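
Note that apply_join is written as a generator: the joined indices are yielded first and gdf_join_result_free only runs when the caller resumes (or closes) the generator. A small sketch of this yield-then-free pattern, with plain Python objects standing in for the libgdf result:

def produce():
    resource = [1, 2, 3]          # stands in for the libgdf join result
    yield tuple(resource)         # the caller consumes the result here
    resource.clear()              # stands in for gdf_join_result_free

for left_right in produce():      # the free runs when the loop resumes the generator
    print(left_right)
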
Example 7
def _call_join_multi(api, ncols, col_left, col_right, ctxt):
    join_result_ptr = ffi.new("gdf_join_result_type**", None)

    api(ncols, col_left, col_right, join_result_ptr, ctxt)
    join_result = join_result_ptr[0]
    print('join_result', join_result)

    dataptr = libgdf.gdf_join_result_data(join_result)
    print(dataptr)
    datasize = libgdf.gdf_join_result_size(join_result)
    print(datasize)

    addr = ctypes.c_uint64(int(ffi.cast("uintptr_t", dataptr)))
    print(hex(addr.value))
    memptr = cuda.driver.MemoryPointer(context=cuda.current_context(),
                                       pointer=addr,
                                       size=4 * datasize)
    print(memptr)
    ary = cuda.devicearray.DeviceNDArray(shape=(datasize, ),
                                         strides=(4, ),
                                         dtype=np.dtype(np.int32),
                                         gpu_data=memptr)

    joined_idx = ary.reshape(2, datasize // 2).copy_to_host()
    print(joined_idx)

    libgdf.gdf_join_result_free(join_result)
    return joined_idx
Example 8
def cffi_view_to_column_mem(cffi_view):
    data = _as_numba_devarray(intaddr=int(ffi.cast("uintptr_t",
                                                   cffi_view.data)),
                              nelem=cffi_view.size,
                              dtype=gdf_to_np_dtype(cffi_view.dtype),
                              cb_dtor=cuda.driver.driver.cuMemFree)

    if cffi_view.valid:
        mask = _as_numba_devarray(
            intaddr=int(ffi.cast("uintptr_t", cffi_view.valid)),
            nelem=calc_chunk_size(cffi_view.size, mask_bitsize),
            dtype=mask_dtype,
            cb_dtor=cuda.driver.driver.cuMemFree)
    else:
        mask = None

    return data, mask
Example 9
def cffi_view_to_column_mem(cffi_view):
    intaddr = int(ffi.cast("uintptr_t", cffi_view.data))
    data = rmm.device_array_from_ptr(intaddr,
                                     nelem=cffi_view.size,
                                     dtype=gdf_to_np_dtype(cffi_view.dtype),
                                     finalizer=rmm._make_finalizer(intaddr, 0))

    if cffi_view.valid:
        intaddr = int(ffi.cast("uintptr_t", cffi_view.valid))
        mask = rmm.device_array_from_ptr(
            intaddr,
            nelem=calc_chunk_size(cffi_view.size, mask_bitsize),
            dtype=mask_dtype,
            finalizer=rmm._make_finalizer(intaddr, 0))
    else:
        mask = None

    return data, mask
Example 10
def _copy_int_col_to_arr(col):
    dataptr = col.data
    datasize = col.size
    addr = ctypes.c_uint64(int(ffi.cast("uintptr_t", dataptr)))
    memptr = cuda.driver.MemoryPointer(context=cuda.current_context(),
                                       pointer=addr,
                                       size=4 * datasize)
    ary = cuda.devicearray.DeviceNDArray(shape=(datasize, ),
                                         strides=(4, ),
                                         dtype=np.dtype(np.int32),
                                         gpu_data=memptr)
    return ary.copy_to_host()
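
The same pointer-rewrapping steps can be exercised without libgdf by taking the address of an existing Numba device array and viewing that memory again, as in this sketch (assumes a CUDA-capable GPU with numba installed):

import ctypes
import numpy as np
from numba import cuda

src = cuda.to_device(np.arange(8, dtype=np.int32))       # some device memory
addr = ctypes.c_uint64(src.device_ctypes_pointer.value)  # raw device address
memptr = cuda.driver.MemoryPointer(context=cuda.current_context(),
                                   pointer=addr,
                                   size=4 * src.size)
view = cuda.devicearray.DeviceNDArray(shape=(src.size, ),
                                      strides=(4, ),
                                      dtype=np.dtype(np.int32),
                                      gpu_data=memptr)
print(view.copy_to_host())                                # same values as src
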
Example 11
def cffi_view_to_column_mem(cffi_view):
    gdf_dtype = cffi_view.dtype
    if gdf_dtype == libgdf.GDF_STRING_CATEGORY:
        data_ptr = int(ffi.cast("uintptr_t", cffi_view.data))
        # We need to create this just to make sure the memory is properly freed
        data = rmm.device_array_from_ptr(data_ptr,
                                         nelem=cffi_view.size,
                                         dtype='int32',
                                         finalizer=rmm._make_finalizer(
                                             data_ptr, 0))
        nvcat_ptr = int(ffi.cast("uintptr_t", cffi_view.dtype_info.category))
        nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
        nvstr_obj = nvcat_obj.to_strings()
        mask = None
        if cffi_view.valid:
            mask_ptr = int(ffi.cast("uintptr_t", cffi_view.valid))
            mask = rmm.device_array_from_ptr(
                mask_ptr,
                nelem=calc_chunk_size(cffi_view.size, mask_bitsize),
                dtype=mask_dtype,
                finalizer=rmm._make_finalizer(mask_ptr, 0))
        return nvstr_obj, mask
    else:
        intaddr = int(ffi.cast("uintptr_t", cffi_view.data))
        data = rmm.device_array_from_ptr(
            intaddr,
            nelem=cffi_view.size,
            dtype=gdf_to_np_dtype(cffi_view.dtype),
            finalizer=rmm._make_finalizer(intaddr, 0))
        mask = None
        if cffi_view.valid:
            intaddr = int(ffi.cast("uintptr_t", cffi_view.valid))
            mask = rmm.device_array_from_ptr(
                intaddr,
                nelem=calc_chunk_size(cffi_view.size, mask_bitsize),
                dtype=mask_dtype,
                finalizer=rmm._make_finalizer(intaddr, 0))

        return data, mask
Example 12
    def _parse_metdata(self):
        "Parse the metadata in the IPC handle"
        from libgdf_cffi import ffi, libgdf

        @contextmanager
        def open_parser(devptr):
            "context to destroy the parser"
            _logger.debug('open IPCParser')
            ipcparser = libgdf.gdf_ipc_parser_open(devptr)
            yield ipcparser
            _logger.debug('close IPCParser')
            libgdf.gdf_ipc_parser_close(ipcparser)

        # get void* from the gpu array
        devptr = ffi.cast("void*", self._gpu_data.device_ctypes_pointer.value)

        # parse
        with open_parser(devptr) as ipcparser:
            # check for failure
            if libgdf.gdf_ipc_parser_failed(ipcparser):
                raw_error = libgdf.gdf_ipc_parser_get_error(ipcparser)
                error = ffi.string(raw_error).decode()
                _logger.error('IPCParser failed: %s', error)
                raise MetadataParsingError(error)

            # get schema as json
            _logger.debug('IPCParser get metadata as json')
            jsonraw = libgdf.gdf_ipc_parser_to_json(ipcparser)
            jsontext = ffi.string(jsonraw).decode()
            outdct = json.loads(jsontext)

            # get data offset
            _logger.debug('IPCParser data region offset')
            dataoffset = libgdf.gdf_ipc_parser_get_data_offset(ipcparser)
            dataoffset = int(ffi.cast('uint64_t', dataoffset))
            dataptr = self._gpu_data[dataoffset:]

        return outdct, dataptr
Example 13
def apply_join(col_lhs, col_rhs, how):
    """Returns a tuple of the left and right joined indices as gpu arrays.
    """
    joiner = _join_how_api[how]
    join_result_ptr = ffi.new("gdf_join_result_type**", None)
    # Call libgdf
    joiner(col_lhs.cffi_view, col_rhs.cffi_view, join_result_ptr)
    # Extract result
    join_result = join_result_ptr[0]
    dataptr = libgdf.gdf_join_result_data(join_result)
    datasize = libgdf.gdf_join_result_size(join_result)
    ary = _as_numba_devarray(intaddr=int(ffi.cast("uintptr_t", dataptr)),
                             nelem=datasize, dtype=np.int32)
    ary = ary.reshape(2, datasize // 2)
    yield ((ary[0], ary[1]) if datasize > 0 else (ary, ary))
    libgdf.gdf_join_result_free(join_result)
Example 14
def apply_join(col_lhs, col_rhs, how):
    """Returns a tuple of the left and right joined indices as gpu arrays.
    """
    joiner = _join_how_api[how]
    join_result_ptr = ffi.new("gdf_join_result_type**", None)
    # Call libgdf
    joiner(col_lhs.cffi_view, col_rhs.cffi_view, join_result_ptr)
    # Extract result
    join_result = join_result_ptr[0]
    dataptr = libgdf.gdf_join_result_data(join_result)
    datasize = libgdf.gdf_join_result_size(join_result)
    ary = _as_numba_devarray(intaddr=int(ffi.cast("uintptr_t", dataptr)),
                             nelem=datasize, dtype=np.int32)
    ary = ary.reshape(2, datasize // 2)
    yield ((ary[0], ary[1]) if datasize > 0 else (ary, ary))
    libgdf.gdf_join_result_free(join_result)
Example 15
File: _gdf.py Project: cuulee/cudf
def nvtx_range_push(name, color='green'):
    """
    Demarcate the beginning of a user-defined NVTX range.

    Parameters
    ----------
    name : str
        The name of the NVTX range
    color : str
        The color to use for the range.
        Can be named color or hex RGB string.
    """
    name_c = ffi.new("char[]", name.encode('ascii'))

    try:
        color = int(color, 16)  # only works if color is a hex string
        libgdf.gdf_nvtx_range_push_hex(name_c, ffi.cast('unsigned int', color))
    except ValueError:
        color = str_to_gdf_color(color)
        libgdf.gdf_nvtx_range_push(name_c, color)
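
The try/except above is just a hex-or-named-colour fallback. A sketch of that parsing logic on its own, with a made-up colour table standing in for str_to_gdf_color:

def parse_color(color, named={'green': 0x00FF00, 'purple': 0x800080}):
    try:
        return int(color, 16)      # succeeds only for hex strings, e.g. 'FF8800'
    except ValueError:
        return named[color]        # otherwise fall back to a named colour

print(hex(parse_color('green')), hex(parse_color('FF8800')))
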
Example 16
def _columnview(size, data, mask, dtype, null_count, nvcat):
    colview = ffi.new('gdf_column*')
    extra_dtype_info = ffi.new('gdf_dtype_extra_info*')
    extra_dtype_info.time_unit = libgdf.TIME_UNIT_NONE
    if nvcat is not None:
        extra_dtype_info.category = ffi.cast('void*', nvcat.get_cpointer())
    else:
        extra_dtype_info.category = ffi.NULL

    if mask is None:
        null_count = 0
        mask = ffi.NULL

    libgdf.gdf_column_view_augmented(
        colview,
        data,
        mask,
        size,
        np_to_gdf_dtype(dtype),
        null_count,
        extra_dtype_info[0],
    )

    return colview
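
_columnview fills a cffi struct allocated with ffi.new. The allocation and field-assignment pattern can be tried with plain cffi and a simplified stand-in struct (this is not libgdf's real gdf_column layout):

import cffi

ffi = cffi.FFI()
ffi.cdef("typedef struct { void *data; void *valid; int size; } fake_column;")
col = ffi.new("fake_column*")        # zero-initialised struct
col.data = ffi.NULL
col.valid = ffi.NULL
col.size = 0
print(col.size, col.data == ffi.NULL)
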
Example 17
def test_ipc():
    schema_bytes = b'\xa8\x01\x00\x00\x10\x00\x00\x00\x0c\x00\x0e\x00\x06\x00\x05\x00\x08\x00\x00\x00\x0c\x00\x00\x00\x00\x01\x02\x00\x10\x00\x00\x00\x00\x00\n\x00\x08\x00\x00\x00\x04\x00\x00\x00\n\x00\x00\x00\x04\x00\x00\x00\x03\x00\x00\x00\x18\x01\x00\x00p\x00\x00\x00\x04\x00\x00\x00\x08\xff\xff\xff\x00\x00\x01\x03@\x00\x00\x00$\x00\x00\x00\x14\x00\x00\x00\x04\x00\x00\x00\x02\x00\x00\x00$\x00\x00\x00\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x08\x00\x06\x00\x06\x00\x00\x00\x00\x00\x02\x00\xe8\xfe\xff\xff@\x00\x01\x00\xf0\xfe\xff\xff\x01\x00\x02\x00\x06\x00\x00\x00weight\x00\x00\x14\x00\x1e\x00\x08\x00\x06\x00\x07\x00\x0c\x00\x10\x00\x14\x00\x18\x00\x00\x00\x14\x00\x00\x00\x00\x00\x01\x05|\x00\x00\x00T\x00\x00\x00\x18\x00\x00\x00D\x00\x00\x000\x00\x00\x00\x00\x00\n\x00\x14\x00\x08\x00\x04\x00\x00\x00\n\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00p\xff\xff\xff\x00\x00\x00\x01 \x00\x00\x00\x03\x00\x00\x000\x00\x00\x00$\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x04\x00\x04\x00\x04\x00\x00\x00|\xff\xff\xff\x08\x00\x01\x00\x08\x00\x08\x00\x06\x00\x00\x00\x08\x00\x00\x00\x00\x00 \x00\x94\xff\xff\xff\x01\x00\x02\x00\x04\x00\x00\x00name\x00\x00\x00\x00\x14\x00\x18\x00\x08\x00\x06\x00\x07\x00\x0c\x00\x00\x00\x10\x00\x14\x00\x00\x00\x14\x00\x00\x00\x00\x00\x01\x02L\x00\x00\x00$\x00\x00\x00\x14\x00\x00\x00\x04\x00\x00\x00\x02\x00\x00\x000\x00\x00\x00\x1c\x00\x00\x00\x00\x00\x00\x00\x08\x00\x0c\x00\x08\x00\x07\x00\x08\x00\x00\x00\x00\x00\x00\x01 \x00\x00\x00\xf8\xff\xff\xff \x00\x01\x00\x08\x00\x08\x00\x04\x00\x06\x00\x08\x00\x00\x00\x01\x00\x02\x00\x03\x00\x00\x00idx\x00\xc8\x00\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x14\x00\x06\x00\x05\x00\x08\x00\x0c\x00\x0c\x00\x00\x00\x00\x02\x02\x00\x14\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\x08\x00\x12\x00\x08\x00\x04\x00\x08\x00\x00\x00\x18\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\x00\x18\x00\x0c\x00\x04\x00\x08\x00\n\x00\x00\x00d\x00\x00\x00\x10\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x0b\x00\x00\x00\x0f\x00\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00orangeapplepeargrape\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
    cpu_data = np.ndarray(shape=len(schema_bytes), dtype=np.byte,
                          buffer=bytearray(schema_bytes))

    # Use GDF IPC parser
    schema_ptr = ffi.cast("void*", cpu_data.ctypes.data)

    ipch = libgdf.gdf_ipc_parser_open(schema_ptr, cpu_data.size)
    if libgdf.gdf_ipc_parser_failed(ipch):
        print(libgdf.gdf_ipc_parser_get_error(ipch))

    jsonraw = libgdf.gdf_ipc_parser_get_schema_json(ipch)
    jsontext = ffi.string(jsonraw).decode()
    json_schema = json.loads(jsontext)

    pprint(json_schema)

    recordbatches_bytes = b'\x1c\x01\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x16\x00\x06\x00\x05\x00\x08\x00\x0c\x00\x0c\x00\x00\x00\x00\x03\x02\x00\x18\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\n\x00\x18\x00\x0c\x00\x04\x00\x08\x00\n\x00\x00\x00\xac\x00\x00\x00\x10\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x10\x00\x00\x00\x11\x00\x00\x00\x12\x00\x00\x00\x13\x00\x00\x00\x04\x00\x00\x00\x05\x00\x00\x00\x06\x00\x00\x00\x07\x00\x00\x00\x14\x00\x00\x00\x15\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\x08\x00\x00\x00\t\x00\x00\x00\n\x00\x00\x00\x0b\x00\x00\x00\x18\x00\x00\x00\x19\x00\x00\x00\x1a\x00\x00\x00\x1b\x00\x00\x00\x0c\x00\x00\x00\r\x00\x00\x00\x0e\x00\x00\x00\x0f\x00\x00\x00\x1c\x00\x00\x00\x1d\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x16\x93\xb7<\xac*\xde?\x00Y\x94@"\x0eo?\xf8+\xee\xac\xf2#\xdc?\xa4\xcauw68\xe2?\xf8\xaa\xc9\x9f*\x9f\xda?\xe0\x1e\x1b-\x8b\xa4\xd7?\xe6y\x8a\x9b\xe4<\xef?\x08\x89\xc4.0W\xc5?h\xa5\x0f\x14\xa2\xe3\xbb?\xc0\xa9/\x8f\xeap\xb8?\x0c7\xed\x99fc\xda?:\tA.\xc6g\xda?\x1c\x1f)\xfd\x03\n\xc1?\xfe\x1e\xf9(/\xf0\xe3?\x08h\x99\x05\x81m\xe7?\xa0\xa8=\xfc\x96\x93\xcd?x\x8b\xf8v\xbe_\xc8?\xa2\xd9Zg\xd9\xb9\xed?;\xdb\xa6\xfas\xdb\xed?\xd8\xc9\xfcA-\xcd\xdd?@\xe27`\x0cQ\x94?d\x11:-\x8e\xcf\xd9?\xc9S\xde\xff\xbbN\xe5?\xe0o(\xf4s?\xba?\x0bq\xb9j%o\xeb?\x10\xe8\xa1t\t\x9b\xcb?\xa5\xf0\x15\t\x1ep\xed?\xc7\xb2~\x02\x82l\xef?0\xe6\xa8g\xec\x82\xc3?\xe0\xc6\xe8\xb1\xc2~\xd6?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
    rb_cpu_data = np.ndarray(shape=len(recordbatches_bytes), dtype=np.byte,
                             buffer=bytearray(recordbatches_bytes))
    rb_gpu_data = cuda.to_device(rb_cpu_data)
    del cpu_data

    devptr = ffi.cast("void*", rb_gpu_data.device_ctypes_pointer.value)
    libgdf.gdf_ipc_parser_open_recordbatches(ipch, devptr, rb_gpu_data.size)

    if libgdf.gdf_ipc_parser_failed(ipch):
        print(libgdf.gdf_ipc_parser_get_error(ipch))

    jsonraw = libgdf.gdf_ipc_parser_get_layout_json(ipch)
    jsontext = ffi.string(jsonraw).decode()
    json_rb = json.loads(jsontext)
    pprint(json_rb)

    offset = libgdf.gdf_ipc_parser_get_data_offset(ipch)

    libgdf.gdf_ipc_parser_close(ipch)

    # Check
    dicts = json_schema['dictionaries']
    assert len(dicts) == 1
    dictdata = dicts[0]['data']['columns'][0]['DATA']
    assert set(dictdata) == {'orange', 'apple', 'pear', 'grape'}

    gpu_data = rb_gpu_data[offset:]

    schema_fields = json_schema['schema']['fields']
    assert len(schema_fields) == 3
    field_names = [f['name'] for f in schema_fields]
    assert field_names == ['idx', 'name', 'weight']

    # check the dictionary id in schema
    assert schema_fields[1]['dictionary']['id'] == dicts[0]['id']

    # Get "idx" column
    idx_buf_off = json_rb[0]['data_buffer']['offset']
    idx_buf_len = json_rb[0]['data_buffer']['length']
    idx_buf = gpu_data[idx_buf_off:][:idx_buf_len]
    assert json_rb[0]['dtype']['name'] == 'INT32'
    idx_size = json_rb[0]['length']
    assert idx_size == 30
    idx_data = np.ndarray(shape=idx_size, dtype=np.int32,
                          buffer=idx_buf.copy_to_host())
    print(idx_data)

    # Get "name" column
    name_buf_off = json_rb[1]['data_buffer']['offset']
    name_buf_len = json_rb[1]['data_buffer']['length']
    name_buf = gpu_data[name_buf_off:][:name_buf_len]
    assert json_rb[1]['dtype']['name'] == 'DICTIONARY'
    name_size = json_rb[1]['length']
    name_data = np.ndarray(shape=name_size, dtype=np.int32,
                           buffer=name_buf.copy_to_host())
    print(name_data)

    # Get "name" column
    weight_buf_off = json_rb[2]['data_buffer']['offset']
    weight_buf_len = json_rb[2]['data_buffer']['length']
    weight_buf = gpu_data[weight_buf_off:][:weight_buf_len]
    assert json_rb[2]['dtype']['name'] == 'DOUBLE'
    weight_size = json_rb[2]['length']
    weight_data = np.ndarray(shape=weight_size, dtype=np.float64,
                             buffer=weight_buf.copy_to_host())
    print(weight_data)

    # verify data
    sortedidx = np.argsort(idx_data)
    idx_data = idx_data[sortedidx]
    name_data = name_data[sortedidx]
    weight_data = weight_data[sortedidx]

    got_iter = zip(idx_data, name_data, weight_data)
    for expected, got in zip(get_expected_values(), got_iter):
        assert expected[0] == got[0]
        assert expected[1] == dictdata[got[1]]
        assert expected[2] == got[2]
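
The column extraction above builds typed NumPy views over byte slices copied back from the GPU. The same construction with plain NumPy and made-up offsets:

import numpy as np

raw = np.arange(6, dtype=np.int32).tobytes()     # 24 bytes of int32 data
buf = np.frombuffer(raw, dtype=np.byte)          # byte view, like gpu_data on the host
col = np.ndarray(shape=3, dtype=np.int32,
                 buffer=buf[8:20].tobytes())     # bytes 8..19 hold the values 2, 3, 4
print(col)                                       # [2 3 4]
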
Example 18
def unwrap_devary(devary):
    ptrval = devary.device_ctypes_pointer.value
    ptrval = ptrval or ffi.NULL   # replace None with NULL
    return ffi.cast('void*', ptrval)
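
A tiny sketch of the None-to-NULL replacement used by unwrap_devary, assuming only cffi (no device array needed):

import cffi

ffi = cffi.FFI()
ptrval = None                          # what device_ctypes_pointer.value gives for empty data
ptr = ffi.cast('void*', ptrval or 0)   # 0 casts to a NULL void*
print(ptr == ffi.NULL)                 # True
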
Example 19
File: _gdf.py Project: cuulee/cudf
def libgdf_join(col_lhs, col_rhs, on, how, method='sort'):
    joiner = _join_how_api[how]
    method_api = _join_method_api[method]
    gdf_context = ffi.new('gdf_context*')

    libgdf.gdf_context_view(gdf_context, 0, method_api, 0, 0, 0)

    if how not in ['left', 'inner', 'outer']:
        msg = "new join api only supports left or inner"
        raise ValueError(msg)

    list_lhs = []
    list_rhs = []
    result_cols = []

    result_col_names = []

    left_idx = []
    right_idx = []
    # idx = 0
    for name, col in col_lhs.items():
        list_lhs.append(col._column.cffi_view)
        if name not in on:
            result_cols.append(columnview(0, None, dtype=col._column.dtype))
            result_col_names.append(name)

    for name in on:
        result_cols.append(columnview(0, None,
                                      dtype=col_lhs[name]._column.dtype))
        result_col_names.append(name)
        left_idx.append(list(col_lhs.keys()).index(name))
        right_idx.append(list(col_rhs.keys()).index(name))

    for name, col in col_rhs.items():
        list_rhs.append(col._column.cffi_view)
        if name not in on:
            result_cols.append(columnview(0, None, dtype=col._column.dtype))
            result_col_names.append(name)

    num_cols_to_join = len(on)
    result_num_cols = len(list_lhs) + len(list_rhs) - num_cols_to_join

    joiner(list_lhs,
           len(list_lhs),
           left_idx,
           list_rhs,
           len(list_rhs),
           right_idx,
           num_cols_to_join,
           result_num_cols,
           result_cols,
           ffi.NULL,
           ffi.NULL,
           gdf_context)

    res = []
    valids = []

    for col in result_cols:
        intaddr = int(ffi.cast("uintptr_t", col.data))
        res.append(rmm.device_array_from_ptr(ptr=intaddr,
                                             nelem=col.size,
                                             dtype=gdf_to_np_dtype(col.dtype),
                                             finalizer=rmm._make_finalizer(
                                                 intaddr, 0)))
        intaddr = int(ffi.cast("uintptr_t", col.valid))
        valids.append(rmm.device_array_from_ptr(ptr=intaddr,
                                                nelem=calc_chunk_size(
                                                    col.size, mask_bitsize),
                                                dtype=mask_dtype,
                                                finalizer=rmm._make_finalizer(
                                                    intaddr, 0)))

    return res, valids
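
left_idx and right_idx above are simply the positions of the join keys within each frame's column ordering. A pure-Python sketch of that index computation with dummy, insertion-ordered column dicts:

col_lhs = {'a': None, 'key': None, 'b': None}
col_rhs = {'key': None, 'c': None}
on = ['key']

left_idx = [list(col_lhs.keys()).index(name) for name in on]
right_idx = [list(col_rhs.keys()).index(name) for name in on]
print(left_idx, right_idx)    # [1] [0]
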
Example 20
File: _gdf.py Project: cuulee/cudf
def unwrap_mask(devary):
    ptrval = devary.device_ctypes_pointer.value
    ptrval = ptrval or ffi.NULL   # replace None with NULL
    return ffi.cast('gdf_valid_type*', ptrval), ptrval
Example 21
File: _gdf.py Project: cuulee/cudf
def unwrap_devary(devary):
    ptrval = devary.device_ctypes_pointer.value
    ptrval = ptrval or ffi.NULL   # replace None with NULL
    return ffi.cast('void*', ptrval)
Example 22
def read_csv_strings(filepath_or_buffer,
                     lineterminator='\n',
                     quotechar='"',
                     quoting=0,
                     doublequote=True,
                     header='infer',
                     sep=',',
                     delimiter=None,
                     delim_whitespace=False,
                     skipinitialspace=False,
                     names=None,
                     dtype=None,
                     skipfooter=0,
                     skiprows=0,
                     dayfirst=False,
                     compression='infer',
                     thousands=None,
                     decimal='.',
                     true_values=None,
                     false_values=None,
                     nrows=None,
                     byte_range=None,
                     skip_blank_lines=True,
                     comment=None,
                     na_values=None,
                     keep_default_na=True,
                     na_filter=True,
                     prefix=None,
                     index_col=None):
    """
    **Experimental**: This function exists only as a beta way to use
    `nvstrings <https://nvstrings.readthedocs.io/en/latest/>`_ with cudf.

    Future versions of cuDF will provide cleaner integration.

    Uses mostly the same arguments as read_csv.
    Note: Does not currently support auto-column detection, header, usecols,
    or mangle_dupe_cols args.

    Returns
    -------
    columns : ordered list of cudf.dataframe.Series and nvstrings objects
      numeric or date dtyped columns will be Series.

      'str' dtyped columns will be
      `nvstrings <https://nvstrings.readthedocs.io/en/latest/>`_.

    Examples
    --------

    Create a test csv file

    >>> import cudf
    >>> filename = 'foo.csv'
    >>> lines = [
    ...   "num1,datetime,text",
    ...   "123,2018-11-13T12:00:00,abc",
    ...   "456,2018-11-14T12:35:01,def",
    ...   "789,2018-11-15T18:02:59,ghi"
    ... ]
    >>> with open(filename, 'w') as fp:
    ...     fp.write('\\n'.join(lines)+'\\n')

    Read the file with cudf

    >>> names = ['num1', 'datetime', 'text']
    >>> dtypes = ['int', 'date', 'str']
    >>> columns = cudf.io.csv.read_csv_strings(filename, delimiter=',',
    ...                         names=names, dtype=dtypes,
    ...                         skiprows=1)

    Display results

    >>> print(columns[0])
    0  123
    1  456
    2  789
    >>> print(columns[2])
    ['abc', 'def', 'ghi']

    See Also
    --------
    .read_csv
    """
    import nvstrings
    from cudf.dataframe.series import Series

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if dtype is not None:
        if isinstance(dtype, collections.abc.Mapping):
            dtype_dict = True
        elif isinstance(dtype, collections.abc.Iterable):
            dtype_dict = False
        else:
            msg = '''dtype must be 'list like' or 'dict' '''
            raise TypeError(msg)
        if names is not None and len(dtype) != len(names):
            msg = '''All column dtypes must be specified.'''
            raise TypeError(msg)

    csv_reader = ffi.new('csv_read_arg*')

    # Populate csv_reader struct
    if is_file_like(filepath_or_buffer):
        buffer = filepath_or_buffer.read()
        # check if StringIO is used
        if hasattr(buffer, 'encode'):
            buffer_as_bytes = buffer.encode()
        else:
            buffer_as_bytes = buffer
        buffer_data_holder = ffi.new("char[]", buffer_as_bytes)

        csv_reader.input_data_form = libgdf.HOST_BUFFER
        csv_reader.filepath_or_buffer = buffer_data_holder
        csv_reader.buffer_size = len(buffer_as_bytes)
    else:
        if not os.path.isfile(filepath_or_buffer):
            raise FileNotFoundError(filepath_or_buffer)
        file_path = _wrap_string(filepath_or_buffer)

        csv_reader.input_data_form = libgdf.FILE_PATH
        csv_reader.filepath_or_buffer = file_path

    if header == 'infer':
        header = -1
    header_infer = header
    arr_names = []
    arr_dtypes = []
    if names is None:
        if header == -1:
            header_infer = 0
        if header is None:
            header_infer = -1
        csv_reader.names = ffi.NULL
        csv_reader.num_cols = 0
    else:
        if header is None:
            header_infer = -1
        csv_reader.num_cols = len(names)
        for col_name in names:
            arr_names.append(_wrap_string(col_name))
            if dtype is not None:
                if dtype_dict:
                    arr_dtypes.append(_wrap_string(str(dtype[col_name])))
        names_ptr = ffi.new('char*[]', arr_names)
        csv_reader.names = names_ptr

    if dtype is None:
        csv_reader.dtype = ffi.NULL
    else:
        if not dtype_dict:
            for col_dtype in dtype:
                arr_dtypes.append(_wrap_string(str(col_dtype)))
        dtype_ptr = ffi.new('char*[]', arr_dtypes)
        csv_reader.dtype = dtype_ptr

    if decimal == delimiter:
        raise ValueError("decimal cannot be the same as delimiter")

    if thousands == delimiter:
        raise ValueError("thousands cannot be the same as delimiter")

    if nrows is not None and skipfooter != 0:
        raise ValueError("cannot use both nrows and skipfooter parameters")

    if byte_range is not None:
        if skipfooter != 0 or skiprows != 0 or nrows is not None:
            raise ValueError("""cannot manually limit rows to be read when
                                using the byte range parameter""")

    # Start with default values recognized as boolean
    arr_true_values = [_wrap_string(str('True')), _wrap_string(str('TRUE'))]
    arr_false_values = [_wrap_string(str('False')), _wrap_string(str('FALSE'))]

    for value in true_values or []:
        arr_true_values.append(_wrap_string(str(value)))
    arr_true_values_ptr = ffi.new('char*[]', arr_true_values)
    csv_reader.true_values = arr_true_values_ptr
    csv_reader.num_true_values = len(arr_true_values)

    for value in false_values or []:
        arr_false_values.append(_wrap_string(str(value)))
    false_values_ptr = ffi.new('char*[]', arr_false_values)
    csv_reader.false_values = false_values_ptr
    csv_reader.num_false_values = len(arr_false_values)

    arr_na_values = []
    for value in na_values or []:
        arr_na_values.append(_wrap_string(str(value)))
    arr_na_values_ptr = ffi.new('char*[]', arr_na_values)
    csv_reader.na_values = arr_na_values_ptr
    csv_reader.num_na_values = len(arr_na_values)

    compression_bytes = _wrap_string(compression)
    prefix_bytes = _wrap_string(prefix)

    csv_reader.delimiter = delimiter.encode()
    csv_reader.lineterminator = lineterminator.encode()
    csv_reader.quotechar = quotechar.encode()
    csv_reader.quoting = _quoting_enum[quoting]
    csv_reader.doublequote = doublequote
    csv_reader.delim_whitespace = delim_whitespace
    csv_reader.skipinitialspace = skipinitialspace
    csv_reader.dayfirst = dayfirst
    csv_reader.header = header_infer
    csv_reader.skiprows = skiprows
    csv_reader.skipfooter = skipfooter
    csv_reader.compression = compression_bytes
    csv_reader.decimal = decimal.encode()
    csv_reader.thousands = thousands.encode() if thousands else b'\0'
    csv_reader.nrows = nrows if nrows is not None else -1
    if byte_range is not None:
        csv_reader.byte_range_offset = byte_range[0]
        csv_reader.byte_range_size = byte_range[1]
    else:
        csv_reader.byte_range_offset = 0
        csv_reader.byte_range_size = 0
    csv_reader.skip_blank_lines = skip_blank_lines
    csv_reader.comment = comment.encode() if comment else b'\0'
    csv_reader.keep_default_na = keep_default_na
    csv_reader.na_filter = na_filter
    csv_reader.prefix = prefix_bytes

    # Call read_csv
    libgdf.read_csv(csv_reader)

    out = csv_reader.data
    if out == ffi.NULL:
        raise ValueError("Failed to parse CSV")

    # Extract parsed columns

    outcols = []
    for i in range(csv_reader.num_cols_out):
        if out[i].dtype == libgdf.GDF_STRING:
            ptr = int(ffi.cast("uintptr_t", out[i].data))
            outcols.append(nvstrings.bind_cpointer(ptr))
        else:
            newcol = Column.from_cffi_view(out[i])
            if (newcol.dtype == np.dtype('datetime64[ms]')):
                col = newcol.view(DatetimeColumn, dtype='datetime64[ms]')
            else:
                col = newcol.view(NumericalColumn, dtype=newcol.dtype)
            outcols.append(Series(col))

    return outcols
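
The dtype argument handling above distinguishes mappings (per-column dtypes by name) from other iterables (positional dtypes). A stand-alone sketch of that check, standard library only:

import collections.abc

def dtype_is_dict(dtype):
    if isinstance(dtype, collections.abc.Mapping):
        return True
    if isinstance(dtype, collections.abc.Iterable):
        return False
    raise TypeError("dtype must be 'list like' or 'dict'")

print(dtype_is_dict({'num1': 'int'}), dtype_is_dict(['int', 'date', 'str']))
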
Example 23
def read_csv(filepath_or_buffer,
             lineterminator='\n',
             quotechar='"',
             quoting=0,
             doublequote=True,
             header='infer',
             mangle_dupe_cols=True,
             usecols=None,
             sep=',',
             delimiter=None,
             delim_whitespace=False,
             skipinitialspace=False,
             names=None,
             dtype=None,
             skipfooter=0,
             skiprows=0,
             dayfirst=False,
             compression='infer',
             thousands=None,
             decimal='.',
             true_values=None,
             false_values=None,
             nrows=None,
             byte_range=None,
             skip_blank_lines=True,
             comment=None,
             na_values=None,
             keep_default_na=True,
             na_filter=True,
             prefix=None,
             index_col=None):
    """
    Load and parse a CSV file into a DataFrame

    Parameters
    ----------
    filepath_or_buffer : str
        Path of file to be read or a file-like object containing the file.
    sep : char, default ','
        Delimiter to be used.
    delimiter : char, default None
        Alternative argument name for sep.
    delim_whitespace : bool, default False
        Determines whether to use whitespace as delimiter.
    lineterminator : char, default '\\n'
        Character to indicate end of line.
    skipinitialspace : bool, default False
        Skip spaces after delimiter.
    names : list of str, default None
        List of column names to be used.
    dtype : list of str or dict of {col: dtype}, default None
        List of data types in the same order of the column names
        or a dictionary with column_name:dtype (pandas style).
    quotechar : char, default '"'
        Character to indicate start and end of quote item.
    quoting : str or int, default 0
        Controls quoting behavior. Set to one of
        0 (csv.QUOTE_MINIMAL), 1 (csv.QUOTE_ALL),
        2 (csv.QUOTE_NONNUMERIC) or 3 (csv.QUOTE_NONE).
        Quoting is enabled with all values except 3.
    doublequote : bool, default True
        When quoting is enabled, indicates whether to interpret two
        consecutive quotechar inside fields as single quotechar
    header : int, default 'infer'
        Row number to use as the column names. Default behavior is to infer
        the column names: if no names are passed, header=0;
        if column names are passed explicitly, header=None.
    usecols : list of int or str, default None
        Returns subset of the columns given in the list. All elements must be
        either integer indices (column number) or strings that correspond to
        column names
    mangle_dupe_cols : boolean, default True
        Duplicate columns will be specified as 'X','X.1',...'X.N'.
    skiprows : int, default 0
        Number of rows to be skipped from the start of file.
    skipfooter : int, default 0
        Number of rows to be skipped at the bottom of file.
    compression : {'infer', 'gzip', 'zip', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then detect
        compression from the following extensions: '.gz', '.zip' (otherwise no
        decompression). If using 'zip', the ZIP file must contain only one
        data file to be read in, otherwise the first non-zero-sized file will
        be used. Set to None for no decompression.
    decimal : char, default '.'
        Character used as a decimal point.
    thousands : char, default None
        Character used as a thousands delimiter.
    true_values : list, default None
        Values to consider as boolean True
    false_values : list, default None
        Values to consider as boolean False
    nrows : int, default None
        If specified, maximum number of rows to read
    byte_range : list or tuple, default None
        Byte range within the input file to be read. The first number is the
        offset in bytes, the second number is the range size in bytes. Set the
        size to zero to read all data after the offset location. Reads the row
        that starts before or at the end of the range, even if it ends after
        the end of the range.
    skip_blank_lines : bool, default True
        If True, discard and do not parse empty lines
        If False, interpret empty lines as NaN values
    comment : char, default None
        Character used as a comments indicator. If found at the beginning of a
        line, the line will be ignored altogether.
    na_values : list, default None
        Values to consider as invalid
    keep_default_na : bool, default True
        Whether or not to include the default NA values when parsing the data.
    na_filter : bool, default True
        Detect missing values (empty strings and the values in na_values).
        Passing False can improve performance.
    prefix : str, default None
        Prefix to add to column numbers when parsing without a header row
    index_col : int or string, default None
        Column to use as the row labels

    Returns
    -------
    GPU ``DataFrame`` object.

    Examples
    --------

    Create a test csv file

    >>> import cudf
    >>> filename = 'foo.csv'
    >>> lines = [
    ...   "num1,datetime,text",
    ...   "123,2018-11-13T12:00:00,abc",
    ...   "456,2018-11-14T12:35:01,def",
    ...   "789,2018-11-15T18:02:59,ghi"
    ... ]
    >>> with open(filename, 'w') as fp:
    ...     fp.write('\\n'.join(lines)+'\\n')

    Read the file with ``cudf.read_csv``

    >>> cudf.read_csv(filename)
      num1                datetime text
    0  123 2018-11-13T12:00:00.000 5451
    1  456 2018-11-14T12:35:01.000 5784
    2  789 2018-11-15T18:02:59.000 6117
    """

    if delim_whitespace:
        if delimiter is not None:
            raise ValueError("cannot set both delimiter and delim_whitespace")
        if sep != ',':
            raise ValueError("cannot set both sep and delim_whitespace")

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if dtype is not None:
        if isinstance(dtype, collections.abc.Mapping):
            dtype_dict = True
        elif isinstance(dtype, collections.abc.Iterable):
            dtype_dict = False
        else:
            msg = '''dtype must be 'list like' or 'dict' '''
            raise TypeError(msg)
        if names is not None and len(dtype) != len(names):
            msg = '''All column dtypes must be specified.'''
            raise TypeError(msg)

    nvtx_range_push("CUDF_READ_CSV", "purple")

    csv_reader = ffi.new('csv_read_arg*')

    # Populate csv_reader struct
    if is_file_like(filepath_or_buffer):
        if compression == 'infer':
            compression = None
        buffer = filepath_or_buffer.read()
        # check if StringIO is used
        if hasattr(buffer, 'encode'):
            buffer_as_bytes = buffer.encode()
        else:
            buffer_as_bytes = buffer
        buffer_data_holder = ffi.new("char[]", buffer_as_bytes)

        csv_reader.input_data_form = libgdf.HOST_BUFFER
        csv_reader.filepath_or_buffer = buffer_data_holder
        csv_reader.buffer_size = len(buffer_as_bytes)
    else:
        if not os.path.isfile(filepath_or_buffer):
            raise FileNotFoundError(filepath_or_buffer)
        file_path = _wrap_string(filepath_or_buffer)

        csv_reader.input_data_form = libgdf.FILE_PATH
        csv_reader.filepath_or_buffer = file_path

    if header == 'infer':
        header = -1
    header_infer = header
    arr_names = []
    arr_dtypes = []
    if names is None:
        if header == -1:
            header_infer = 0
        if header is None:
            header_infer = -1
        csv_reader.names = ffi.NULL
        csv_reader.num_cols = 0
    else:
        if header is None:
            header_infer = -1
        csv_reader.num_cols = len(names)
        for col_name in names:
            arr_names.append(_wrap_string(col_name))
            if dtype is not None:
                if dtype_dict:
                    arr_dtypes.append(_wrap_string(str(dtype[col_name])))
        names_ptr = ffi.new('char*[]', arr_names)
        csv_reader.names = names_ptr

    if dtype is None:
        csv_reader.dtype = ffi.NULL
    else:
        if not dtype_dict:
            for col_dtype in dtype:
                arr_dtypes.append(_wrap_string(str(col_dtype)))
        dtype_ptr = ffi.new('char*[]', arr_dtypes)
        csv_reader.dtype = dtype_ptr

    csv_reader.use_cols_int = ffi.NULL
    csv_reader.use_cols_int_len = 0
    csv_reader.use_cols_char = ffi.NULL
    csv_reader.use_cols_char_len = 0

    if usecols is not None:
        arr_col_names = []
        if (all(isinstance(x, int) for x in usecols)):
            usecols_ptr = ffi.new('int[]', usecols)
            csv_reader.use_cols_int = usecols_ptr
            csv_reader.use_cols_int_len = len(usecols)
        else:
            for col_name in usecols:
                arr_col_names.append(_wrap_string(col_name))
            col_names_ptr = ffi.new('char*[]', arr_col_names)
            csv_reader.use_cols_char = col_names_ptr
            csv_reader.use_cols_char_len = len(usecols)

    if decimal == delimiter:
        raise ValueError("decimal cannot be the same as delimiter")

    if thousands == delimiter:
        raise ValueError("thousands cannot be the same as delimiter")

    if nrows is not None and skipfooter != 0:
        raise ValueError("cannot use both nrows and skipfooter parameters")

    if byte_range is not None:
        if skipfooter != 0 or skiprows != 0 or nrows is not None:
            raise ValueError("""cannot manually limit rows to be read when
                                using the byte range parameter""")

    arr_true_values = []
    for value in true_values or []:
        arr_true_values.append(_wrap_string(str(value)))
    arr_true_values_ptr = ffi.new('char*[]', arr_true_values)
    csv_reader.true_values = arr_true_values_ptr
    csv_reader.num_true_values = len(arr_true_values)

    arr_false_values = []
    for value in false_values or []:
        arr_false_values.append(_wrap_string(str(value)))
    false_values_ptr = ffi.new('char*[]', arr_false_values)
    csv_reader.false_values = false_values_ptr
    csv_reader.num_false_values = len(arr_false_values)

    arr_na_values = []
    for value in na_values or []:
        arr_na_values.append(_wrap_string(str(value)))
    arr_na_values_ptr = ffi.new('char*[]', arr_na_values)
    csv_reader.na_values = arr_na_values_ptr
    csv_reader.num_na_values = len(arr_na_values)

    compression_bytes = _wrap_string(compression)
    prefix_bytes = _wrap_string(prefix)

    csv_reader.delimiter = delimiter.encode()
    csv_reader.lineterminator = lineterminator.encode()
    csv_reader.quotechar = quotechar.encode()
    csv_reader.quoting = _quoting_enum[quoting]
    csv_reader.doublequote = doublequote
    csv_reader.delim_whitespace = delim_whitespace
    csv_reader.skipinitialspace = skipinitialspace
    csv_reader.dayfirst = dayfirst
    csv_reader.header = header_infer
    csv_reader.skiprows = skiprows
    csv_reader.skipfooter = skipfooter
    csv_reader.mangle_dupe_cols = mangle_dupe_cols
    csv_reader.windowslinetermination = False
    csv_reader.compression = compression_bytes
    csv_reader.decimal = decimal.encode()
    csv_reader.thousands = thousands.encode() if thousands else b'\0'
    csv_reader.nrows = nrows if nrows is not None else -1
    if byte_range is not None:
        csv_reader.byte_range_offset = byte_range[0]
        csv_reader.byte_range_size = byte_range[1]
    else:
        csv_reader.byte_range_offset = 0
        csv_reader.byte_range_size = 0
    csv_reader.skip_blank_lines = skip_blank_lines
    csv_reader.comment = comment.encode() if comment else b'\0'
    csv_reader.keep_default_na = keep_default_na
    csv_reader.na_filter = na_filter
    csv_reader.prefix = prefix_bytes

    # Call read_csv
    libgdf.read_csv(csv_reader)

    out = csv_reader.data
    if out == ffi.NULL:
        raise ValueError("Failed to parse CSV")

    # Extract parsed columns

    outcols = []
    new_names = []
    for i in range(csv_reader.num_cols_out):
        if out[i].dtype == libgdf.GDF_STRING:
            ptr = int(ffi.cast("uintptr_t", out[i].data))
            new_names.append(ffi.string(out[i].col_name).decode())
            outcols.append(nvstrings.bind_cpointer(ptr))
        else:
            newcol = Column.from_cffi_view(out[i])
            new_names.append(ffi.string(out[i].col_name).decode())
            if (newcol.dtype.type == np.datetime64):
                outcols.append(
                    newcol.view(DatetimeColumn, dtype='datetime64[ms]'))
            else:
                outcols.append(newcol.view(NumericalColumn,
                                           dtype=newcol.dtype))

    # Build dataframe
    df = DataFrame()
    # if names is not None and header_infer is -1:

    for k, v in zip(new_names, outcols):
        df[k] = v

    # Set index if the index_col parameter is passed
    if index_col is not None and index_col is not False:
        if isinstance(index_col, (int)):
            df = df.set_index(df.columns[index_col])
        else:
            df = df.set_index(index_col)

    nvtx_range_pop()

    return df
Example 24
 def unwrap(buffer):
     if buffer is None:
         return ffi.NULL
     devary = buffer.to_gpu_array()
     return ffi.cast('void*', devary.device_ctypes_pointer.value)
Example 25
def read_csv_strings(filepath,
                     lineterminator='\n',
                     quotechar='"',
                     quoting=True,
                     doublequote=True,
                     delimiter=',',
                     sep=None,
                     delim_whitespace=False,
                     skipinitialspace=False,
                     names=None,
                     dtype=None,
                     skipfooter=0,
                     skiprows=0,
                     dayfirst=False):

    """
    **Experimental**: This function is provided only as an alpha-quality way
    to use nvstrings alongside cudf.
    Future versions of cuDF will provide cleaner integration.

    Uses the same arguments as read_csv.

    Returns list of Series objects for numeric or date columns and nvstrings
    objects for those columns that are strings (dtype='str').

    Examples
    --------
    foo.txt : ::

        50,abc|40,def|30,ghi|20,jkl|

    .. code-block:: python

      import cudf
      fn = 'foo.txt'
      cols = cudf.io.read_csv_strings(fn, delimiter=',', lineterminator='|',
                           names=['col1', 'col2'], dtype=['int64', 'str'],
                           skiprows=1, skipfooter=1)
      type(cols[0])
      print(cols[0])

      type(cols[1])
      print(cols[1])

    Output:

    .. code-block:: python

      <class 'cudf.series.Series'>
      0 40
      1 30

      <class 'nvstrings.nvstrings'>
      ['def', 'ghi']

    """

    if names is None or dtype is None:
        msg = '''Automatic dtype detection not implemented:
        Column names and dtypes must be specified.'''
        raise TypeError(msg)

    if isinstance(dtype, dict):
        dtype_dict = True
    elif isinstance(dtype, list):
        dtype_dict = False
        if len(dtype) != len(names):
            msg = '''All column dtypes must be specified.'''
            raise TypeError(msg)
    else:
        msg = '''dtype must be 'list' or 'dict' '''
        raise TypeError(msg)

    csv_reader = ffi.new('csv_read_arg*')

    # Populate csv_reader struct
    file_path = _wrap_string(filepath)
    csv_reader.file_path = file_path

    arr_names = []
    arr_dtypes = []
    for col_name in names:
        arr_names.append(_wrap_string(col_name))
        if dtype_dict:
            arr_dtypes.append(_wrap_string(str(dtype[col_name])))
    names_ptr = ffi.new('char*[]', arr_names)
    csv_reader.names = names_ptr

    if not dtype_dict:
        for col_dtype in dtype:
            arr_dtypes.append(_wrap_string(str(col_dtype)))
    dtype_ptr = ffi.new('char*[]', arr_dtypes)
    csv_reader.dtype = dtype_ptr

    csv_reader.delimiter = delimiter.encode()
    csv_reader.lineterminator = lineterminator.encode()
    csv_reader.quotechar = quotechar.encode()
    csv_reader.quoting = quoting
    csv_reader.doublequote = doublequote
    csv_reader.delim_whitespace = delim_whitespace
    csv_reader.skipinitialspace = skipinitialspace
    csv_reader.dayfirst = dayfirst
    csv_reader.num_cols = len(names)
    csv_reader.skiprows = skiprows
    csv_reader.skipfooter = skipfooter

    # Call read_csv
    libgdf.read_csv(csv_reader)

    out = csv_reader.data
    if out == ffi.NULL:
        raise ValueError("Failed to parse CSV")

    # Extract parsed columns

    outcols = []
    for i in range(csv_reader.num_cols_out):
        if out[i].dtype == libgdf.GDF_STRING:
            ptr = int(ffi.cast("uintptr_t", out[i].data))
            outcols.append(nvstrings.bind_cpointer(ptr))
        else:
            newcol = Column.from_cffi_view(out[i])
            if (newcol.dtype == np.dtype('datetime64[ms]')):
                col = newcol.view(DatetimeColumn, dtype='datetime64[ms]')
            else:
                col = newcol.view(NumericalColumn, dtype=newcol.dtype)
            outcols.append(Series(col))

    return outcols
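
Because the list returned above mixes cudf Series and nvstrings objects,
callers typically split the results by type. A hedged sketch, reusing the
foo.txt layout from the docstring (file name and column names are only
illustrative):

    from cudf.dataframe.series import Series

    cols = read_csv_strings('foo.txt', delimiter=',', lineterminator='|',
                            names=['col1', 'col2'], dtype=['int64', 'str'])
    numeric_cols = [c for c in cols if isinstance(c, Series)]    # cudf Series
    string_cols = [c for c in cols if not isinstance(c, Series)]  # nvstrings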
Esempio n. 26
0
def test_ipc():
    # make gpu array
    TESTDATA = b"\x00\x01\x00\x00\x10\x00\x00\x00\x0c\x00\x0e\x00\x06\x00\x05\x00\x08\x00\x00\x00\x0c\x00\x00\x00\x00\x01\x01\x00\x10\x00\x00\x00\x00\x00\n\x00\x08\x00\x00\x00\x04\x00\x00\x00\n\x00\x00\x00\x04\x00\x00\x00\x02\x00\x00\x00l\x00\x00\x00\x04\x00\x00\x00\xb0\xff\xff\xff\x00\x00\x01\x038\x00\x00\x00\x1c\x00\x00\x00\x14\x00\x00\x00\x04\x00\x00\x00\x02\x00\x00\x00\x1c\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x9a\xff\xff\xff\x00\x00\x01\x00\x8c\xff\xff\xff \x00\x01\x00\x94\xff\xff\xff\x01\x00\x02\x00\x08\x00\x00\x00dest_lon\x00\x00\x00\x00\x14\x00\x18\x00\x08\x00\x06\x00\x07\x00\x0c\x00\x00\x00\x10\x00\x14\x00\x00\x00\x14\x00\x00\x00\x00\x00\x01\x03H\x00\x00\x00$\x00\x00\x00\x14\x00\x00\x00\x04\x00\x00\x00\x02\x00\x00\x00,\x00\x00\x00\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x08\x00\x06\x00\x06\x00\x00\x00\x00\x00\x01\x00\xf8\xff\xff\xff \x00\x01\x00\x08\x00\x08\x00\x04\x00\x06\x00\x08\x00\x00\x00\x01\x00\x02\x00\x08\x00\x00\x00dest_lat\x00\x00\x00\x00\xd8\x00\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x16\x00\x06\x00\x05\x00\x08\x00\x0c\x00\x0c\x00\x00\x00\x00\x03\x01\x00\x18\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\n\x00\x18\x00\x0c\x00\x04\x00\x08\x00\n\x00\x00\x00|\x00\x00\x00\x10\x00\x00\x00\x17\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x17\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x91\xa7\x06B\x91\xa7\x06B\x91\xa7\x06B\xc4\xcd\xdfA\x91\xa7\x06B\xc4\xcd\xdfA\xe7\xea\nB\x9c\xb3\x1cB\xe7\xea\nB\x9c\xb3\x1cB\xe7\xea\nB]n\xe3A\xe7\xea\nB\xd9$\'Brc\x03BL\x8a\xffArc\x03B\xd9$\'Brc\x03BL\x8a\xffArc\x03Bt@\x06B\x03o\x1fB\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00C\xa5\xcb\xc2C\xa5\xcb\xc2C\xa5\xcb\xc2\x06\x11\xa5\xc2C\xa5\xcb\xc2\x06\x11\xa5\xc2\xd0r\xb8\xc2\x1eV\x99\xc2\xd0r\xb8\xc2\x1eV\x99\xc2\xd0r\xb8\xc2\xce\xa1\xa2\xc2\xd0r\xb8\xc2>\x81\xaf\xc2\x1b\xb4\xc1\xc2ag\xcc\xc2\x1b\xb4\xc1\xc2>\x81\xaf\xc2\x1b\xb4\xc1\xc2ag\xcc\xc2\x1b\xb4\xc1\xc2\xd1\x81\xad\xc2\x81U\xd1\xc2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
    cpu_data = np.ndarray(shape=len(TESTDATA),
                          dtype=np.byte,
                          buffer=bytearray(TESTDATA))
    gpu_data = cuda.to_device(cpu_data)
    del cpu_data

    # Use GDF IPC parser
    devptr = ffi.cast("void*", gpu_data.device_ctypes_pointer.value)
    ipcparser = libgdf.gdf_ipc_parser_open(devptr)
    assert not libgdf.gdf_ipc_parser_failed(ipcparser)
    assert not ffi.string(libgdf.gdf_ipc_parser_get_error(ipcparser))

    # get schema as json
    jsonraw = libgdf.gdf_ipc_parser_to_json(ipcparser)
    jsontext = ffi.string(jsonraw).decode()
    jsonparsed = json.loads(jsontext)
    pprint(jsonparsed)

    dataptr = libgdf.gdf_ipc_parser_get_data_offset(ipcparser)
    dataptr = int(ffi.cast('uint64_t', dataptr))
    data_region = gpu_data[dataptr:]
    print(data_region.shape)

    def get_column(schema):
        offset = schema['data_buffer']['offset']
        raw_size = schema['data_buffer']['length']
        size = schema['length']
        assert schema['dtype']['bitwidth'] == 32
        assert schema['dtype']['name'] == 'FloatingPoint'
        raw_data_col1 = data_region[offset:offset + raw_size]
        assert raw_data_col1.size == raw_size

        dtype = np.dtype(np.float32)
        itemsize = dtype.itemsize
        ary = DeviceNDArray(shape=(raw_size // itemsize, ),
                            strides=(itemsize, ),
                            dtype=dtype,
                            gpu_data=raw_data_col1.gpu_data)
        hary = ary[:size].copy_to_host()
        return hary

    # Get first column
    schema_col1 = jsonparsed[0]
    name_col1 = schema_col1['name']
    assert name_col1 == 'dest_lat'

    dest_lat = get_column(schema_col1)

    # Get second column
    schema_col2 = jsonparsed[1]
    name_col2 = schema_col2['name']
    assert name_col2 == 'dest_lon'

    dest_lon = get_column(schema_col2)

    libgdf.gdf_ipc_parser_close(ipcparser)

    # Check data integrity
    np.testing.assert_array_less(dest_lat, 42)
    np.testing.assert_array_less(27, dest_lat)
    np.testing.assert_array_less(dest_lon, -76)
    np.testing.assert_array_less(-105, dest_lon)
Esempio n. 27
0
    def _apply_agg(self, agg_type, result, add_col_values,
                   ctx, val_columns, val_columns_out, sort_result=True):
        """
        Parameters
        ----------
        agg_type : str
            The aggregation function to run.
        result : DataFrame
            The DataFrame to store the result of the aggregation into.
        add_col_values : bool
            Boolean to indicate whether this is the first aggregation being
            run and should add the additional columns' values.
        ctx : gdf_context cffi object
            Context object to pass information such as if the dataframe
            is sorted and/or which method to use for grouping.
        val_columns : list of *str*
            The list of column names that the aggregation should be performed
            on.
        val_columns_out : list of *str*
            The list of column names that the aggregation results should be
            output into.
        sort_result : bool
            If True, set the flag on the grouping context so the aggregation
            results are returned sorted.
        """

        if sort_result:
            ctx.flag_sort_result = 1

        ncols = len(self._by)
        cols = [self._df[thisBy]._column.cffi_view for thisBy in self._by]

        first_run = add_col_values
        need_to_index = self._as_index

        col_count = 0
        if isinstance(val_columns, (str, Number)):
            val_columns = [val_columns]
        for val_col in val_columns:
            col_agg = self._df[val_col]._column.cffi_view

            # assuming here that if there are multiple aggregations that the
            # aggregated results will be in the same order for GDF_SORT method
            if need_to_index:
                out_col_indices_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=np.int32
                        )
                    )
                )
                out_col_indices = out_col_indices_series._column.cffi_view
            else:
                out_col_indices = ffi.NULL

            out_col_values_series = []
            for i in range(0, ncols):
                if self._df[self._by[i]].dtype == np.dtype('object'):
                    # This isn't ideal, but no better way to create an
                    # nvstrings object of correct size
                    gather_map = zeros(col_agg.size, dtype='int32')
                    col = Series([''], dtype='str')[gather_map]\
                        .reset_index(drop=True)
                else:
                    col = Series(
                        Buffer(
                            rmm.device_array(
                                col_agg.size,
                                dtype=self._df[self._by[i]]._column.data.dtype
                            )
                        )
                    )
                out_col_values_series.append(col)
            out_col_values = [
                out_col_values_series[i]._column.cffi_view
                for i in range(0, ncols)]

            if agg_type == "count":
                out_col_agg_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=np.int64
                        )
                    )
                )
            elif agg_type == "mean":
                out_col_agg_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=np.float64
                        )
                    )
                )
            else:
                if self._df[val_col].dtype == np.dtype('object'):
                    # This isn't ideal, but no better way to create an
                    # nvstrings object of correct size
                    gather_map = zeros(col_agg.size, dtype='int32')
                    out_col_agg_series = Series(
                        [''],
                        dtype='str'
                    )[gather_map].reset_index(drop=True)
                else:
                    out_col_agg_series = Series(
                        Buffer(
                            rmm.device_array(
                                col_agg.size,
                                dtype=self._df[val_col]._column.data.dtype
                            )
                        )
                    )

            out_col_agg = out_col_agg_series._column.cffi_view

            agg_func = self._NAMED_FUNCTIONS.get(agg_type, None)
            if agg_func is None:
                raise RuntimeError(
                    "ERROR: this aggregator has not been implemented yet")

            err = agg_func(
                ncols,
                cols,
                col_agg,
                out_col_indices,
                out_col_values,
                out_col_agg,
                ctx)

            if err is not None:
                raise RuntimeError(err)

            num_row_results = out_col_agg.size

            # NVStrings columns are not the same going in as coming out but we
            # can't create entire CFFI views otherwise multiple objects will
            # try to free the memory
            for i, col in enumerate(out_col_values_series):
                if col.dtype == np.dtype("object") and len(col) > 0:
                    import nvcategory
                    nvcat_ptr = int(
                        ffi.cast(
                            "uintptr_t",
                            out_col_values[i].dtype_info.category
                        )
                    )
                    nvcat_obj = None
                    if nvcat_ptr:
                        nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                        nvstr_obj = nvcat_obj.to_strings()
                    else:
                        import nvstrings
                        nvstr_obj = nvstrings.to_device([])
                    out_col_values_series[i]._column._data = nvstr_obj
                    out_col_values_series[i]._column._nvcategory = nvcat_obj
            if out_col_agg_series.dtype == np.dtype("object") and \
                    len(out_col_agg_series) > 0:
                import nvcategory
                nvcat_ptr = int(
                    ffi.cast(
                        "uintptr_t",
                        out_col_agg.dtype_info.category
                    )
                )
                nvcat_obj = None
                if nvcat_ptr:
                    nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                    nvstr_obj = nvcat_obj.to_strings()
                else:
                    import nvstrings
                    nvstr_obj = nvstrings.to_device([])
                out_col_agg_series._column._data = nvstr_obj
                out_col_agg_series._column._nvcategory = nvcat_obj

            if first_run:
                for i, thisBy in enumerate(self._by):
                    result[thisBy] = out_col_values_series[i][
                        :num_row_results]

                    if is_categorical_dtype(self._df[thisBy].dtype):
                        result[thisBy] = CategoricalColumn(
                            data=result[thisBy].data,
                            categories=self._df[thisBy].cat.categories,
                            ordered=self._df[thisBy].cat.ordered
                        )

            if out_col_agg_series.dtype != np.dtype("object"):
                out_col_agg_series.data.size = num_row_results
            out_col_agg_series = out_col_agg_series.reset_index(drop=True)

            if isinstance(val_columns_out, (str, Number)):
                result[val_columns_out] = out_col_agg_series[:num_row_results]
            else:
                result[val_columns_out[col_count]
                       ] = out_col_agg_series[:num_row_results]

            first_run = False
            col_count = col_count + 1

        return result
Esempio n. 28
0
def unwrap_devary(devary):
    return ffi.cast('void*', devary.device_ctypes_pointer.value)
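
A minimal usage sketch for the helper above (hedged; `devary` stands for any
numba/rmm device array, such as the ones created with rmm.device_array in the
other examples):

    ptr = unwrap_devary(devary)   # void* wrapping the array's device address
    # ptr can then be passed to libgdf functions that expect raw device memory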
Esempio n. 29
0
def read_csv_strings(filepath_or_buffer, lineterminator='\n',
                     quotechar='"', quoting=True, doublequote=True,
                     sep=',', delimiter=None, delim_whitespace=False,
                     skipinitialspace=False, names=None, dtype=None,
                     skipfooter=0, skiprows=0, dayfirst=False,
                     compression='infer', thousands=None, decimal='.',
                     true_values=None, false_values=None, nrows=None):

    """
    **Experimental**: This function exists only as a beta way to use
    `nvstrings <https://nvstrings.readthedocs.io/en/latest/>`_ with cudf.

    Future versions of cuDF will provide cleaner integration.

    Uses mostly the same arguments as read_csv.
    Note: automatic column detection and the header, usecols, and
    mangle_dupe_cols arguments are not currently supported.

    Returns
    -------
    columns : ordered list of cudf.dataframe.Series and nvstrings objects
      numeric or date dtyped columns will be Series.

      'str' dtyped columns will be
      `nvstrings <https://nvstrings.readthedocs.io/en/latest/>`_.

    Examples
    --------

    .. code-block:: python

      import cudf

      # Create a test csv file
      filename = 'foo.csv'
      lines = [
        "num1,datetime,text",
        "123,2018-11-13T12:00:00,abc",
        "456,2018-11-14T12:35:01,def",
        "789,2018-11-15T18:02:59,ghi"
      ]
      with open(filename, 'w') as fp:
          fp.write('\\n'.join(lines)+'\\n')

      # Read the file with cudf
      names = ['num1', 'datetime', 'text']
      dtypes = ['int', 'date', 'str']
      columns = cudf.io.csv.read_csv_strings(filename, delimiter=',',
                              names=names, dtype=dtypes,
                              skiprows=1)
      # Display results
      columns[0]
      print(columns[0])
      columns[2]
      print(columns[2])

    Output:

    .. code-block:: python

      <cudf.Series nrows=3 >
      0  123
      1  456
      2  789

      <nvstrings count=3>
      ['abc', 'def', 'ghi']

    See Also
    --------
    .read_csv
    """
    import nvstrings
    from cudf.dataframe.series import Series

    if names is None or dtype is None:
        msg = '''Automatic dtype detection not implemented:
        Column names and dtypes must be specified.'''
        raise TypeError(msg)

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if isinstance(dtype, dict):
        dtype_dict = True
    elif isinstance(dtype, list):
        dtype_dict = False
        if len(dtype) != len(names):
            msg = '''All column dtypes must be specified.'''
            raise TypeError(msg)
    else:
        msg = '''dtype must be 'list' or 'dict' '''
        raise TypeError(msg)

    csv_reader = ffi.new('csv_read_arg*')

    # Populate csv_reader struct
    if is_file_like(filepath_or_buffer):
        buffer = filepath_or_buffer.read()
        # check if StringIO is used
        if hasattr(buffer, 'encode'):
            buffer_as_bytes = buffer.encode()
        else:
            buffer_as_bytes = buffer
        buffer_data_holder = ffi.new("char[]", buffer_as_bytes)

        csv_reader.input_data_form = libgdf.HOST_BUFFER
        csv_reader.filepath_or_buffer = buffer_data_holder
        csv_reader.buffer_size = len(buffer_as_bytes)
    else:
        file_path = _wrap_string(filepath_or_buffer)

        csv_reader.input_data_form = libgdf.FILE_PATH
        csv_reader.filepath_or_buffer = file_path

    arr_names = []
    arr_dtypes = []
    for col_name in names:
        arr_names.append(_wrap_string(col_name))
        if dtype_dict:
            arr_dtypes.append(_wrap_string(str(dtype[col_name])))
    names_ptr = ffi.new('char*[]', arr_names)
    csv_reader.names = names_ptr

    if not dtype_dict:
        for col_dtype in dtype:
            arr_dtypes.append(_wrap_string(str(col_dtype)))
    dtype_ptr = ffi.new('char*[]', arr_dtypes)
    csv_reader.dtype = dtype_ptr

    if decimal == delimiter:
        raise ValueError("decimal cannot be the same as delimiter")

    if thousands == delimiter:
        raise ValueError("thousands cannot be the same as delimiter")

    if nrows is not None and skipfooter != 0:
        raise ValueError("cannot use both nrows and skipfooter parameters")

    # Start with default values recognized as boolean
    arr_true_values = [_wrap_string(str('True')), _wrap_string(str('TRUE'))]
    arr_false_values = [_wrap_string(str('False')), _wrap_string(str('FALSE'))]

    for value in true_values or []:
        arr_true_values.append(_wrap_string(str(value)))
    arr_true_values_ptr = ffi.new('char*[]', arr_true_values)
    csv_reader.true_values = arr_true_values_ptr
    csv_reader.num_true_values = len(arr_true_values)

    for value in false_values or []:
        arr_false_values.append(_wrap_string(str(value)))
    false_values_ptr = ffi.new('char*[]', arr_false_values)
    csv_reader.false_values = false_values_ptr
    csv_reader.num_false_values = len(arr_false_values)

    compression_bytes = _wrap_string(compression)

    csv_reader.delimiter = delimiter.encode()
    csv_reader.lineterminator = lineterminator.encode()
    csv_reader.quotechar = quotechar.encode()
    csv_reader.quoting = quoting
    csv_reader.doublequote = doublequote
    csv_reader.delim_whitespace = delim_whitespace
    csv_reader.skipinitialspace = skipinitialspace
    csv_reader.dayfirst = dayfirst
    csv_reader.num_cols = len(names)
    csv_reader.skiprows = skiprows
    csv_reader.skipfooter = skipfooter
    csv_reader.compression = compression_bytes
    csv_reader.decimal = decimal.encode()
    csv_reader.thousands = thousands.encode() if thousands else b'\0'
    csv_reader.nrows = nrows if nrows is not None else -1

    # Call read_csv
    libgdf.read_csv(csv_reader)

    out = csv_reader.data
    if out == ffi.NULL:
        raise ValueError("Failed to parse CSV")

    # Extract parsed columns

    outcols = []
    for i in range(csv_reader.num_cols_out):
        if out[i].dtype == libgdf.GDF_STRING:
            ptr = int(ffi.cast("uintptr_t", out[i].data))
            outcols.append(nvstrings.bind_cpointer(ptr))
        else:
            newcol = Column.from_cffi_view(out[i])
            if newcol.dtype == np.dtype('datetime64[ms]'):
                col = newcol.view(DatetimeColumn, dtype='datetime64[ms]')
            else:
                col = newcol.view(NumericalColumn, dtype=newcol.dtype)
            outcols.append(Series(col))

    return outcols
Esempio n. 30
0
def test_ipc():

    batch = make_batch()
    schema_bytes = batch.schema.serialize().to_pybytes()
    recordbatches_bytes = batch.serialize().to_pybytes()

    cpu_data = np.ndarray(shape=len(schema_bytes),
                          dtype=np.byte,
                          buffer=bytearray(schema_bytes))

    # Use GDF IPC parser
    schema_ptr = ffi.cast("void*", cpu_data.ctypes.data)
    ipch = libgdf.gdf_ipc_parser_open(schema_ptr, cpu_data.size)

    if libgdf.gdf_ipc_parser_failed(ipch):
        assert 0, str(ffi.string(libgdf.gdf_ipc_parser_get_error(ipch)))
    jsonraw = libgdf.gdf_ipc_parser_get_schema_json(ipch)
    jsontext = ffi.string(jsonraw).decode()
    json_schema = json.loads(jsontext)
    print('json_schema:')
    pprint(json_schema)

    rb_cpu_data = np.ndarray(shape=len(recordbatches_bytes),
                             dtype=np.byte,
                             buffer=bytearray(recordbatches_bytes))
    rb_gpu_data = rmm.to_device(rb_cpu_data)
    del cpu_data

    devptr = ffi.cast("void*", rb_gpu_data.device_ctypes_pointer.value)

    libgdf.gdf_ipc_parser_open_recordbatches(ipch, devptr, rb_gpu_data.size)

    if libgdf.gdf_ipc_parser_failed(ipch):
        assert 0, str(ffi.string(libgdf.gdf_ipc_parser_get_error(ipch)))

    jsonraw = libgdf.gdf_ipc_parser_get_layout_json(ipch)
    jsontext = ffi.string(jsonraw).decode()
    json_rb = json.loads(jsontext)
    print('json_rb:')
    pprint(json_rb)

    offset = libgdf.gdf_ipc_parser_get_data_offset(ipch)

    libgdf.gdf_ipc_parser_close(ipch)

    # Check
    dicts = json_schema['dictionaries']
    assert len(dicts) == 1
    dictdata = dicts[0]['data']['columns'][0]['DATA']
    assert set(dictdata) == {'orange', 'apple', 'pear', 'grape'}

    gpu_data = rb_gpu_data[offset:]

    schema_fields = json_schema['schema']['fields']
    assert len(schema_fields) == 3
    field_names = [f['name'] for f in schema_fields]
    assert field_names == ['idx', 'name', 'weight']

    # check the dictionary id in schema
    assert schema_fields[1]['dictionary']['id'] == dicts[0]['id']

    # Get "idx" column
    idx_buf_off = json_rb[0]['data_buffer']['offset']
    idx_buf_len = json_rb[0]['data_buffer']['length']
    idx_buf = gpu_data[idx_buf_off:][:idx_buf_len]
    assert json_rb[0]['dtype']['name'] == 'INT32'
    idx_size = json_rb[0]['length']
    assert idx_size == 30
    idx_data = np.ndarray(shape=idx_size,
                          dtype=np.int32,
                          buffer=idx_buf.copy_to_host())
    print('idx_data:')
    print(idx_data)

    # Get "name" column
    name_buf_off = json_rb[1]['data_buffer']['offset']
    name_buf_len = json_rb[1]['data_buffer']['length']
    name_buf = gpu_data[name_buf_off:][:name_buf_len]
    assert json_rb[1]['dtype']['name'] == 'DICTIONARY'
    name_size = json_rb[1]['length']
    name_data = np.ndarray(shape=name_size,
                           dtype=np.int32,
                           buffer=name_buf.copy_to_host())
    print('name_data:')
    print(name_data)

    # Get "weight" column
    weight_buf_off = json_rb[2]['data_buffer']['offset']
    weight_buf_len = json_rb[2]['data_buffer']['length']
    weight_buf = gpu_data[weight_buf_off:][:weight_buf_len]
    assert json_rb[2]['dtype']['name'] == 'DOUBLE'
    weight_size = json_rb[2]['length']
    weight_data = np.ndarray(shape=weight_size,
                             dtype=np.float64,
                             buffer=weight_buf.copy_to_host())
    print('weight_data:')
    print(weight_data)

    # verify data
    sortedidx = np.argsort(idx_data)
    idx_data = idx_data[sortedidx]
    name_data = name_data[sortedidx]
    weight_data = weight_data[sortedidx]

    got_iter = zip(idx_data, name_data, weight_data)
    for expected, got in zip(get_expected_values(), got_iter):
        assert expected[0] == got[0]
        assert expected[1] == dictdata[got[1]]
        assert expected[2] == got[2]