Example 1
def csv_reader_infer_nb_pandas_type(
    filepath_or_buffer, delimiter=',', names=None, usecols=None, dtype=None, skiprows=None, parse_dates=False
):

    # infer column types from the first block (similar to what Arrow does)
    # TODO: tune the block size or allow the user to configure it via an env var
    rows_to_read = 1000
    df = pd.read_csv(filepath_or_buffer, delimiter=delimiter, names=names,
                     usecols=usecols, dtype=dtype, skiprows=skiprows, nrows=rows_to_read,
                     parse_dates=parse_dates)

    try:
        df_type = numba.typeof(df)
    except ValueError:
        nb_col_types = []
        for col_name in df.columns:
            try:
                series_type = numba.typeof(df[col_name])
                col_type = series_type.data
            except ValueError:
                col_type = string_array_type
            nb_col_types.append(col_type)

        nb_col_types = tuple(nb_col_types)
        nb_col_names = tuple(df.columns)
        column_loc, _, _ = get_structure_maps(nb_col_types, nb_col_names)
        df_type = DataFrameType(nb_col_types, PositionalIndexType(), nb_col_names, column_loc=column_loc)

    return df_type
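
A minimal usage sketch for the helper above, assuming the names it relies on (pd, numba, string_array_type, get_structure_maps, DataFrameType, PositionalIndexType) are imported as in the source module; 'data.csv' is a hypothetical path:

# minimal sketch; 'data.csv' is a hypothetical file path
nb_df_type = csv_reader_infer_nb_pandas_type('data.csv', delimiter=',')
print(nb_df_type)  # Numba DataFrameType inferred from the first 1000 rows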
Example 2
def typeof_pd_dataframe(val, c):

    col_names = tuple(val.columns.tolist())
    # TODO: support other types like string and timestamp
    col_types = get_hiframes_dtypes(val)
    index_type = _infer_index_type(val.index)
    column_loc, _, _ = get_structure_maps(col_types, col_names)

    return DataFrameType(col_types, index_type, col_names, True, column_loc=column_loc)
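
A typeof handler like this is normally wired into Numba's type inference via the standard numba.extending API; a hedged sketch of that registration (the decorator is standard Numba, the body is the function above):

import pandas as pd
from numba.extending import typeof_impl

@typeof_impl.register(pd.DataFrame)
def typeof_pd_dataframe(val, c):
    ...  # body as above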
Example 3
def unbox_dataframe(typ, val, c):
    """unbox dataframe to an empty DataFrame struct
    columns will be extracted later if necessary.
    """
    n_cols = len(typ.columns)
    # create dataframe struct and store values
    dataframe = cgutils.create_struct_proxy(typ)(c.context, c.builder)

    errorptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit)

    _, data_typs_map, types_order = get_structure_maps(typ.data, typ.columns)

    for col_typ in types_order:
        type_id, col_indices = data_typs_map[col_typ]
        n_type_cols = len(col_indices)
        list_type = types.List(col_typ)
        ok, inst = listobj.ListInstance.allocate_ex(c.context, c.builder, list_type, n_type_cols)

        with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok):
            with if_ok:
                inst.size = c.context.get_constant(types.intp, n_type_cols)
                for i, col_idx in enumerate(col_indices):
                    series_obj = c.pyapi.object_getattr_string(val, typ.columns[col_idx])
                    arr_obj = c.pyapi.object_getattr_string(series_obj, "values")
                    ty_series = typ.data[col_idx]

                    # FIXME: Categorical series type exposes the dtype of its
                    # codes via .dtype; the pandas dtype is available as
                    # .pd_dtype, so use that instead
                    column_dtype = ty_series.pd_dtype if isinstance(ty_series, Categorical) else ty_series.dtype
                    native_val = _unbox_series_data(column_dtype, ty_series, arr_obj, c)

                    inst.setitem(c.context.get_constant(types.intp, i), native_val.value, incref=False)
                    c.pyapi.decref(arr_obj)
                    c.pyapi.decref(series_obj)

                dataframe.data = c.builder.insert_value(dataframe.data, inst.value, type_id)

            with if_not_ok:
                c.builder.store(cgutils.true_bit, errorptr)

        # If an error occurred, drop the whole native list
        with c.builder.if_then(c.builder.load(errorptr)):
            c.context.nrt.decref(c.builder, list_type, inst.value)

    index_obj = c.pyapi.object_getattr_string(val, "index")
    dataframe.index = _unbox_index_data(typ.index, index_obj, c).value
    c.pyapi.decref(index_obj)

    dataframe.parent = val

    return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr))
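
An unboxing routine like this is normally registered for the extension type with numba.extending.unbox; a hedged sketch (standard Numba decorator, body as above):

from numba.extending import unbox

@unbox(DataFrameType)
def unbox_dataframe(typ, val, c):
    ...  # body as above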
Example 4
def _gen_pandas_read_csv_func_text(col_names,
                                   col_typs,
                                   py_col_dtypes,
                                   usecols,
                                   signature=None):

    func_name = 'csv_reader_py'
    return_columns = usecols if usecols and isinstance(usecols[0],
                                                       str) else col_names

    column_loc, _, _ = get_structure_maps(col_typs, return_columns)
    df_type = DataFrameType(tuple(col_typs),
                            types.none,
                            tuple(col_names),
                            column_loc=column_loc)

    df_type_repr = repr(df_type)
    # for some reason pandas and pyarrow read_csv() return CategoricalDtype with
    # ordered=False when the requested dtype has ordered=None
    df_type_repr = df_type_repr.replace('ordered=None', 'ordered=False')

    # TODO: support non-numpy types like strings
    date_inds = ", ".join(
        str(i) for i, t in enumerate(col_typs)
        if t.dtype == types.NPDatetime('ns'))

    if signature is None:
        signature = "filepath_or_buffer"

    # map the generated function's params to the values used in the inner call
    # of pandas_read_csv; if no transformation is needed, just reuse the outer
    # param name (since the APIs match), otherwise take the value from the dict
    inner_call_params = {'parse_dates': f"[{date_inds}]"}
    used_read_csv_params = ('filepath_or_buffer', 'names', 'skiprows',
                            'parse_dates', 'dtype', 'usecols', 'sep',
                            'delimiter')

    # pyarrow reads an unnamed header as " ", pandas reads it as "Unnamed: N";
    # during inference from the file such names should be replaced with
    # "Unnamed: N". passing names to pyarrow means the first row is a header
    # and should be skipped
    if col_names and any(map(lambda x: x.startswith('Unnamed: '), col_names)):
        inner_call_params['names'] = str(col_names)
        inner_call_params['skiprows'] = "(skiprows and skiprows + 1) or 1"

    # the dtype parameter of the compiled function is not used at all; instead a
    # python dict of column dtypes is captured at compile time, because some
    # dtypes (like datetime) are converted, and also to avoid the penalty of
    # creating the dict in objmode
    inner_call_params['dtype'] = 'read_as_dtypes'

    params_str = '\n'.join([
        f"      {param}={inner_call_params.get(param, param)},"
        for param in used_read_csv_params
    ])
    func_text = '\n'.join([
        f"def {func_name}({signature}):",
        f"  with objmode(df=\"{df_type_repr}\"):",
        f"    df = pandas_read_csv(\n{params_str}", f"    )", f"  return df"
    ])

    global_vars = {
        'read_as_dtypes': py_col_dtypes,
        'objmode': objmode,
        'pandas_read_csv': pandas_read_csv,
    }

    return func_text, func_name, global_vars
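
The generated source is presumably turned into a callable by exec-ing func_text against global_vars and fetching the function by name, the usual pattern for text-templated Numba wrappers; a hedged sketch of such a driver (illustrative, not the project's actual caller):

func_text, func_name, global_vars = _gen_pandas_read_csv_func_text(
    col_names, col_typs, py_col_dtypes, usecols)

loc_vars = {}
exec(func_text, global_vars, loc_vars)  # defines csv_reader_py
csv_reader_py = loc_vars[func_name]     # objmode-wrapped reader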
Example 5
def unbox_dataframe(typ, val, c):
    """unbox dataframe to an empty DataFrame struct
    columns will be extracted later if necessary.
    """
    n_cols = len(typ.columns)
    column_strs = [
        numba.cpython.unicode.make_string_from_constant(
            c.context, c.builder, string_type, a) for a in typ.columns
    ]
    # create dataframe struct and store values
    dataframe = cgutils.create_struct_proxy(typ)(c.context, c.builder)

    errorptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit)

    col_list_type = types.List(string_type)
    ok, inst = listobj.ListInstance.allocate_ex(c.context, c.builder,
                                                col_list_type, n_cols)

    with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok):
        with if_ok:
            inst.size = c.context.get_constant(types.intp, n_cols)
            for i, column_str in enumerate(column_strs):
                inst.setitem(c.context.get_constant(types.intp, i),
                             column_str,
                             incref=False)
            dataframe.columns = inst.value

        with if_not_ok:
            c.builder.store(cgutils.true_bit, errorptr)

    # If an error occurred, drop the whole native list
    with c.builder.if_then(c.builder.load(errorptr)):
        c.context.nrt.decref(c.builder, col_list_type, inst.value)

    _, data_typs_map, types_order = get_structure_maps(typ.data, typ.columns)

    for col_typ in types_order:
        type_id, col_indices = data_typs_map[col_typ]
        n_type_cols = len(col_indices)
        list_type = types.List(col_typ)
        ok, inst = listobj.ListInstance.allocate_ex(c.context, c.builder,
                                                    list_type, n_type_cols)

        with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok):
            with if_ok:
                inst.size = c.context.get_constant(types.intp, n_type_cols)
                for i, col_idx in enumerate(col_indices):
                    series_obj = c.pyapi.object_getattr_string(
                        val, typ.columns[col_idx])
                    arr_obj = c.pyapi.object_getattr_string(
                        series_obj, "values")
                    ty_series = typ.data[col_idx]
                    # note: only numpy arrays and string series are handled
                    # here; any other column type would leave native_val unbound
                    if isinstance(ty_series, types.Array):
                        native_val = unbox_array(typ.data[col_idx], arr_obj, c)
                    elif ty_series == string_array_type:
                        native_val = unbox_str_series(string_array_type,
                                                      series_obj, c)

                    inst.setitem(c.context.get_constant(types.intp, i),
                                 native_val.value,
                                 incref=False)

                dataframe.data = c.builder.insert_value(
                    dataframe.data, inst.value, type_id)

            with if_not_ok:
                c.builder.store(cgutils.true_bit, errorptr)

        # If an error occurred, drop the whole native list
        with c.builder.if_then(c.builder.load(errorptr)):
            c.context.nrt.decref(c.builder, list_type, inst.value)

    index_obj = c.pyapi.object_getattr_string(val, "index")
    dataframe.index = _unbox_index_data(typ.index, index_obj, c).value
    c.pyapi.decref(index_obj)

    dataframe.parent = val

    # increase refcount of stored values
    if c.context.enable_nrt:
        # TODO: other objects?
        for var in column_strs:
            c.context.nrt.incref(c.builder, string_type, var)

    return NativeValue(dataframe._getvalue(),
                       is_error=c.builder.load(errorptr))
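
For context, unboxing runs whenever a pandas DataFrame crosses the Python-to-native boundary as an argument of a compiled function; a hedged sketch of such a trigger (the jitted body is illustrative):

import numba
import pandas as pd

@numba.njit
def consume(df):
    return 0  # body is irrelevant; the argument is unboxed regardless

df = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]})
consume(df)  # unbox_dataframe runs while lowering the df argument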