def csv_reader_infer_nb_pandas_type(
        filepath_or_buffer, delimiter=',', names=None, usecols=None,
        dtype=None, skiprows=None, parse_dates=False):
    """Infer the Numba DataFrame type of a CSV source from a sample of rows.

    Reads only the first block of the file (similarly to how Arrow does it)
    and derives a ``DataFrameType`` from the resulting pandas DataFrame.
    """
    # TO-DO: tune the block size or allow user configure it via env var
    sample_size = 1000
    sample_df = pd.read_csv(filepath_or_buffer,
                            delimiter=delimiter,
                            names=names,
                            usecols=usecols,
                            dtype=dtype,
                            skiprows=skiprows,
                            nrows=sample_size,
                            parse_dates=parse_dates)

    # fast path: the whole frame is typeable as-is
    try:
        return numba.typeof(sample_df)
    except ValueError:
        pass

    # fall back to per-column inference, mapping every column that
    # numba cannot type to a string array
    def _column_nb_type(col_name):
        try:
            return numba.typeof(sample_df[col_name]).data
        except ValueError:
            return string_array_type

    nb_col_types = tuple(_column_nb_type(name) for name in sample_df.columns)
    nb_col_names = tuple(sample_df.columns)
    column_loc, _, _ = get_structure_maps(nb_col_types, nb_col_names)
    return DataFrameType(nb_col_types, PositionalIndexType(), nb_col_names,
                         column_loc=column_loc)
def typeof_pd_dataframe(val, c):
    """Return the Numba ``DataFrameType`` for a concrete pandas DataFrame."""
    # TODO: support other types like string and timestamp
    names = tuple(val.columns.tolist())
    dtypes = get_hiframes_dtypes(val)
    idx_type = _infer_index_type(val.index)
    column_loc, _, _ = get_structure_maps(dtypes, names)
    return DataFrameType(dtypes, idx_type, names, True, column_loc=column_loc)
def unbox_dataframe(typ, val, c):
    """unbox dataframe to an empty DataFrame struct
    columns will be extracted later if necessary.

    Builds the native struct by grouping columns of the same Numba type into
    one typed list per type (layout given by get_structure_maps), unboxing
    each column's underlying array from the corresponding pandas Series.
    """
    # create dataframe struct and store values
    dataframe = cgutils.create_struct_proxy(typ)(c.context, c.builder)

    # single error flag shared by all list allocations below
    errorptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit)
    _, data_typs_map, types_order = get_structure_maps(typ.data, typ.columns)

    for col_typ in types_order:
        type_id, col_indices = data_typs_map[col_typ]
        n_type_cols = len(col_indices)
        list_type = types.List(col_typ)
        # allocate the native list that will hold all columns of this type
        ok, inst = listobj.ListInstance.allocate_ex(c.context, c.builder, list_type, n_type_cols)

        with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok):
            with if_ok:
                inst.size = c.context.get_constant(types.intp, n_type_cols)
                for i, col_idx in enumerate(col_indices):
                    # df.<column>.values gives the underlying array to unbox
                    series_obj = c.pyapi.object_getattr_string(val, typ.columns[col_idx])
                    arr_obj = c.pyapi.object_getattr_string(series_obj, "values")
                    ty_series = typ.data[col_idx]

                    # FIXME: CategoricalType has wrong dtype attribute value (i.e. dtype of codes)
                    # current implementation offers pd_dtype for this purpose, so use it
                    column_dtype = ty_series.pd_dtype if isinstance(ty_series, Categorical) else ty_series.dtype
                    native_val = _unbox_series_data(column_dtype, ty_series, arr_obj, c)

                    # ownership moves into the list, hence incref=False
                    inst.setitem(c.context.get_constant(types.intp, i), native_val.value, incref=False)

                    c.pyapi.decref(arr_obj)
                    c.pyapi.decref(series_obj)

                dataframe.data = c.builder.insert_value(dataframe.data, inst.value, type_id)

            with if_not_ok:
                c.builder.store(cgutils.true_bit, errorptr)

        # If an error occurred, drop the whole native list
        with c.builder.if_then(c.builder.load(errorptr)):
            c.context.nrt.decref(c.builder, list_type, inst.value)

    index_obj = c.pyapi.object_getattr_string(val, "index")
    dataframe.index = _unbox_index_data(typ.index, index_obj, c).value
    c.pyapi.decref(index_obj)

    # keep a link to the parent PyObject for lazy column extraction
    dataframe.parent = val

    return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr))
def _gen_pandas_read_csv_func_text(col_names, col_typs, py_col_dtypes, usecols, signature=None):
    """Generate the source of a jittable CSV-reader function.

    The generated function calls pandas_read_csv inside ``objmode`` with the
    DataFrame type pinned at compile time.

    Returns a tuple ``(func_text, func_name, global_vars)`` where global_vars
    are the names the generated code must see when exec'ed.
    """
    func_name = 'csv_reader_py'
    return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names

    column_loc, _, _ = get_structure_maps(col_typs, return_columns)
    df_type = DataFrameType(
        tuple(col_typs),
        types.none,
        tuple(col_names),
        column_loc=column_loc
    )

    df_type_repr = repr(df_type)
    # for some reason pandas and pyarrow read_csv() return CategoricalDtype with
    # ordered=False in case when dtype is with ordered=None
    df_type_repr = df_type_repr.replace('ordered=None', 'ordered=False')

    # TODO: support non-numpy types like strings
    date_inds = ", ".join(
        str(i) for i, t in enumerate(col_typs) if t.dtype == types.NPDatetime('ns'))

    if signature is None:
        signature = "filepath_or_buffer"

    # map generated func params into values used in inner call of pandas_read_csv
    # if no transformation is needed just use outer param name (since APIs match)
    # otherwise use value in the dictionary
    inner_call_params = {'parse_dates': f"[{date_inds}]"}
    used_read_csv_params = (
        'filepath_or_buffer',
        'names',
        'skiprows',
        'parse_dates',
        'dtype',
        'usecols',
        'sep',
        'delimiter'
    )

    # pyarrow reads unnamed header as " ", pandas reads it as "Unnamed: N"
    # during inference from file names should be replaced with "Unnamed: N"
    # passing names to pyarrow means that one row is header and should be skipped
    if col_names and any(map(lambda x: x.startswith('Unnamed: '), col_names)):
        inner_call_params['names'] = str(col_names)
        inner_call_params['skiprows'] = "(skiprows and skiprows + 1) or 1"

    # dtype parameter of compiled function is not used at all, instead a python dict
    # of columns dtypes is captured at compile time, because some dtypes (like datetime)
    # are converted and also to avoid penalty of creating dict in objmode
    inner_call_params['dtype'] = 'read_as_dtypes'

    params_str = '\n'.join([
        f"      {param}={inner_call_params.get(param, param)}," for param in used_read_csv_params
    ])
    func_text = '\n'.join([
        f"def {func_name}({signature}):",
        f"  with objmode(df=\"{df_type_repr}\"):",
        f"    df = pandas_read_csv(\n{params_str}",
        f"    )",
        f"  return df"
    ])

    global_vars = {
        'read_as_dtypes': py_col_dtypes,
        'objmode': objmode,
        'pandas_read_csv': pandas_read_csv,
    }

    return func_text, func_name, global_vars
def unbox_dataframe(typ, val, c):
    """unbox dataframe to an empty DataFrame struct
    columns will be extracted later if necessary.
    """
    n_cols = len(typ.columns)
    # materialize the column names as native unicode constants
    column_strs = [
        numba.cpython.unicode.make_string_from_constant(
            c.context, c.builder, string_type, a) for a in typ.columns
    ]
    # create dataframe struct and store values
    dataframe = cgutils.create_struct_proxy(typ)(c.context, c.builder)

    # single error flag shared by all list allocations below
    errorptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit)

    # build the native list holding the column names
    col_list_type = types.List(string_type)
    ok, inst = listobj.ListInstance.allocate_ex(c.context, c.builder, col_list_type, n_cols)
    with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok):
        with if_ok:
            inst.size = c.context.get_constant(types.intp, n_cols)
            for i, column_str in enumerate(column_strs):
                # ownership moves into the list, hence incref=False
                inst.setitem(c.context.get_constant(types.intp, i), column_str, incref=False)
            dataframe.columns = inst.value
        with if_not_ok:
            c.builder.store(cgutils.true_bit, errorptr)
    # If an error occurred, drop the whole native list
    with c.builder.if_then(c.builder.load(errorptr)):
        c.context.nrt.decref(c.builder, col_list_type, inst.value)

    # columns of the same Numba type are grouped into one typed list per type;
    # layout comes from get_structure_maps
    _, data_typs_map, types_order = get_structure_maps(typ.data, typ.columns)

    for col_typ in types_order:
        type_id, col_indices = data_typs_map[col_typ]
        n_type_cols = len(col_indices)
        list_type = types.List(col_typ)
        ok, inst = listobj.ListInstance.allocate_ex(c.context, c.builder, list_type, n_type_cols)

        with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok):
            with if_ok:
                inst.size = c.context.get_constant(types.intp, n_type_cols)
                for i, col_idx in enumerate(col_indices):
                    # df.<column>.values gives the underlying array to unbox
                    series_obj = c.pyapi.object_getattr_string(
                        val, typ.columns[col_idx])
                    arr_obj = c.pyapi.object_getattr_string(
                        series_obj, "values")

                    ty_series = typ.data[col_idx]
                    # NOTE(review): if ty_series is neither types.Array nor
                    # string_array_type, native_val is never assigned and this
                    # raises UnboundLocalError — presumably the type system
                    # guarantees only these two cases reach here; confirm.
                    if isinstance(ty_series, types.Array):
                        native_val = unbox_array(typ.data[col_idx], arr_obj, c)
                    elif ty_series == string_array_type:
                        native_val = unbox_str_series(string_array_type, series_obj, c)

                    # ownership moves into the list, hence incref=False
                    inst.setitem(c.context.get_constant(types.intp, i),
                                 native_val.value, incref=False)

                dataframe.data = c.builder.insert_value(
                    dataframe.data, inst.value, type_id)

            with if_not_ok:
                c.builder.store(cgutils.true_bit, errorptr)

        # If an error occurred, drop the whole native list
        with c.builder.if_then(c.builder.load(errorptr)):
            c.context.nrt.decref(c.builder, list_type, inst.value)

    index_obj = c.pyapi.object_getattr_string(val, "index")
    dataframe.index = _unbox_index_data(typ.index, index_obj, c).value
    c.pyapi.decref(index_obj)

    # keep a link to the parent PyObject for lazy column extraction
    dataframe.parent = val

    # increase refcount of stored values
    if c.context.enable_nrt:
        # TODO: other objects?
        for var in column_strs:
            c.context.nrt.incref(c.builder, string_type, var)

    return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr))