def test_jit_explicit_signature(self):
    """An explicit datetime signature is honored by @jit, whether it is
    given as a types object or as its string representation.
    """
    def verify(sig):
        compiled = jit(sig, nopython=True)(add_usecase)
        # Sanity check against the pure-Python result
        sample_args = (DT(1, 'ms'), TD(2, 'us'))
        self.assertPreciseEqual(compiled(*sample_args),
                                add_usecase(*sample_args))

    # Signature passed in object form
    object_sig = types.NPDatetime('us')(types.NPDatetime('ms'),
                                        types.NPTimedelta('us'))
    verify(object_sig)
    # Same signature passed in string form
    verify("NPDatetime('us')(NPDatetime('ms'), NPTimedelta('us'))")
def _get_pd_dtype_str(t):
    """Return the source text of the pandas ``dtype`` to read column type *t* with.

    Categorical columns map to their pandas dtype expression, datetime and
    string columns are read as plain ``str`` (datetime values are parsed
    later), everything else maps to the matching numpy scalar type.
    """
    dtype = t.dtype
    if isinstance(t, Categorical):
        return 'pd.{}'.format(t.pd_dtype)
    if dtype == types.NPDatetime('ns'):
        # Read datetimes as strings.  The previous code fell through and
        # emitted 'np.str', an alias of builtin str that was removed in
        # numpy 1.24 -- emit plain 'str' instead (same semantics).
        return 'str'
    if t == string_array_type:
        return 'str'
    return 'np.{}'.format(dtype)
def generic(self, args, kws):
    """Type a datetime64 op timedelta64 binary operation (datetime on the
    left); returns None when the operands or their units don't combine.
    """
    if len(args) == 1:
        # Guard against unary -
        return
    lhs, rhs = args
    if not (isinstance(lhs, types.NPDatetime)
            and isinstance(rhs, types.NPTimedelta)):
        return
    result_unit = npdatetime_helpers.combine_datetime_timedelta_units(
        lhs.unit, rhs.unit)
    if result_unit is None:
        return
    return signature(types.NPDatetime(result_unit), lhs, rhs)
def _from_datetime_dtype(dtype):
    """Map a numpy datetime64/timedelta64 dtype to the matching numba type.

    Raises NotImplementedError for dtypes the regex does not recognize.
    """
    match = re_datetimestr.match(dtype.str)
    if match is None:
        raise NotImplementedError(dtype)
    typecode = match.group(1)
    unit = match.group(3) or ''
    if typecode == 'm':
        return types.NPTimedelta(unit)
    if typecode == 'M':
        return types.NPDatetime(unit)
    raise NotImplementedError(dtype)
def make_datetime_specific(outputs, dt_unit, td_unit):
    """Replace unitless NPDatetime entries in *outputs* with ones carrying
    the unit obtained by combining *dt_unit* and *td_unit*; other entries
    pass through unchanged.

    Raises TypeError when the two units cannot be combined.
    """
    specific = []
    for output in outputs:
        if not (isinstance(output, types.NPDatetime) and output.unit == ""):
            specific.append(output)
            continue
        combined = npdatetime_helpers.combine_datetime_timedelta_units(
            dt_unit, td_unit)
        if combined is None:
            raise TypeError(f"ufunc '{ufunc_name}' is not supported between "
                            f"datetime64[{dt_unit}] and timedelta64[{td_unit}]")
        specific.append(types.NPDatetime(combined))
    return specific
def _gen_csv_reader_py_pyarrow_func_text_core(col_names, col_typs, dtype_present,
                                              usecols, signature=None):
    """Build the source text of a ``csv_reader_py`` function that calls
    pandas_read_csv inside objmode and unpacks each returned column into a
    separate array variable.

    Returns a ``(func_text, func_name)`` pair; the caller is expected to
    exec the text.

    NOTE(review): the indentation inside the generated code strings was
    reconstructed from a whitespace-mangled source -- verify against the
    original file.
    """
    # TODO: support non-numpy types like strings
    # Column indices that hold datetime data, to be passed as parse_dates.
    date_inds = ", ".join(str(i) for i, t in enumerate(col_typs)
                          if t.dtype == types.NPDatetime('ns'))
    return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names
    # objmode annotations: one <varname>="<numba type>" entry per column
    nb_objmode_vars = ", ".join([
        '{}="{}"'.format(to_varname(cname), _get_dtype_str(t))
        for cname, t in zip(return_columns, col_typs)
    ])
    # pandas dtype dict entries, hardcoded into the generated source
    pd_dtype_strs = ", ".join([
        "'{}': {}".format(cname, _get_pd_dtype_str(t))
        for cname, t in zip(return_columns, col_typs)
    ])
    if signature is None:
        signature = "filepath_or_buffer"
    func_text = "def csv_reader_py({}):\n".format(signature)
    func_text += "    with objmode({}):\n".format(nb_objmode_vars)
    func_text += "        df = pandas_read_csv(filepath_or_buffer,\n"
    # pyarrow reads unnamed header as " ", pandas reads it as "Unnamed: N"
    # during inference from file names should be replaced with "Unnamed: N"
    # passing names to pyarrow means that one row is header and should be skipped
    if col_names and any(map(lambda x: x.startswith('Unnamed: '), col_names)):
        func_text += "            names={},\n".format(col_names)
        func_text += "            skiprows=(skiprows and skiprows + 1) or 1,\n"
    else:
        func_text += "            names=names,\n"
        func_text += "            skiprows=skiprows,\n"
    func_text += "            parse_dates=[{}],\n".format(date_inds)
    # Python objects (e.g. str, np.float) could not be jitted and passed to objmode
    # so they are hardcoded to function
    # func_text += "            dtype={{{}}},\n".format(pd_dtype_strs) if dtype_present else \
    #     "            dtype=dtype,\n"
    # dtype is hardcoded because datetime should be read as string
    func_text += "            dtype={{{}}},\n".format(pd_dtype_strs)
    func_text += "            usecols=usecols,\n"
    func_text += "            sep=sep,\n"
    func_text += "            delimiter=delimiter,\n"
    func_text += "        )\n"
    # Unpack each column into its own (objmode-annotated) array variable.
    for cname in return_columns:
        func_text += "        {} = df['{}'].values\n".format(to_varname(cname), cname)
        # func_text += "        print({})\n".format(cname)
    return func_text, 'csv_reader_py'
def _get_dtype_str(t):
    """Return the numba type string used for *t* in an objmode annotation."""
    dtype_repr = t.dtype

    if isinstance(t, Categorical):
        # return categorical representation
        # for some reason pandas and pyarrow read_csv() return CategoricalDtype with
        # ordered=False in case when dtype is with ordered=None
        return str(t).replace('ordered=None', 'ordered=False')

    if dtype_repr == types.NPDatetime('ns'):
        dtype_repr = 'NPDatetime("ns")'

    if t == string_array_type:
        # HACK: add string_array_type to numba.types
        # FIXME: fix after Numba #3372 is resolved
        types.string_array_type = string_array_type
        return 'string_array_type'

    return f'{dtype_repr}[::1]'
def generic(self, args, kws):
    """Type a datetime64/timedelta64 binary operation where the operands
    may appear in either order; returns None when they don't combine.
    """
    if len(args) == 1:
        # Guard against unary +
        return
    left, right = args
    # Work out which operand is the timedelta (checking right first,
    # exactly as the original resolution order did).
    if isinstance(right, types.NPTimedelta):
        dt, td = left, right
    elif isinstance(left, types.NPTimedelta):
        dt, td = right, left
    else:
        return
    if not isinstance(dt, types.NPDatetime):
        return
    unit = npdatetime_helpers.combine_datetime_timedelta_units(
        dt.unit, td.unit)
    if unit is not None:
        return signature(types.NPDatetime(unit), left, right)
def test_call_notation(self):
    """Calling a type works both as a signature builder (when called with
    types) and as a value cast (when called with values).
    """
    int32_t = types.int32
    float64_t = types.double
    # Function call signature
    self.assertEqual(int32_t(), typing.signature(int32_t))
    self.assertEqual(int32_t(float64_t), typing.signature(int32_t, float64_t))
    self.assertEqual(int32_t(float64_t, float64_t),
                     typing.signature(int32_t, float64_t, float64_t))
    # Value cast
    self.assertPreciseEqual(int32_t(42.5), 42)
    self.assertPreciseEqual(float64_t(-5), -5.0)
    dt_year = types.NPDatetime('Y')
    self.assertPreciseEqual(dt_year('1900'), np.datetime64('1900', 'Y'))
    self.assertPreciseEqual(dt_year('NaT'), np.datetime64('NaT', 'Y'))
    td_seconds = types.NPTimedelta('s')
    self.assertPreciseEqual(td_seconds(5), np.timedelta64(5, 's'))
    self.assertPreciseEqual(td_seconds('NaT'), np.timedelta64('NaT', 's'))
    td_generic = types.NPTimedelta('')
    self.assertPreciseEqual(td_generic(5), np.timedelta64(5))
    self.assertPreciseEqual(td_generic('NaT'), np.timedelta64('NaT'))
def test_call_notation(self):
    """Exercise type-call notation: signature creation and value casting."""
    i = types.int32
    d = types.double
    # Signature-building form: type called with types
    for sig_args, expected in [
        ((), typing.signature(i)),
        ((d,), typing.signature(i, d)),
        ((d, d), typing.signature(i, d, d)),
    ]:
        self.assertEqual(i(*sig_args), expected)
    # Casting form: type called with a value
    self.assertPreciseEqual(i(42.5), 42)
    self.assertPreciseEqual(d(-5), -5.0)
    for ty, cases in [
        (types.NPDatetime("Y"), [("1900", np.datetime64("1900", "Y")),
                                 ("NaT", np.datetime64("NaT", "Y"))]),
        (types.NPTimedelta("s"), [(5, np.timedelta64(5, "s")),
                                  ("NaT", np.timedelta64("NaT", "s"))]),
        (types.NPTimedelta(""), [(5, np.timedelta64(5)),
                                 ("NaT", np.timedelta64("NaT"))]),
    ]:
        for value, expected in cases:
            self.assertPreciseEqual(ty(value), expected)
def test_atomic_types(self):
    """NPDatetime/NPTimedelta types must survive a pickling round-trip."""
    for unit in ('M', 'ms'):
        self.check_pickling(types.NPDatetime(unit))
        self.check_pickling(types.NPTimedelta(unit))
def test_ufunc_find_matching_loop(self):
    """Exercise numpy_support.ufunc_find_matching_loop() over exact type
    matches, implicit casts, datetime64/timedelta64 combinations and
    no-match cases, using fake ufuncs with fixed loop tables.
    """
    f = numpy_support.ufunc_find_matching_loop
    np_add = FakeUFunc(_add_types)
    np_mul = FakeUFunc(_mul_types)
    np_isnan = FakeUFunc(_isnan_types)
    np_sqrt = FakeUFunc(_sqrt_types)

    def check(ufunc, input_types, sigs, output_types=()):
        """
        Check that ufunc_find_matching_loop() finds one of the given
        *sigs* for *ufunc*, *input_types* and optional *output_types*.
        """
        loop = f(ufunc, input_types + output_types)
        self.assertTrue(loop)
        if isinstance(sigs, str):
            sigs = (sigs,)
        self.assertIn(
            loop.ufunc_sig,
            sigs,
            "inputs=%s and outputs=%s should have selected one of %s, got %s"
            % (input_types, output_types, sigs, loop.ufunc_sig),
        )
        self.assertEqual(len(loop.numpy_inputs), len(loop.inputs))
        self.assertEqual(len(loop.numpy_outputs), len(loop.outputs))
        if not output_types:
            # Add explicit outputs and check the result is the same
            loop_explicit = f(ufunc, list(input_types) + loop.outputs)
            self.assertEqual(loop_explicit, loop)
        else:
            self.assertEqual(loop.outputs, list(output_types))
        # Round-tripping inputs and outputs
        loop_rt = f(ufunc, loop.inputs + loop.outputs)
        self.assertEqual(loop_rt, loop)
        return loop

    def check_exact(ufunc, input_types, sigs, output_types=()):
        """
        Like check(), but also ensure no casting of inputs occurred.
        """
        loop = check(ufunc, input_types, sigs, output_types)
        self.assertEqual(loop.inputs, list(input_types))

    def check_no_match(ufunc, input_types):
        # No loop should be selected for these inputs at all.
        loop = f(ufunc, input_types)
        self.assertIs(loop, None)

    # Exact matching for number types
    check_exact(np_add, (types.bool_, types.bool_), "??->?")
    check_exact(np_add, (types.int8, types.int8), "bb->b")
    check_exact(np_add, (types.uint8, types.uint8), "BB->B")
    check_exact(np_add, (types.int64, types.int64), ("ll->l", "qq->q"))
    check_exact(np_add, (types.uint64, types.uint64), ("LL->L", "QQ->Q"))
    check_exact(np_add, (types.float32, types.float32), "ff->f")
    check_exact(np_add, (types.float64, types.float64), "dd->d")
    check_exact(np_add, (types.complex64, types.complex64), "FF->F")
    check_exact(np_add, (types.complex128, types.complex128), "DD->D")

    # Exact matching for datetime64 and timedelta64 types
    check_exact(
        np_add,
        (types.NPTimedelta("s"), types.NPTimedelta("s")),
        "mm->m",
        output_types=(types.NPTimedelta("s"),),
    )
    check_exact(
        np_add,
        (types.NPTimedelta("ms"), types.NPDatetime("s")),
        "mM->M",
        output_types=(types.NPDatetime("ms"),),
    )
    check_exact(
        np_add,
        (types.NPDatetime("s"), types.NPTimedelta("s")),
        "Mm->M",
        output_types=(types.NPDatetime("s"),),
    )
    check_exact(
        np_mul,
        (types.NPTimedelta("s"), types.int64),
        "mq->m",
        output_types=(types.NPTimedelta("s"),),
    )
    check_exact(
        np_mul,
        (types.float64, types.NPTimedelta("s")),
        "dm->m",
        output_types=(types.NPTimedelta("s"),),
    )

    # Mix and match number types, with casting
    check(np_add, (types.bool_, types.int8), "bb->b")
    check(np_add, (types.uint8, types.bool_), "BB->B")
    check(np_add, (types.int16, types.uint16), "ii->i")
    check(np_add, (types.complex64, types.float64), "DD->D")
    check(np_add, (types.float64, types.complex64), "DD->D")
    # Integers, when used together with floating-point numbers,
    # should cast to any real or complex (see #2006)
    int_types = [types.int32, types.uint32, types.int64, types.uint64]
    for intty in int_types:
        check(np_add, (types.float32, intty), "ff->f")
        check(np_add, (types.float64, intty), "dd->d")
        check(np_add, (types.complex64, intty), "FF->F")
        check(np_add, (types.complex128, intty), "DD->D")
    # However, when used alone, they should cast only to
    # floating-point types of sufficient precision
    # (typical use case: np.sqrt(2) should give an accurate enough value)
    for intty in int_types:
        check(np_sqrt, (intty,), "d->d")
        check(np_isnan, (intty,), "d->?")

    # With some timedelta64 arguments as well
    check(
        np_mul,
        (types.NPTimedelta("s"), types.int32),
        "mq->m",
        output_types=(types.NPTimedelta("s"),),
    )
    check(
        np_mul,
        (types.NPTimedelta("s"), types.uint32),
        "mq->m",
        output_types=(types.NPTimedelta("s"),),
    )
    check(
        np_mul,
        (types.NPTimedelta("s"), types.float32),
        "md->m",
        output_types=(types.NPTimedelta("s"),),
    )
    check(
        np_mul,
        (types.float32, types.NPTimedelta("s")),
        "dm->m",
        output_types=(types.NPTimedelta("s"),),
    )

    # No match
    check_no_match(np_add, (types.NPDatetime("s"), types.NPDatetime("s")))
    # No implicit casting from int64 to timedelta64 (Numpy would allow
    # this).
    check_no_match(np_add, (types.NPTimedelta("s"), types.int64))
def _gen_pandas_read_csv_func_text(col_names, col_typs, py_col_dtypes, usecols,
                                   signature=None):
    """Generate the source of a ``csv_reader_py`` function that reads a whole
    DataFrame via pandas_read_csv inside objmode.

    Returns ``(func_text, func_name, global_vars)``; *global_vars* must be
    supplied to exec() when compiling *func_text*.

    NOTE(review): the indentation inside the generated f-strings was
    reconstructed from a whitespace-mangled source -- verify against the
    original file.
    """
    func_name = 'csv_reader_py'
    return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names

    column_loc, _, _ = get_structure_maps(col_typs, return_columns)
    df_type = DataFrameType(tuple(col_typs),
                            types.none,
                            tuple(col_names),
                            column_loc=column_loc)

    df_type_repr = repr(df_type)
    # for some reason pandas and pyarrow read_csv() return CategoricalDtype with
    # ordered=False in case when dtype is with ordered=None
    df_type_repr = df_type_repr.replace('ordered=None', 'ordered=False')

    # TODO: support non-numpy types like strings
    # Column indices with datetime data, passed as parse_dates below.
    date_inds = ", ".join(
        str(i) for i, t in enumerate(col_typs) if t.dtype == types.NPDatetime('ns'))

    # NOTE(review): duplicate of the assignment above -- redundant.
    return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names

    if signature is None:
        signature = "filepath_or_buffer"

    # map generated func params into values used in inner call of pandas_read_csv
    # if no transformation is needed just use outer param name (since APIs match)
    # otherwise use value in the dictionary
    inner_call_params = {'parse_dates': f"[{date_inds}]"}
    used_read_csv_params = ('filepath_or_buffer',
                            'names',
                            'skiprows',
                            'parse_dates',
                            'dtype',
                            'usecols',
                            'sep',
                            'delimiter')

    # pyarrow reads unnamed header as " ", pandas reads it as "Unnamed: N"
    # during inference from file names should be replaced with "Unnamed: N"
    # passing names to pyarrow means that one row is header and should be skipped
    if col_names and any(map(lambda x: x.startswith('Unnamed: '), col_names)):
        inner_call_params['names'] = str(col_names)
        inner_call_params['skiprows'] = "(skiprows and skiprows + 1) or 1"

    # dtype parameter of compiled function is not used at all, instead a python dict
    # of columns dtypes is captured at compile time, because some dtypes (like datetime)
    # are converted and also to avoid penalty of creating dict in objmode
    inner_call_params['dtype'] = 'read_as_dtypes'

    params_str = '\n'.join([
        f"            {param}={inner_call_params.get(param, param)},"
        for param in used_read_csv_params
    ])
    func_text = '\n'.join([
        f"def {func_name}({signature}):",
        f"    with objmode(df=\"{df_type_repr}\"):",
        f"        df = pandas_read_csv(\n{params_str}",
        f"        )",
        f"    return df"
    ])

    global_vars = {
        'read_as_dtypes': py_col_dtypes,
        'objmode': objmode,
        'pandas_read_csv': pandas_read_csv,
    }

    return func_text, func_name, global_vars