def _gen_column_shift_pct(self, out_var, args, col_var, func):
    """Generate IR nodes implementing Series.shift() or Series.pct_change()
    on column `col_var`, storing the result in `out_var`.

    The computation is expressed as a stencil kernel built from a generated
    source string, followed by setitem nodes that fill the leading border
    elements (which have no valid shifted neighbor) with NaN.

    func: either 'shift' or 'pct_change'
    args: call arguments; args[0] is the shift amount (must be constant).
          For 'pct_change' it is optional and defaults to 1.
    Returns the list of stencil IR nodes plus the border-fill nodes.
    """
    loc = col_var.loc
    if func == 'pct_change':
        # default shift of 1 period when no argument is given
        shift_const = 1
        if args:
            shift_const = get_constant(self.func_ir, args[0])
            assert shift_const is not NOT_CONSTANT
        # kernel: (current - shifted) / shifted; negative index reaches back
        func_text = 'def g(a):\n return (a[0]-a[{}])/a[{}]\n'.format(
            -shift_const, -shift_const)
    else:
        assert func == 'shift'
        shift_const = get_constant(self.func_ir, args[0])
        assert shift_const is not NOT_CONSTANT
        # kernel: plain relative access to the shifted element
        func_text = 'def g(a):\n return a[{}]\n'.format(-shift_const)
    loc_vars = {}
    exec(func_text, {}, loc_vars)
    kernel_func = loc_vars['g']
    index_offsets = [0]
    # kernel is compiled against the caller function's globals
    fir_globals = self.func_ir.func_id.func.__globals__
    stencil_nodes = gen_stencil_call(col_var, out_var, kernel_func,
                                     index_offsets, fir_globals)
    # first `shift_const` outputs have no source element; set them to NaN
    # NOTE(review): assumes shift_const is positive — TODO confirm negative
    # shifts are rejected/unsupported upstream
    border_text = 'def f(A):\n A[0:{}] = np.nan\n'.format(shift_const)
    loc_vars = {}
    exec(border_text, {}, loc_vars)
    border_func = loc_vars['f']
    f_blocks = compile_to_numba_ir(border_func, {'np': np}).blocks
    block = f_blocks[min(f_blocks.keys())]
    replace_arg_nodes(block, [out_var])
    setitem_nodes = block.body[:-3]  # remove none return
    return stencil_nodes + setitem_nodes
def _handle_rolling_setup(self, lhs, rhs):
    """Recognize a Series rolling-window setup call such as
    ``r = df.column.rolling(3)``.

    On a match, records [column var, window, center] in self.rolling_calls
    keyed by the result variable name and returns [] so the call is removed
    from the IR. Returns None when the call is not a rolling() setup.
    """
    defn = guard(get_definition, self.func_ir, rhs.func)
    assert defn is not None
    # rare case: the bound method was assigned to another variable; chase
    # the alias once and retry on the resolved definition
    if isinstance(defn, ir.Var):
        rhs.func = defn
        return self._handle_rolling_setup(lhs, rhs)
    # match the getattr pattern df.column.rolling
    is_rolling_call = (isinstance(defn, ir.Expr)
                       and defn.op == 'getattr'
                       and defn.value.name in self.df_cols
                       and defn.attr == 'rolling')
    if not is_rolling_call:
        return None
    keywords = dict(rhs.kws)
    if rhs.args:
        win = rhs.args[0]
    elif 'window' in keywords:
        win = keywords['window']
    else:
        raise ValueError("window argument to rolling() required")
    # resolve to a constant if possible, otherwise keep the variable
    win = get_constant(self.func_ir, win, win)
    is_center = False
    if 'center' in keywords:
        is_center = get_constant(self.func_ir, keywords['center'], is_center)
    self.rolling_calls[lhs.name] = [defn.value, win, is_center]
    return []  # remove
def _handle_merge(self, lhs, rhs):
    """Transform a ``pd.merge(left, right, ...)`` call into a hiframes
    Join node.

    Resolves the join key(s) from 'on' or 'left_on'/'right_on' keyword
    arguments (which must be constant), allocates fresh output variables
    for every column of both input dataframes, and returns the Join node.
    Returns None when the call is not pandas.merge.

    Raises ValueError when required arguments are missing or keys are not
    constant.
    """
    if guard(find_callname, self.func_ir, rhs) != ('merge', 'pandas'):
        return None
    if len(rhs.args) < 2:
        raise ValueError("left and right arguments required for merge")
    left_df = rhs.args[0]
    right_df = rhs.args[1]
    kws = dict(rhs.kws)
    if 'on' in kws:
        left_on = get_constant(self.func_ir, kws['on'], None)
        right_on = left_on
    else:  # pragma: no cover
        if 'left_on' not in kws or 'right_on' not in kws:
            # BUG FIX: the two implicitly-concatenated literals were missing
            # a separating space ("...'right_on'arguments required")
            raise ValueError("merge 'on' or 'left_on'/'right_on' "
                             "arguments required")
        left_on = get_constant(self.func_ir, kws['left_on'], None)
        right_on = get_constant(self.func_ir, kws['right_on'], None)
    if left_on is None or right_on is None:
        raise ValueError("merge key values should be constant strings")
    scope = lhs.scope
    loc = lhs.loc
    self.df_vars[lhs.name] = {}
    # the output dataframe gets a fresh variable for every column of both
    # inputs (left first, then right — matching the original copy order)
    for df in (left_df, right_df):
        for col in self.df_vars[df.name]:
            self.df_vars[lhs.name][col] = ir.Var(scope, mk_unique_var(col), loc)
    self._update_df_cols()
    return [hiframes_join.Join(lhs.name, left_df.name, right_df.name,
                               left_on, right_on, self.df_vars, lhs.loc)]
def _handle_str_contains(self, lhs, rhs):
    """Handle string contains calls like:
    ``B = df.column.str.contains('oo*', regex=True)``

    Compiles a small wrapper around the appropriate hiframes_api kernel
    (regex or plain substring) and retargets its result to `lhs`.
    Returns the generated IR nodes, or None when the call does not match.
    """
    defn = guard(get_definition, self.func_ir, rhs.func)
    assert defn is not None
    # rare case: the bound method was aliased to another variable; chase
    # the alias and retry
    if isinstance(defn, ir.Var):
        rhs.func = defn
        return self._handle_str_contains(lhs, rhs)
    series_col = guard(self._get_str_contains_col, defn)
    if series_col is None:
        return None
    keywords = dict(rhs.kws)
    pattern = rhs.args[0]
    use_regex = True  # default regex arg is True
    if 'regex' in keywords:
        use_regex = get_constant(self.func_ir, keywords['regex'], use_regex)
    if use_regex:
        def f(str_arr, pat):
            hpat.hiframes_api.str_contains_regex(str_arr, pat)
    else:
        def f(str_arr, pat):
            hpat.hiframes_api.str_contains_noregex(str_arr, pat)
    f_block = compile_to_numba_ir(f, {'hpat': hpat}).blocks.popitem()[1]
    replace_arg_nodes(f_block, [series_col, pattern])
    out_nodes = f_block.body[:-3]  # remove none return
    # redirect the kernel call's result to the caller's target variable
    out_nodes[-1].target = lhs
    return out_nodes
def _get_dset_type(self, lhs, file_var, dset_var):
    """Get the data set array type from user-specified locals types or,
    failing that, by inspecting the actual HDF5 file.

    Requires both the file name and dataset name to be constant when
    falling back to file inspection.

    Raises RuntimeError when the type cannot be determined.
    """
    if lhs in self.local_vars:
        return self.local_vars[lhs]
    if self.reverse_copies[lhs] in self.local_vars:
        return self.local_vars[self.reverse_copies[lhs]]
    # read type from file if file name and dset name are constant values
    # TODO: check for file availability
    file_name = get_constant(self.func_ir, file_var)
    dset_str = get_constant(self.func_ir, dset_var)
    if file_name is not NOT_CONSTANT and dset_str is not NOT_CONSTANT:
        # BUG FIX: the file handle was never closed; use a context manager
        # so the HDF5 handle is released even if dataset lookup raises
        with h5py.File(file_name, "r") as f:
            ndims = len(f[dset_str].shape)
            numba_dtype = numpy_support.from_dtype(f[dset_str].dtype)
        return types.Array(numba_dtype, ndims, 'C')
    raise RuntimeError("data set type not found")
def _process_df_build_map(self, items_list):
    """Build a {column_name: value_var} dict from build_map items.

    Each item is a (name, value) pair where the name is either a plain
    string or a variable that must resolve to a constant string.

    Raises ValueError when a column name is not a constant.
    """
    col_map = {}
    for name_part, value_var in items_list:
        if isinstance(name_part, str):
            name = name_part
        else:
            name = get_constant(self.func_ir, name_part)
        if name is NOT_CONSTANT:
            raise ValueError("data frame column names should be constant")
        col_map[name] = value_var
    return col_map
def gen_xe_init_from_uri(func_ir, dset_name_var):
    """Resolve a constant Xenon URI of the form "address/dataset", fetch its
    schema, and generate the connection/open init nodes.

    Returns (out_nodes, col_names, col_types, xe_connect_var, xe_dset_var).

    Raises ValueError when the URI is not a constant string or is malformed.
    """
    uri = get_constant(func_ir, dset_name_var)
    if uri is NOT_CONSTANT:
        raise ValueError("Xenon dataset should be a constant string")
    # exactly one separator: "address/dataset"
    if uri.count("/") != 1:
        raise ValueError("invalid Xenon address {}".format(uri))
    address, dset_name = uri.split("/")
    from .. import hxe_ext
    schema = hxe_ext.get_schema(address, dset_name)
    col_names, col_types = parse_xe_schema(schema)
    out_nodes, xe_connect_var, xe_dset_var = gen_init_xenon(address, dset_name)
    return out_nodes, col_names, col_types, xe_connect_var, xe_dset_var
def _gen_fillna(self, out_var, args, col_var, kws):
    """Generate IR nodes for Series.fillna() on column `col_var`.

    args[0] is the fill value. With inplace=True the input array is reused
    as the output; otherwise an empty_like allocation is generated first.

    Raises ValueError when the inplace argument is not a constant.
    Returns the allocation nodes (if any) followed by the fillna call nodes.
    """
    inplace = False
    if 'inplace' in kws:
        inplace = get_constant(self.func_ir, kws['inplace'])
        # BUG FIX: NOT_CONSTANT is a sentinel — compare with identity (`is`),
        # matching every other sentinel check in this file, not equality
        if inplace is NOT_CONSTANT:
            raise ValueError("inplace arg to fillna should be constant")
    if inplace:
        out_var = col_var  # output array is same as input array
        alloc_nodes = []
    else:
        alloc_nodes = gen_empty_like(col_var, out_var)
    val = args[0]

    def f(A, B, fill):
        hpat.hiframes_api.fillna(A, B, fill)
    f_block = compile_to_numba_ir(f, {'hpat': hpat}).blocks.popitem()[1]
    replace_arg_nodes(f_block, [out_var, col_var, val])
    nodes = f_block.body[:-3]  # remove none return
    return alloc_nodes + nodes
def _handle_read(assign, lhs, rhs, func_ir):
    """Transform a read_xenon() call into IR nodes that read every column.

    Two call forms are accepted:
      - read_xenon(uri): one constant "address/dataset" string; a connection
        is opened here and closed at the end.
      - read_xenon(connect, dset, schema): an existing connection/dataset
        pair plus a constant schema string; the caller owns the connection.

    Returns (col_items, out_nodes) where col_items is a list of
    (column_name, column_var) pairs and out_nodes are the generated IR nodes.

    Raises ValueError when Xenon support is unavailable, the argument count
    is wrong, or the schema is not constant.
    """
    if not hpat.config._has_xenon:
        raise ValueError("Xenon support not available")
    # TODO: init only once
    # register the native entry points with llvmlite so generated code can
    # link against them
    from .. import hxe_ext
    ll.add_symbol('get_column_size_xenon', hxe_ext.get_column_size_xenon)
    ll.add_symbol('c_read_xenon', hxe_ext.read_xenon_col)
    ll.add_symbol('c_read_xenon_parallel', hxe_ext.read_xenon_col_parallel)
    ll.add_symbol('c_read_xenon_col_str', hxe_ext.read_xenon_col_str)
    ll.add_symbol('c_read_xenon_col_str_parallel',
                  hxe_ext.read_xenon_col_str_parallel)
    ll.add_symbol('c_xe_connect', hxe_ext.c_xe_connect)
    ll.add_symbol('c_xe_open', hxe_ext.c_xe_open)
    ll.add_symbol('c_xe_close', hxe_ext.c_xe_close)
    if len(rhs.args) not in [1, 3]:
        raise ValueError(
            "read_xenon expects one or three argument but received {}".format(
                len(rhs.args)))
    if len(rhs.args) == 1:
        # URI form: resolve address/dataset and open the connection here
        out_nodes, col_names, col_types, xe_connect_var, xe_dset_var = \
            gen_xe_init_from_uri(func_ir, rhs.args[0])
    else:
        assert len(rhs.args) == 3
        # pre-opened form: connection and dataset variables supplied by caller
        xe_connect_var = rhs.args[0]
        xe_dset_var = rhs.args[1]
        schema = get_constant(func_ir, rhs.args[2])
        if schema is NOT_CONSTANT:
            raise ValueError("Xenon schema should be a constant string")
        col_names, col_types = parse_xe_schema(schema)
        out_nodes = []
    # generate array of schema types
    xe_typs = [str(get_xe_typ_enum(c_type)) for c_type in col_types]
    xe_typs_str = "np.array([" + ",".join(xe_typs) + "])"
    func_text = 'def f():\n schema_arr = {}\n'.format(xe_typs_str)
    loc_vars = {}
    exec(func_text, {}, loc_vars)
    schm_func = loc_vars['f']
    f_block = compile_to_numba_ir(schm_func, {'np': np, }).blocks.popitem()[1]
    out_nodes += f_block.body[:-3]
    # last kept node assigns the schema array; remember its target variable
    schema_arr_var = out_nodes[-1].target
    scope = rhs.args[0].scope
    loc = rhs.args[0].loc
    col_items = []
    for i, cname in enumerate(col_names):
        # get column type from schema
        c_type = col_types[i]
        # create a variable for column and assign type
        varname = mk_unique_var(cname)
        cvar = ir.Var(scope, varname, loc)
        col_items.append((cname, cvar))
        out_nodes += get_column_read_nodes(c_type, cvar, xe_connect_var,
                                           xe_dset_var, i, schema_arr_var)
    # we need to close in the URI case since we opened the connection/dataset
    if len(rhs.args) == 1:
        out_nodes += gen_close_xenon(xe_connect_var, xe_dset_var)
    return col_items, out_nodes