def csv_distributed_run(csv_node, array_dists, typemap, calltypes, typingctx,
                        targetctx, dist_pass):
    parallel = True
    if sdc.config.config_transport_mpi:
        for v in csv_node.out_vars:
            if (array_dists[v.name] != distributed.Distribution.OneD
                    and array_dists[v.name] != distributed.Distribution.OneD_Var):
                parallel = False
    else:
        parallel = False

    n_cols = len(csv_node.out_vars)
    # TODO: rebalance if output distributions are 1D instead of 1D_Var
    # get column variables
    arg_names = ", ".join("arr" + str(i) for i in range(n_cols))
    func_text = "def csv_impl(fname):\n"
    func_text += "  ({},) = _csv_reader_py(fname)\n".format(arg_names)
    # print(func_text)

    loc_vars = {}
    exec(func_text, {}, loc_vars)
    csv_impl = loc_vars['csv_impl']

    csv_reader_py = _gen_csv_reader_py(
        csv_node.df_colnames, csv_node.out_types, csv_node.usecols,
        csv_node.sep, typingctx, targetctx, parallel, csv_node.skiprows)

    f_block = compile_to_numba_ir(
        csv_impl, {'_csv_reader_py': csv_reader_py}, typingctx,
        (string_type,), typemap, calltypes).blocks.popitem()[1]
    replace_arg_nodes(f_block, [csv_node.file_name])
    nodes = f_block.body[:-3]
    for i in range(len(csv_node.out_vars)):
        nodes[-len(csv_node.out_vars) + i].target = csv_node.out_vars[i]

    # get global array sizes by calling allreduce on chunk lens
    # TODO: get global size from C
    for arr in csv_node.out_vars:
        def f(A):
            return sdc.distributed_api.dist_reduce(len(A), np.int32(_op))

        f_block = compile_to_numba_ir(
            f, {'sdc': sdc, 'np': np,
                '_op': sdc.distributed_api.Reduce_Type.Sum.value},
            typingctx, (typemap[arr.name],), typemap,
            calltypes).blocks.popitem()[1]
        replace_arg_nodes(f_block, [arr])
        nodes += f_block.body[:-2]
        size_var = nodes[-1].target
        dist_pass._array_sizes[arr.name] = [size_var]
        out, start_var, end_var = dist_pass._gen_1D_div(
            size_var, arr.scope, csv_node.loc, "$alloc", "get_node_portion",
            sdc.distributed_api.get_node_portion)
        dist_pass._array_starts[arr.name] = [start_var]
        dist_pass._array_counts[arr.name] = [end_var]
        nodes += out

    return nodes
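# A minimal standalone sketch (hypothetical n_cols=3) of the csv_impl source
# that csv_distributed_run assembles before handing it to exec and
# compile_to_numba_ir; running it only builds and returns the generated text.
def _sketch_csv_impl_text():
    n_cols = 3
    arg_names = ", ".join("arr" + str(i) for i in range(n_cols))
    func_text = "def csv_impl(fname):\n"
    func_text += "  ({},) = _csv_reader_py(fname)\n".format(arg_names)
    return func_text

# print(_sketch_csv_impl_text()) yields:
# def csv_impl(fname):
#   (arr0, arr1, arr2,) = _csv_reader_py(fname)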
def _gen_rolling_init(self, win_size, func, center):
    nodes = []
    right_length = 0
    scope = win_size.scope
    loc = win_size.loc
    # note: third ir.Var argument is the location, not the scope
    right_length = ir.Var(scope, mk_unique_var('zero_var'), loc)
    nodes.append(ir.Assign(ir.Const(0, loc), right_length, win_size.loc))

    def f(w):
        return -w + 1
    f_block = compile_to_numba_ir(f, {}).blocks.popitem()[1]
    replace_arg_nodes(f_block, [win_size])
    nodes.extend(f_block.body[:-2])  # remove none return
    left_length = nodes[-1].target

    if center:
        def f(w):
            return -(w // 2)
        f_block = compile_to_numba_ir(f, {}).blocks.popitem()[1]
        replace_arg_nodes(f_block, [win_size])
        nodes.extend(f_block.body[:-2])  # remove none return
        left_length = nodes[-1].target

        def f(w):
            return (w // 2)
        f_block = compile_to_numba_ir(f, {}).blocks.popitem()[1]
        replace_arg_nodes(f_block, [win_size])
        nodes.extend(f_block.body[:-2])  # remove none return
        right_length = nodes[-1].target

    def f(a, b):
        return ((a, b),)
    f_block = compile_to_numba_ir(f, {}).blocks.popitem()[1]
    replace_arg_nodes(f_block, [left_length, right_length])
    nodes.extend(f_block.body[:-2])  # remove none return
    win_tuple = nodes[-1].target

    index_offsets = [right_length]
    if func == 'apply':
        index_offsets = [left_length]

    def f(a):
        return (a,)
    f_block = compile_to_numba_ir(f, {}).blocks.popitem()[1]
    replace_arg_nodes(f_block, index_offsets)
    nodes.extend(f_block.body[:-2])  # remove none return
    index_offsets = nodes[-1].target

    return index_offsets, win_tuple, nodes
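# Quick plain-Python check (hypothetical window size) of the window bounds
# _gen_rolling_init materializes as IR: a non-centered window spans
# [-w + 1, 0] and a centered window spans [-(w // 2), w // 2].
def _sketch_rolling_bounds(w, center):
    if center:
        return (-(w // 2), w // 2)
    return (-w + 1, 0)

assert _sketch_rolling_bounds(5, center=False) == (-4, 0)
assert _sketch_rolling_bounds(5, center=True) == (-2, 2)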
def _run_pd_DatetimeIndex(self, assign, lhs, rhs):
    """transform pd.DatetimeIndex() call with string array argument
    """
    kws = dict(rhs.kws)
    if 'data' in kws:
        data = kws['data']
        if len(rhs.args) != 0:  # pragma: no cover
            raise ValueError(
                "only data argument supported in pd.DatetimeIndex()")
    else:
        if len(rhs.args) != 1:  # pragma: no cover
            raise ValueError(
                "data argument in pd.DatetimeIndex() expected")
        data = rhs.args[0]

    def f(str_arr):
        numba.parfor.init_prange()
        n = len(str_arr)
        S = numba.unsafe.ndarray.empty_inferred((n,))
        for i in numba.parfor.internal_prange(n):
            S[i] = hpat.pd_timestamp_ext.parse_datetime_str(str_arr[i])
        ret = S

    f_ir = compile_to_numba_ir(
        f, {'hpat': hpat, 'numba': numba}, self.typingctx,
        (if_series_to_array_type(self.typemap[data.name]),),
        self.typemap, self.calltypes)
    topo_order = find_topo_order(f_ir.blocks)
    f_ir.blocks[topo_order[-1]].body[-4].target = lhs
    replace_arg_nodes(f_ir.blocks[topo_order[0]], [data])
    return f_ir.blocks
def _handle_np_fromfile(assign, lhs, rhs):
    """translate np.fromfile() to native
    """
    # TODO: dtype in kws
    if len(rhs.args) != 2:  # pragma: no cover
        raise ValueError("np.fromfile(): file name and dtype expected")

    # FIXME: import here since hio has hdf5 which might not be available
    from .. import hio
    import llvmlite.binding as ll
    ll.add_symbol('get_file_size', hio.get_file_size)
    ll.add_symbol('file_read', hio.file_read)
    ll.add_symbol('file_read_parallel', hio.file_read_parallel)
    _fname = rhs.args[0]
    _dtype = rhs.args[1]

    def fromfile_impl(fname, dtype):
        size = get_file_size(fname)
        dtype_size = get_dtype_size(dtype)
        A = np.empty(size // dtype_size, dtype=dtype)
        file_read(fname, A, size)
        read_arr = A

    f_block = compile_to_numba_ir(
        fromfile_impl,
        {'np': np, 'get_file_size': get_file_size,
         'file_read': file_read, 'get_dtype_size': get_dtype_size}
    ).blocks.popitem()[1]
    replace_arg_nodes(f_block, [_fname, _dtype])
    nodes = f_block.body[:-3]  # remove none return
    nodes[-1].target = lhs
    return nodes
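# NumPy-only sketch (hypothetical byte size) of the allocation arithmetic in
# the generated fromfile_impl: the element count is the file size in bytes
# divided by the dtype's item size.
import numpy as np

size = 80  # hypothetical get_file_size(fname) result, in bytes
dtype = np.dtype(np.float64)
A = np.empty(size // dtype.itemsize, dtype=dtype)
assert A.shape == (10,)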
def get_column_read_nodes(c_type, cvar, file_name, i):
    loc = cvar.loc
    func_text = ('def f(fname):\n'
                 '  col_size = get_column_size_parquet(fname, {})\n'.format(i))
    # generate strings differently
    if c_type == string_array_type:
        # pass size for easier allocation and distributed analysis
        func_text += '  column = read_parquet_str(fname, {}, col_size)\n'.format(i)
    else:
        el_type = get_element_type(c_type.dtype)
        func_text += '  column = np.empty(col_size, dtype=np.{})\n'.format(el_type)
        func_text += '  status = read_parquet(fname, {}, column, np.int32({}))\n'.format(
            i, _type_to_pq_dtype_number[el_type])
    loc_vars = {}
    exec(func_text, {}, loc_vars)
    size_func = loc_vars['f']
    _, f_block = compile_to_numba_ir(
        size_func,
        {'get_column_size_parquet': get_column_size_parquet,
         'read_parquet': read_parquet,
         'read_parquet_str': read_parquet_str,
         'np': np,
         'StringArray': StringArray}).blocks.popitem()
    replace_arg_nodes(f_block, [file_name])
    out_nodes = f_block.body[:-3]
    for stmt in reversed(out_nodes):
        if stmt.target.name.startswith("column"):
            assign = ir.Assign(stmt.target, cvar, loc)
            break
    out_nodes.append(assign)
    return out_nodes
def _handle_empty_like(self, assign, lhs, rhs):
    # B = empty_like(A) -> B = empty(len(A), dtype)
    in_arr = rhs.args[0]
    if self.typemap[in_arr.name].ndim == 1:
        # generate simpler len() for 1D case
        def f(_in_arr):  # pragma: no cover
            _alloc_size = len(_in_arr)
            _out_arr = np.empty(_alloc_size, _in_arr.dtype)
    else:
        def f(_in_arr):  # pragma: no cover
            _alloc_size = _in_arr.shape
            _out_arr = np.empty(_alloc_size, _in_arr.dtype)

    f_block = compile_to_numba_ir(
        f, {'np': np}, self.typingctx,
        (if_series_to_array_type(self.typemap[in_arr.name]),),
        self.typemap, self.calltypes).blocks.popitem()[1]
    replace_arg_nodes(f_block, [in_arr])
    nodes = f_block.body[:-3]  # remove none return
    nodes[-1].target = assign.target
    return nodes
def gen_init_xenon(address, dset_name):
    # TODO: support non-constant address/dset_name
    func_text = ('def f():\n'
                 '  connect_t = xe_connect(unicode_to_char_ptr("{}"))\n'.format(address))
    func_text += '  dset_t = xe_open(connect_t, unicode_to_char_ptr("{}"))\n'.format(dset_name)
    loc_vars = {}
    exec(func_text, {}, loc_vars)
    init_func = loc_vars['f']
    f_block = compile_to_numba_ir(
        init_func,
        {'xe_connect': xe_connect,
         'unicode_to_char_ptr': unicode_to_char_ptr,
         'xe_open': xe_open}).blocks.popitem()[1]

    connect_var = None
    dset_t_var = None
    out_nodes = f_block.body[:-3]
    for stmt in reversed(out_nodes):
        if stmt.target.name.startswith("connect_t"):
            connect_var = stmt.target
        if stmt.target.name.startswith("dset_t"):
            dset_t_var = stmt.target

    assert connect_var is not None and dset_t_var is not None
    return out_nodes, connect_var, dset_t_var
def _handle_str_contains(self, lhs, rhs, assign, call_table):
    fname = guard(find_callname, self.func_ir, rhs)
    if fname is None:
        return None
    if fname == ('str_contains_regex', 'hpat.hiframes_api'):
        comp_func = 'hpat.str_ext.contains_regex'
    elif fname == ('str_contains_noregex', 'hpat.hiframes_api'):
        comp_func = 'hpat.str_ext.contains_noregex'
    else:
        return None

    str_arr = rhs.args[0]
    pat = rhs.args[1]
    func_text = 'def f(str_arr, pat):\n'
    func_text += '  l = len(str_arr)\n'
    func_text += '  S = np.empty(l, dtype=np.bool_)\n'
    func_text += '  for i in numba.parfor.internal_prange(l):\n'
    func_text += '    S[i] = {}(str_arr[i], pat)\n'.format(comp_func)
    loc_vars = {}
    exec(func_text, {}, loc_vars)
    f = loc_vars['f']

    f_blocks = compile_to_numba_ir(
        f, {'numba': numba, 'np': np, 'hpat': hpat}, self.typingctx,
        (self.typemap[str_arr.name], self.typemap[pat.name]),
        self.typemap, self.calltypes).blocks
    replace_arg_nodes(f_blocks[min(f_blocks.keys())], [str_arr, pat])
    # replace call with result of parfor (S)
    # S is target of last statement in 1st block of f
    assign.value = f_blocks[min(f_blocks.keys())].body[-2].target
    return (f_blocks, [assign])
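# Plain-NumPy sketch (hypothetical data) of what the generated prange kernel
# in _handle_str_contains computes: an elementwise boolean "contains" over a
# string array, with Python's `in` standing in for the hpat.str_ext.contains_*
# primitives.
import numpy as np

str_arr = np.array(['foo', 'bar', 'food'], dtype=object)
pat = 'oo'
S = np.empty(len(str_arr), dtype=np.bool_)
for i in range(len(str_arr)):
    S[i] = pat in str_arr[i]  # stand-in for the contains_* primitive
assert S.tolist() == [True, False, True]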
def _handle_fix_df_array(self, lhs, rhs, assign, call_table):
    # arr = fix_df_array(col) -> arr = col if col is array
    if (rhs.op == 'call' and rhs.func.name in call_table
            and call_table[rhs.func.name] == ['fix_df_array', 'hiframes_api', hpat]
            and isinstance(self.typemap[rhs.args[0].name],
                           (types.Array, StringArrayType))):
        assign.value = rhs.args[0]
        return [assign]
    # arr = fix_rolling_array(col) -> arr = col if col is float array
    if (rhs.op == 'call' and rhs.func.name in call_table
            and call_table[rhs.func.name] == ['fix_rolling_array', 'hiframes_api', hpat]):
        in_arr = rhs.args[0]
        if isinstance(self.typemap[in_arr.name].dtype, types.Float):
            assign.value = rhs.args[0]
            return [assign]
        else:
            def f(column):
                a = column.astype(np.float64)
            f_block = compile_to_numba_ir(
                f, {'hpat': hpat, 'np': np}, self.typingctx,
                (self.typemap[in_arr.name],),
                self.typemap, self.calltypes).blocks.popitem()[1]
            replace_arg_nodes(f_block, [in_arr])
            nodes = f_block.body[:-3]
            nodes[-1].target = assign.target
            return nodes
    return None
def get_column_read_nodes(c_type, cvar, xe_connect_var, xe_dset_var, i,
                          schema_arr_var):
    loc = cvar.loc
    func_text = 'def f(xe_connect_var, xe_dset_var, schema_arr):\n'
    # note: append here; reassigning would discard the def line above
    func_text += ('  col_size = get_column_size_xenon(xe_connect_var, xe_dset_var, {})\n'
                  .format(i))
    # func_text += '  print(col_size)\n'
    # generate strings differently since upfront allocation is not possible
    if c_type == string_array_type:
        # pass size for easier allocation and distributed analysis
        func_text += ('  column = read_xenon_str(xe_connect_var, xe_dset_var, '
                      '{}, col_size, schema_arr)\n'.format(i))
    else:
        el_type = get_element_type(c_type.dtype)
        func_text += '  column = np.empty(col_size, dtype=np.{})\n'.format(el_type)
        func_text += ('  status = read_xenon_col(xe_connect_var, xe_dset_var, '
                      '{}, column, schema_arr)\n'.format(i))
    loc_vars = {}
    exec(func_text, {}, loc_vars)
    size_func = loc_vars['f']
    _, f_block = compile_to_numba_ir(
        size_func,
        {'get_column_size_xenon': get_column_size_xenon,
         'read_xenon_col': read_xenon_col,
         'read_xenon_str': read_xenon_str,
         'np': np}).blocks.popitem()
    replace_arg_nodes(f_block, [xe_connect_var, xe_dset_var, schema_arr_var])
    out_nodes = f_block.body[:-3]
    for stmt in reversed(out_nodes):
        if isinstance(stmt, ir.Assign) and stmt.target.name.startswith("column"):
            assign = ir.Assign(stmt.target, cvar, loc)
            break
    out_nodes.append(assign)
    return out_nodes
def _handle_empty_like(self, lhs, rhs, assign, call_table):
    # B = empty_like(A) -> B = empty(len(A), dtype)
    if (rhs.op == 'call' and rhs.func.name in call_table
            and call_table[rhs.func.name] == ['empty_like', np]):
        in_arr = rhs.args[0]
        if self.typemap[in_arr.name].ndim == 1:
            # generate simpler len() for 1D case
            def f(_in_arr):  # pragma: no cover
                _alloc_size = len(_in_arr)
                _out_arr = np.empty(_alloc_size, _in_arr.dtype)
        else:
            def f(_in_arr):  # pragma: no cover
                _alloc_size = _in_arr.shape
                _out_arr = np.empty(_alloc_size, _in_arr.dtype)

        f_block = compile_to_numba_ir(
            f, {'np': np}, self.typingctx,
            (self.typemap[in_arr.name],),
            self.typemap, self.calltypes).blocks.popitem()[1]
        replace_arg_nodes(f_block, [in_arr])
        nodes = f_block.body[:-3]  # remove none return
        nodes[-1].target = assign.target
        return nodes
    return None
def _handle_str_contains(self, lhs, rhs):
    """Handle string contains like:
    B = df.column.str.contains('oo*', regex=True)
    """
    func_def = guard(get_definition, self.func_ir, rhs.func)
    assert func_def is not None
    # rare case where function variable is assigned to a new variable
    if isinstance(func_def, ir.Var):
        rhs.func = func_def
        return self._handle_str_contains(lhs, rhs)
    str_col = guard(self._get_str_contains_col, func_def)
    if str_col is None:
        return None
    kws = dict(rhs.kws)
    pat = rhs.args[0]
    regex = True  # default regex arg is True
    if 'regex' in kws:
        regex = get_constant(self.func_ir, kws['regex'], regex)
    if regex:
        def f(str_arr, pat):
            hpat.hiframes_api.str_contains_regex(str_arr, pat)
    else:
        def f(str_arr, pat):
            hpat.hiframes_api.str_contains_noregex(str_arr, pat)

    f_block = compile_to_numba_ir(f, {'hpat': hpat}).blocks.popitem()[1]
    replace_arg_nodes(f_block, [str_col, pat])
    nodes = f_block.body[:-3]  # remove none return
    nodes[-1].target = lhs
    return nodes
def _handle_df_col_filter(self, lhs_name, rhs, assign):
    # find df['col2'] = df['col1'][arr]
    # since columns should have the same size, output is filled with NaNs
    # TODO: check for float, make sure col1 and col2 are in the same df
    if (rhs.op == 'getitem'
            and rhs.value.name in self.df_cols
            and lhs_name in self.df_cols
            and self.is_bool_arr(rhs.index.name)):
        lhs = assign.target
        in_arr = rhs.value
        index_var = rhs.index

        f_blocks = compile_to_numba_ir(
            _column_filter_impl_float,
            {'numba': numba, 'np': np}, self.typingctx,
            (self.typemap[lhs.name], self.typemap[in_arr.name],
             self.typemap[index_var.name]),
            self.typemap, self.calltypes).blocks
        first_block = min(f_blocks.keys())
        replace_arg_nodes(f_blocks[first_block], [lhs, in_arr, index_var])
        alloc_nodes = gen_np_call('empty_like', np.empty_like, lhs, [in_arr],
                                  self.typingctx, self.typemap, self.calltypes)
        f_blocks[first_block].body = alloc_nodes + f_blocks[first_block].body
        return f_blocks
def gen_stencil_call(in_arr, out_arr, kernel_func, index_offsets, fir_globals,
                     other_args=None, options=None):
    if other_args is None:
        other_args = []
    if options is None:
        options = {}
    if index_offsets != [0]:
        options['index_offsets'] = index_offsets
    scope = in_arr.scope
    loc = in_arr.loc
    stencil_nodes = []
    stencil_nodes += gen_empty_like(in_arr, out_arr)

    # note: third ir.Var argument is the location, not the scope
    kernel_var = ir.Var(scope, mk_unique_var("kernel_var"), loc)
    if not isinstance(kernel_func, ir.Expr):
        kernel_func = ir.Expr.make_function(
            "kernel", kernel_func.__code__, kernel_func.__closure__,
            kernel_func.__defaults__, loc)
    stencil_nodes.append(ir.Assign(kernel_func, kernel_var, loc))

    def f(A, B, f):
        numba.stencil(f)(A, out=B)
    f_block = compile_to_numba_ir(f, {'numba': numba}).blocks.popitem()[1]
    replace_arg_nodes(f_block, [in_arr, out_arr, kernel_var])
    stencil_nodes += f_block.body[:-3]  # remove none return
    setup_call = stencil_nodes[-2].value
    stencil_call = stencil_nodes[-1].value
    setup_call.kws = list(options.items())
    stencil_call.args += other_args
    return stencil_nodes
def _gen_column_shift_pct(self, out_var, args, col_var, func):
    loc = col_var.loc
    if func == 'pct_change':
        shift_const = 1
        if args:
            shift_const = get_constant(self.func_ir, args[0])
            assert shift_const is not NOT_CONSTANT
        func_text = 'def g(a):\n  return (a[0]-a[{}])/a[{}]\n'.format(
            -shift_const, -shift_const)
    else:
        assert func == 'shift'
        shift_const = get_constant(self.func_ir, args[0])
        assert shift_const is not NOT_CONSTANT
        func_text = 'def g(a):\n  return a[{}]\n'.format(-shift_const)

    loc_vars = {}
    exec(func_text, {}, loc_vars)
    kernel_func = loc_vars['g']

    index_offsets = [0]
    fir_globals = self.func_ir.func_id.func.__globals__
    stencil_nodes = gen_stencil_call(col_var, out_var, kernel_func,
                                     index_offsets, fir_globals)

    border_text = 'def f(A):\n  A[0:{}] = np.nan\n'.format(shift_const)
    loc_vars = {}
    exec(border_text, {}, loc_vars)
    border_func = loc_vars['f']

    f_blocks = compile_to_numba_ir(border_func, {'np': np}).blocks
    block = f_blocks[min(f_blocks.keys())]
    replace_arg_nodes(block, [out_var])
    setitem_nodes = block.body[:-3]  # remove none return

    return stencil_nodes + setitem_nodes
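# Standalone sketch (hypothetical shift_const=1) of the two kernel sources
# _gen_column_shift_pct generates: the shift kernel reads the element
# shift_const positions back relative to the stencil's current index, and the
# pct_change kernel computes the relative difference against that element.
shift_const = 1
pct_text = 'def g(a):\n  return (a[0]-a[{}])/a[{}]\n'.format(-shift_const, -shift_const)
shift_text = 'def g(a):\n  return a[{}]\n'.format(-shift_const)
assert shift_text == 'def g(a):\n  return a[-1]\n'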
def _gen_col_describe(self, out_var, args, col_var):
    def f(A):
        a_count = hpat.hiframes_api.count(A)
        a_min = np.min(A)
        a_max = np.max(A)
        a_mean = hpat.hiframes_api.mean(A)
        a_std = hpat.hiframes_api.var(A)**0.5
        q25 = hpat.hiframes_api.quantile(A, .25)
        q50 = hpat.hiframes_api.quantile(A, .5)
        q75 = hpat.hiframes_api.quantile(A, .75)
        s = "count " + str(a_count) + "\n"\
            "mean " + str(a_mean) + "\n"\
            "std " + str(a_std) + "\n"\
            "min " + str(a_min) + "\n"\
            "25% " + str(q25) + "\n"\
            "50% " + str(q50) + "\n"\
            "75% " + str(q75) + "\n"\
            "max " + str(a_max) + "\n"

    f_block = compile_to_numba_ir(
        f, {'hpat': hpat, 'np': np}).blocks.popitem()[1]
    replace_arg_nodes(f_block, [col_var])
    nodes = f_block.body[:-3]  # remove none return
    nodes[-1].target = out_var
    return nodes
def _add_offset_to_slice(self, slice_var, offset_var, out_nodes, scope, loc):
    if isinstance(slice_var, slice):
        f_text = """def f(offset):
    return slice({} + offset, {} + offset)
""".format(slice_var.start, slice_var.stop)
        loc_vars = {}
        exec_(f_text, {}, loc_vars)
        f = loc_vars['f']
        args = [offset_var]
        arg_typs = (types.intp,)
    else:
        def f(old_slice, offset):
            return slice(old_slice.start + offset, old_slice.stop + offset)
        args = [slice_var, offset_var]
        slice_type = self.typemap[slice_var.name]
        arg_typs = (slice_type, types.intp,)

    _globals = self.func_ir.func_id.func.__globals__
    f_ir = compile_to_numba_ir(f, _globals, self.typingctx, arg_typs,
                               self.typemap, self.calltypes)
    _, block = f_ir.blocks.popitem()
    replace_arg_nodes(block, args)
    new_index = block.body[-2].value.value
    out_nodes.extend(block.body[:-2])  # ignore return nodes
    return new_index
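# Standalone sketch (hypothetical constant slice(2, 7)) of the first branch:
# the generated f adds a runtime offset to both slice bounds.
f_text = """def f(offset):
    return slice({} + offset, {} + offset)
""".format(2, 7)
loc_vars = {}
exec(f_text, {}, loc_vars)
assert loc_vars['f'](10) == slice(12, 17)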
def _copy_array_nodes(var, nodes, typingctx, typemap, calltypes):
    def _impl(arr):
        return arr.copy()

    f_block = compile_to_numba_ir(
        _impl, {}, typingctx, (typemap[var.name],),
        typemap, calltypes).blocks.popitem()[1]
    replace_arg_nodes(f_block, [var])
    nodes += f_block.body[:-2]
    return nodes[-1].target
def _gen_col_var(self, out_var, args, col_var):
    def f(A):  # pragma: no cover
        s = hpat.hiframes_api.var(A)

    f_block = compile_to_numba_ir(f, {'hpat': hpat}).blocks.popitem()[1]
    replace_arg_nodes(f_block, [col_var])
    nodes = f_block.body[:-3]  # remove none return
    nodes[-1].target = out_var
    return nodes
def _gen_col_quantile(self, out_var, args, col_var):
    def f(A, q):
        s = hpat.hiframes_api.quantile(A, q)

    f_block = compile_to_numba_ir(f, {'hpat': hpat}).blocks.popitem()[1]
    replace_arg_nodes(f_block, [col_var, args[0]])
    nodes = f_block.body[:-3]  # remove none return
    nodes[-1].target = out_var
    return nodes
def gen_close_xenon(connect_var, dset_t_var):
    def close_func(connect_var, dset_t_var):
        s = xe_close(connect_var, dset_t_var)

    f_block = compile_to_numba_ir(
        close_func, {'xe_close': xe_close}).blocks.popitem()[1]
    replace_arg_nodes(f_block, [connect_var, dset_t_var])
    out_nodes = f_block.body[:-3]
    return out_nodes
def _get_stencil_start_ind(self, start_length, gen_nodes, scope, loc):
    if isinstance(start_length, int):
        return abs(min(start_length, 0))

    def get_start_ind(s_length):
        return abs(min(s_length, 0))

    f_ir = compile_to_numba_ir(get_start_ind, {}, self.typingctx,
                               (types.intp,), self.typemap, self.calltypes)
    assert len(f_ir.blocks) == 1
    block = f_ir.blocks.popitem()[1]
    replace_arg_nodes(block, [start_length])
    gen_nodes += block.body[:-2]
    ret_var = block.body[-2].value.value
    return ret_var
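# The compiled get_start_ind mirrors the constant branch above: abs(min(s, 0))
# turns a negative left window bound into a start index and clamps
# non-negative bounds to zero. A couple of plain-Python spot checks:
assert abs(min(-3, 0)) == 3  # window reaching 3 elements back starts at index 3
assert abs(min(4, 0)) == 0   # forward-only window starts at index 0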
def _fix_rolling_array(self, col_var, func):
    """
    for integers and bools, the output should be converted to float64
    """
    # TODO: check all possible funcs
    def f(arr):
        df_arr = hpat.hiframes_api.fix_rolling_array(arr)

    f_block = compile_to_numba_ir(f, {'hpat': hpat}).blocks.popitem()[1]
    replace_arg_nodes(f_block, [col_var])
    nodes = f_block.body[:-3]  # remove none return
    new_col_var = nodes[-1].target
    return new_col_var, nodes
def _fix_df_arrays(self, items_list):
    nodes = []
    new_list = []
    for item in items_list:
        col_varname = item[0]
        col_arr = item[1]

        def f(arr):
            df_arr = hpat.hiframes_api.fix_df_array(arr)
        f_block = compile_to_numba_ir(f, {'hpat': hpat}).blocks.popitem()[1]
        replace_arg_nodes(f_block, [col_arr])
        nodes += f_block.body[:-3]  # remove none return
        new_col_arr = nodes[-1].target
        new_list.append((col_varname, new_col_arr))
    return nodes, new_list
def _gen_col_std(self, out_var, args, col_var):
    loc = out_var.loc
    scope = out_var.scope
    # calculate var() first
    var_var = ir.Var(scope, mk_unique_var("var_val"), loc)
    v_nodes = self._gen_col_var(var_var, args, col_var)

    def f(a):
        a**0.5
    s_block = compile_to_numba_ir(f, {}).blocks.popitem()[1]
    replace_arg_nodes(s_block, [var_var])
    s_nodes = s_block.body[:-3]
    assert len(s_nodes) == 3
    s_nodes[-1].target = out_var
    return v_nodes + s_nodes
def gen_parquet_read(self, file_name):
    import pyarrow.parquet as pq
    fname_def = guard(get_definition, self.func_ir, file_name)
    if isinstance(fname_def, ir.Const):
        assert isinstance(fname_def.value, str)
        file_name_str = fname_def.value
        col_names, col_types = parquet_file_schema(file_name_str)
        scope = file_name.scope
        loc = file_name.loc
        out_nodes = []
        col_items = []
        for i, cname in enumerate(col_names):
            # get column type from schema
            c_type = col_types[i]
            # create a variable for column and assign type
            varname = mk_unique_var(cname)
            self.locals[varname] = c_type
            cvar = ir.Var(scope, varname, loc)
            col_items.append((cname, cvar))

            size_func_text = ('def f():\n'
                              '  col_size = get_column_size_parquet("{}", {})\n'
                              .format(file_name_str, i))
            size_func_text += '  column = np.empty(col_size, dtype=np.{})\n'.format(
                c_type.dtype)
            size_func_text += '  status = read_parquet("{}", {}, column)\n'.format(
                file_name_str, i)
            loc_vars = {}
            exec(size_func_text, {}, loc_vars)
            size_func = loc_vars['f']
            _, f_block = compile_to_numba_ir(
                size_func,
                {'get_column_size_parquet': get_column_size_parquet,
                 'read_parquet': read_parquet,
                 'np': np}).blocks.popitem()
            out_nodes += f_block.body[:-3]
            # out_nodes accumulates nodes for all columns, so search backwards
            # for the most recent "column" assignment
            for stmt in reversed(out_nodes):
                if stmt.target.name.startswith("column"):
                    assign = ir.Assign(stmt.target, cvar, loc)
                    break
            out_nodes.append(assign)

        return col_items, out_nodes
    raise ValueError("Parquet schema not available")
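# Standalone sketch (hypothetical file name, column 0, float64 dtype) of the
# per-column reader source gen_parquet_read assembles; running it only builds
# the text.
size_func_text = ('def f():\n'
                  '  col_size = get_column_size_parquet("{}", {})\n'
                  .format("data.parquet", 0))
size_func_text += '  column = np.empty(col_size, dtype=np.{})\n'.format('float64')
size_func_text += '  status = read_parquet("{}", {}, column)\n'.format("data.parquet", 0)
# print(size_func_text)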
def _handle_string_array_expr(self, lhs, rhs, assign):
    # convert str_arr==str into parfor
    if (rhs.op == 'binop'
            and rhs.fn in ['==', '!=', '>=', '>', '<=', '<']
            and (is_str_arr_typ(self.typemap[rhs.lhs.name])
                 or is_str_arr_typ(self.typemap[rhs.rhs.name]))):
        arg1 = rhs.lhs
        arg2 = rhs.rhs
        arg1_access = 'A'
        arg2_access = 'B'
        len_call = 'len(A)'
        if is_str_arr_typ(self.typemap[arg1.name]):
            arg1_access = 'A[i]'
            # replace type now for correct typing of len, etc.
            self.typemap.pop(arg1.name)
            self.typemap[arg1.name] = string_array_type
        if is_str_arr_typ(self.typemap[arg2.name]):
            arg2_access = 'B[i]'
            len_call = 'len(B)'
            self.typemap.pop(arg2.name)
            self.typemap[arg2.name] = string_array_type

        func_text = 'def f(A, B):\n'
        func_text += '  l = {}\n'.format(len_call)
        func_text += '  S = np.empty(l, dtype=np.bool_)\n'
        func_text += '  for i in numba.parfor.internal_prange(l):\n'
        func_text += '    S[i] = {} {} {}\n'.format(arg1_access, rhs.fn, arg2_access)

        loc_vars = {}
        exec(func_text, {}, loc_vars)
        f = loc_vars['f']

        f_blocks = compile_to_numba_ir(
            f, {'numba': numba, 'np': np}, self.typingctx,
            (if_series_to_array_type(self.typemap[arg1.name]),
             if_series_to_array_type(self.typemap[arg2.name])),
            self.typemap, self.calltypes).blocks
        replace_arg_nodes(f_blocks[min(f_blocks.keys())], [arg1, arg2])
        # replace == expression with result of parfor (S)
        # S is target of last statement in 1st block of f
        assign.value = f_blocks[min(f_blocks.keys())].body[-2].target
        return (f_blocks, [assign])

    return None
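# Plain-NumPy sketch (hypothetical operands) of what the generated comparison
# kernel computes when only the left operand is a string array: A[i] == B for
# every i, yielding a boolean array.
import numpy as np

A = np.array(['a', 'b', 'a'], dtype=object)
B = 'a'
S = np.empty(len(A), dtype=np.bool_)
for i in range(len(A)):
    S[i] = A[i] == B
assert S.tolist() == [True, False, True]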
def handle_possible_h5_read(self, assign, lhs, rhs):
    tp = self._get_h5_type(lhs, rhs)
    if tp is not None:
        dtype_str = str(tp.dtype)
        func_text = "def _h5_read_impl(dset, index):\n"
        # TODO: index arg?
        func_text += "  arr = hpat.io.pio_api.h5_read_dummy(dset, {}, '{}', index)\n".format(
            tp.ndim, dtype_str)
        loc_vars = {}
        exec(func_text, {}, loc_vars)
        _h5_read_impl = loc_vars['_h5_read_impl']
        f_block = compile_to_numba_ir(
            _h5_read_impl, {'hpat': hpat}).blocks.popitem()[1]
        index_var = rhs.index if rhs.op == 'getitem' else rhs.index_var
        replace_arg_nodes(f_block, [rhs.value, index_var])
        nodes = f_block.body[:-3]  # remove none return
        nodes[-1].target = assign.target
        return nodes
    return None
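# Standalone sketch (hypothetical 2-D float64 dataset) of the reader source
# handle_possible_h5_read generates; exec'ing the text is safe without hpat
# installed because the body only references hpat when the function is called.
ndim, dtype_str = 2, 'float64'
func_text = "def _h5_read_impl(dset, index):\n"
func_text += "  arr = hpat.io.pio_api.h5_read_dummy(dset, {}, '{}', index)\n".format(
    ndim, dtype_str)
loc_vars = {}
exec(func_text, {}, loc_vars)
assert callable(loc_vars['_h5_read_impl'])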
def get_column_read_nodes(c_type, cvar, arrow_readers_var, i):
    loc = cvar.loc
    func_text = 'def f(arrow_readers):\n'
    func_text += '  col_size = get_column_size_parquet(arrow_readers, {})\n'.format(i)
    # generate strings differently
    if c_type == string_type:
        # pass size for easier allocation and distributed analysis
        func_text += '  column = read_parquet_str(arrow_readers, {}, col_size)\n'.format(i)
    else:
        el_type = get_element_type(c_type)
        if el_type == repr(types.NPDatetime('ns')):
            func_text += '  column_tmp = np.empty(col_size, dtype=np.int64)\n'  # TODO: fix alloc
            func_text += '  column = sdc.hiframes.api.ts_series_to_arr_typ(column_tmp)\n'
        else:
            func_text += '  column = np.empty(col_size, dtype=np.{})\n'.format(el_type)
        func_text += '  status = read_parquet(arrow_readers, {}, column, np.int32({}))\n'.format(
            i, _type_to_pq_dtype_number[el_type])
    loc_vars = {}
    exec(func_text, {'sdc': sdc, 'np': np}, loc_vars)
    size_func = loc_vars['f']
    _, f_block = compile_to_numba_ir(
        size_func,
        {'get_column_size_parquet': get_column_size_parquet,
         'read_parquet': read_parquet,
         'read_parquet_str': read_parquet_str,
         'np': np,
         'sdc': sdc,
         'StringArray': StringArray}).blocks.popitem()
    replace_arg_nodes(f_block, [arrow_readers_var])
    out_nodes = f_block.body[:-3]
    for stmt in reversed(out_nodes):
        if stmt.target.name.startswith("column"):
            assign = ir.Assign(stmt.target, cvar, loc)
            break
    out_nodes.append(assign)
    return out_nodes
def _gen_rebalances(self, rebalance_arrs, blocks):
    for block in blocks.values():
        new_body = []
        for inst in block.body:
            # TODO: handle hiframes filter etc.
            if isinstance(inst, Parfor):
                self._gen_rebalances(rebalance_arrs, {0: inst.init_block})
                self._gen_rebalances(rebalance_arrs, inst.loop_body)
            if isinstance(inst, ir.Assign) and inst.target.name in rebalance_arrs:
                out_arr = inst.target
                self.func_ir._definitions[out_arr.name].remove(inst.value)
                # hold inst results in tmp array
                tmp_arr = ir.Var(out_arr.scope,
                                 mk_unique_var("rebalance_tmp"), out_arr.loc)
                self.typemap[tmp_arr.name] = self.typemap[out_arr.name]
                inst.target = tmp_arr
                nodes = [inst]

                def f(in_arr):  # pragma: no cover
                    out_a = hpat.distributed_api.rebalance_array(in_arr)
                f_block = compile_to_numba_ir(
                    f, {'hpat': hpat}, self.typingctx,
                    (self.typemap[tmp_arr.name],),
                    self.typemap, self.calltypes).blocks.popitem()[1]
                replace_arg_nodes(f_block, [tmp_arr])
                nodes += f_block.body[:-3]  # remove none return
                nodes[-1].target = out_arr
                # update definitions
                dumm_block = ir.Block(out_arr.scope, out_arr.loc)
                dumm_block.body = nodes
                build_definitions({0: dumm_block}, self.func_ir._definitions)
                new_body += nodes
            else:
                new_body.append(inst)
        block.body = new_body