def _add_offset_to_slice(self, slice_var, offset_var, out_nodes, scope, loc):
    if isinstance(slice_var, slice):
        # constant slice: generate a fresh function with the bounds inlined
        f_text = """def f(offset):
    return slice({} + offset, {} + offset)
""".format(slice_var.start, slice_var.stop)
        loc_vars = {}  # named loc_vars to avoid shadowing the `loc` argument
        exec(f_text, {}, loc_vars)
        f = loc_vars['f']
        args = [offset_var]
        arg_typs = (types.intp,)
    else:
        def f(old_slice, offset):
            return slice(old_slice.start + offset, old_slice.stop + offset)
        args = [slice_var, offset_var]
        slice_type = self.typemap[slice_var.name]
        arg_typs = (slice_type, types.intp,)
    _globals = self.func_ir.func_id.func.__globals__
    f_ir = compile_to_numba_ir(f, _globals, self.typingctx, arg_typs,
                               self.typemap, self.calltypes)
    _, block = f_ir.blocks.popitem()
    replace_arg_nodes(block, args)
    # the compiled body ends with a cast and a return; the cast's value
    # holds the new slice variable
    new_index = block.body[-2].value.value
    out_nodes.extend(block.body[:-2])  # ignore return nodes
    return new_index
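# A minimal, self-contained sketch of the exec() code-generation idiom used
# in _add_offset_to_slice (the helper name below is illustrative, not part of
# the sdc API): build source text for a tiny function, exec it into a fresh
# dict, and pull the function object out for later compilation.
def _make_offset_slice_func(start, stop):
    f_text = ("def f(offset):\n"
              "    return slice({} + offset, {} + offset)\n").format(start, stop)
    loc_vars = {}
    exec(f_text, {}, loc_vars)
    return loc_vars['f']

# _make_offset_slice_func(2, 5)(10) == slice(12, 15)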
def csv_distributed_run(csv_node, array_dists, typemap, calltypes, typingctx,
                        targetctx, dist_pass):
    parallel = False
    n_cols = len(csv_node.out_vars)
    # TODO: rebalance if output distributions are 1D instead of 1D_Var

    # get column variables
    arg_names = ", ".join("arr" + str(i) for i in range(n_cols))
    func_text = "def csv_impl(fname):\n"
    func_text += "  ({},) = _csv_reader_py(fname)\n".format(arg_names)
    # print(func_text)

    loc_vars = {}
    exec(func_text, {}, loc_vars)
    csv_impl = loc_vars['csv_impl']

    csv_reader_py = _gen_csv_reader_py(csv_node.df_colnames,
                                       csv_node.out_types, csv_node.usecols,
                                       csv_node.sep, typingctx, targetctx,
                                       parallel, csv_node.skiprows)

    f_block = compile_to_numba_ir(csv_impl,
                                  {'_csv_reader_py': csv_reader_py},
                                  typingctx, (string_type,),
                                  typemap, calltypes).blocks.popitem()[1]
    replace_arg_nodes(f_block, [csv_node.file_name])
    nodes = f_block.body[:-3]
    for i in range(len(csv_node.out_vars)):
        nodes[-len(csv_node.out_vars) + i].target = csv_node.out_vars[i]

    # get global array sizes by calling allreduce on chunk lens
    # TODO: get global size from C
    for arr in csv_node.out_vars:
        def f(A):
            return sdc.distributed_api.dist_reduce(len(A), np.int32(_op))
        f_block = compile_to_numba_ir(
            f, {'sdc': sdc, 'np': np,
                '_op': sdc.distributed_api.Reduce_Type.Sum.value},
            typingctx, (typemap[arr.name],),
            typemap, calltypes).blocks.popitem()[1]
        replace_arg_nodes(f_block, [arr])
        nodes += f_block.body[:-2]
        size_var = nodes[-1].target
        dist_pass._array_sizes[arr.name] = [size_var]
        out, start_var, end_var = dist_pass._gen_1D_div(
            size_var, arr.scope, csv_node.loc, "$alloc", "get_node_portion",
            sdc.distributed_api.get_node_portion)
        dist_pass._array_starts[arr.name] = [start_var]
        dist_pass._array_counts[arr.name] = [end_var]
        nodes += out

    return nodes
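# A hedged sketch of the 1D block partitioning that _gen_1D_div and
# get_node_portion model (the divmod formula here is the usual block
# distribution and is an assumption, not read from the sdc C code): each of
# n_pes ranks gets a contiguous chunk, with the remainder spread over the
# first ranks.
def _node_portion(total, n_pes, rank):
    base, rem = divmod(total, n_pes)
    return base + (1 if rank < rem else 0)

def _node_start(total, n_pes, rank):
    base, rem = divmod(total, n_pes)
    return rank * base + min(rank, rem)

# total=10, n_pes=3 -> portions (4, 3, 3), starts (0, 4, 7)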
def _get_stencil_start_ind(self, start_length, gen_nodes, scope, loc):
    if isinstance(start_length, int):
        return abs(min(start_length, 0))

    def get_start_ind(s_length):
        return abs(min(s_length, 0))

    f_ir = compile_to_numba_ir(get_start_ind, {}, self.typingctx,
                               (types.intp,), self.typemap, self.calltypes)
    assert len(f_ir.blocks) == 1
    block = f_ir.blocks.popitem()[1]
    replace_arg_nodes(block, [start_length])
    gen_nodes += block.body[:-2]
    ret_var = block.body[-2].value.value
    return ret_var
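# Quick check of the start-index rule above (plain Python, no Numba needed):
#     abs(min(-3, 0)) == 3   # negative stencil start: first 3 elements lack
#                            # a full neighborhood, so iteration starts at 3
#     abs(min(2, 0)) == 0    # non-negative start: no offset needed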
def get_column_read_nodes(c_type, cvar, arrow_readers_var, i):
    loc = cvar.loc
    func_text = 'def f(arrow_readers):\n'
    func_text += '  col_size = get_column_size_parquet(arrow_readers, {})\n'.format(i)
    # generate strings differently
    if c_type == string_type:
        # pass size for easier allocation and distributed analysis
        func_text += '  column = read_parquet_str(arrow_readers, {}, col_size)\n'.format(i)
    else:
        el_type = get_element_type(c_type)
        if el_type == repr(types.NPDatetime('ns')):
            func_text += '  column_tmp = np.empty(col_size, dtype=np.int64)\n'
            # TODO: fix alloc
            func_text += '  column = sdc.hiframes.api.ts_series_to_arr_typ(column_tmp)\n'
        else:
            func_text += '  column = np.empty(col_size, dtype=np.{})\n'.format(el_type)
        func_text += '  status = read_parquet(arrow_readers, {}, column, np.int32({}))\n'.format(
            i, _type_to_pq_dtype_number[el_type])

    loc_vars = {}
    exec(func_text, {'sdc': sdc, 'np': np}, loc_vars)
    size_func = loc_vars['f']

    _, f_block = compile_to_numba_ir(
        size_func, {'get_column_size_parquet': get_column_size_parquet,
                    'read_parquet': read_parquet,
                    'read_parquet_str': read_parquet_str,
                    'np': np,
                    'sdc': sdc,
                    'StringArray': StringArray}).blocks.popitem()

    replace_arg_nodes(f_block, [arrow_readers_var])
    out_nodes = f_block.body[:-3]
    # find the assignment that produced the column array and copy it to cvar
    for stmt in reversed(out_nodes):
        if stmt.target.name.startswith("column"):
            assign = ir.Assign(stmt.target, cvar, loc)
            break

    out_nodes.append(assign)
    return out_nodes
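# Example of the reader text this generates for a float64 column at index 2
# (instantiating the format strings above; <pq_dtype_no> stands for the value
# looked up in _type_to_pq_dtype_number, which is not reproduced here):
#     def f(arrow_readers):
#       col_size = get_column_size_parquet(arrow_readers, 2)
#       column = np.empty(col_size, dtype=np.float64)
#       status = read_parquet(arrow_readers, 2, column, np.int32(<pq_dtype_no>))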
def _gen_rebalances(self, rebalance_arrs, blocks):
    for block in blocks.values():
        new_body = []
        for inst in block.body:
            # TODO: handle hiframes filter etc.
            if isinstance(inst, Parfor):
                self._gen_rebalances(rebalance_arrs, {0: inst.init_block})
                self._gen_rebalances(rebalance_arrs, inst.loop_body)
            if isinstance(inst, ir.Assign) and inst.target.name in rebalance_arrs:
                out_arr = inst.target
                self.func_ir._definitions[out_arr.name].remove(inst.value)
                # hold inst results in tmp array
                tmp_arr = ir.Var(out_arr.scope,
                                 mk_unique_var("rebalance_tmp"), out_arr.loc)
                self.typemap[tmp_arr.name] = self.typemap[out_arr.name]
                inst.target = tmp_arr
                nodes = [inst]

                def f(in_arr):  # pragma: no cover
                    out_a = sdc.distributed_api.rebalance_array(in_arr)

                f_block = compile_to_numba_ir(
                    f, {'sdc': sdc}, self.typingctx,
                    (self.typemap[tmp_arr.name],),
                    self.typemap, self.calltypes).blocks.popitem()[1]
                replace_arg_nodes(f_block, [tmp_arr])
                nodes += f_block.body[:-3]  # remove none return
                nodes[-1].target = out_arr
                # update definitions
                dummy_block = ir.Block(out_arr.scope, out_arr.loc)
                dummy_block.body = nodes
                build_definitions({0: dummy_block}, self.func_ir._definitions)
                new_body += nodes
            else:
                new_body.append(inst)
        block.body = new_body
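# Source-level effect of the rewrite above (illustrative):
#     A = some_expr(...)                                       # before
# becomes
#     $rebalance_tmp = some_expr(...)
#     A = sdc.distributed_api.rebalance_array($rebalance_tmp)
# so downstream users of A see an evenly redistributed array.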
def _handle_np_fromfile(assign, lhs, rhs):
    """translate np.fromfile() to native
    """
    # TODO: dtype in kws
    if len(rhs.args) != 2:  # pragma: no cover
        raise ValueError("np.fromfile(): file name and dtype expected")

    # FIXME: import here since hio has hdf5 which might not be available
    from .. import hio
    from .. import transport_seq as transport
    import llvmlite.binding as ll
    ll.add_symbol('get_file_size', transport.get_file_size)
    ll.add_symbol('file_read', hio.file_read)
    ll.add_symbol('file_read_parallel', transport.file_read_parallel)
    _fname = rhs.args[0]
    _dtype = rhs.args[1]

    def fromfile_impl(fname, dtype):
        size = get_file_size(fname)
        dtype_size = get_dtype_size(dtype)
        A = np.empty(size // dtype_size, dtype=dtype)
        file_read(fname, A, size)
        read_arr = A

    f_block = compile_to_numba_ir(
        fromfile_impl, {'np': np,
                        'get_file_size': get_file_size,
                        'file_read': file_read,
                        'get_dtype_size': get_dtype_size}).blocks.popitem()[1]
    replace_arg_nodes(f_block, [_fname, _dtype])
    nodes = f_block.body[:-3]  # remove none return
    nodes[-1].target = lhs
    return nodes
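# Usage context (illustrative): inside a function being compiled,
#     A = np.fromfile(fname, dtype)
# is replaced by the fromfile_impl nodes above, so the element count is
# computed as file_size // dtype_size and the buffer is filled by the
# native file_read instead of NumPy's Python-level reader.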
def gen_parquet_read(self, file_name, lhs):
    scope = file_name.scope
    loc = file_name.loc

    table_types = None
    # lhs is temporary and will possibly be assigned to user variable
    assert lhs.name.startswith('$')
    if (lhs.name in self.reverse_copies
            and self.reverse_copies[lhs.name] in self.locals):
        table_types = self.locals[self.reverse_copies[lhs.name]]
        self.locals.pop(self.reverse_copies[lhs.name])

    convert_types = {}
    # user-specified type conversion
    if (lhs.name in self.reverse_copies
            and (self.reverse_copies[lhs.name] + ':convert') in self.locals):
        convert_types = self.locals[self.reverse_copies[lhs.name] + ':convert']
        self.locals.pop(self.reverse_copies[lhs.name] + ':convert')

    if table_types is None:
        fname_def = guard(get_definition, self.func_ir, file_name)
        if (not isinstance(fname_def, (ir.Const, ir.Global, ir.FreeVar))
                or not isinstance(fname_def.value, str)):
            raise ValueError("Parquet schema not available")
        file_name_str = fname_def.value
        col_names, col_types = parquet_file_schema(file_name_str)
        # remove Pandas index if exists
        # TODO: handle index properly when indices are supported
        _rm_pd_index(col_names, col_types)
    else:
        col_names = list(table_types.keys())
        col_types = list(table_types.values())

    out_nodes = []

    # get arrow readers once
    def init_arrow_readers(fname):
        arrow_readers = get_arrow_readers(unicode_to_char_ptr(fname))

    f_block = compile_to_numba_ir(
        init_arrow_readers,
        {'get_arrow_readers': _get_arrow_readers,
         'unicode_to_char_ptr': unicode_to_char_ptr,
         }).blocks.popitem()[1]

    replace_arg_nodes(f_block, [file_name])
    out_nodes += f_block.body[:-3]
    arrow_readers_var = out_nodes[-1].target

    col_arrs = []
    for i, cname in enumerate(col_names):
        # get column type from schema
        c_type = col_types[i]
        if cname in convert_types:
            c_type = convert_types[cname].dtype

        # create a variable for column and assign type
        varname = mk_unique_var(cname)
        # self.locals[varname] = c_type
        cvar = ir.Var(scope, varname, loc)
        col_arrs.append(cvar)

        out_nodes += get_column_read_nodes(c_type, cvar, arrow_readers_var, i)

    # delete arrow readers
    def cleanup_arrow_readers(readers):
        s = del_arrow_readers(readers)

    f_block = compile_to_numba_ir(
        cleanup_arrow_readers,
        {'del_arrow_readers': _del_arrow_readers,
         }).blocks.popitem()[1]
    replace_arg_nodes(f_block, [arrow_readers_var])
    out_nodes += f_block.body[:-3]
    return col_names, col_arrs, out_nodes
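# Order of the nodes gen_parquet_read splices into the caller (illustrative):
#     arrow_readers = get_arrow_readers(unicode_to_char_ptr(fname))
#     column_0 = ...   # get_column_read_nodes for each column, in order
#     ...
#     del_arrow_readers(arrow_readers)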