Example #1
def csv_distributed_run(csv_node, array_dists, typemap, calltypes, typingctx,
                        targetctx, dist_pass):
    parallel = False

    n_cols = len(csv_node.out_vars)
    # TODO: rebalance if output distributions are 1D instead of 1D_Var
    # get column variables
    arg_names = ", ".join("arr" + str(i) for i in range(n_cols))
    func_text = "def csv_impl(fname):\n"
    func_text += "    ({},) = _csv_reader_py(fname)\n".format(arg_names)
    # print(func_text)

    loc_vars = {}
    exec(func_text, {}, loc_vars)
    csv_impl = loc_vars['csv_impl']

    csv_reader_py = _gen_csv_reader_py(csv_node.df_colnames,
                                       csv_node.out_types, csv_node.usecols,
                                       csv_node.sep, typingctx, targetctx,
                                       parallel, csv_node.skiprows)

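    # compile the generated wrapper to Numba IR; the only typed argument
    # is the file name (string_type)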
    f_block = compile_to_numba_ir(csv_impl, {
        '_csv_reader_py': csv_reader_py
    }, typingctx, (string_type, ), typemap, calltypes).blocks.popitem()[1]
    replace_arg_nodes(f_block, [csv_node.file_name])
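    # drop the trailing const-None/cast/return statements; the last n_cols
    # assignments produce the column arrays, so retarget them to the CSV
    # node's output variables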
    nodes = f_block.body[:-3]
    for i in range(len(csv_node.out_vars)):
        nodes[-len(csv_node.out_vars) + i].target = csv_node.out_vars[i]

    # get global array sizes by calling allreduce on chunk lens
    # TODO: get global size from C
    for arr in csv_node.out_vars:

        def f(A):
            return sdc.distributed_api.dist_reduce(len(A), np.int32(_op))

        f_block = compile_to_numba_ir(
            f, {
                'sdc': sdc,
                'np': np,
                '_op': sdc.distributed_api.Reduce_Type.Sum.value
            }, typingctx, (typemap[arr.name], ), typemap,
            calltypes).blocks.popitem()[1]
        replace_arg_nodes(f_block, [arr])
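        # keep all but the cast and return; the final assignment's target
        # holds the global (reduced) length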
        nodes += f_block.body[:-2]
        size_var = nodes[-1].target
        dist_pass._array_sizes[arr.name] = [size_var]
        out, start_var, end_var = dist_pass._gen_1D_div(
            size_var, arr.scope, csv_node.loc, "$alloc", "get_node_portion",
            sdc.distributed_api.get_node_portion)
        dist_pass._array_starts[arr.name] = [start_var]
        dist_pass._array_counts[arr.name] = [end_var]
        nodes += out

    return nodes
Example #2
    def _add_offset_to_slice(self, slice_var, offset_var, out_nodes, scope,
                             loc):
        if isinstance(slice_var, slice):
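            # constant slice: bake start/stop into the generated source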
            f_text = """def f(offset):
                return slice({} + offset, {} + offset)
            """.format(slice_var.start, slice_var.stop)
            loc_vars = {}  # renamed so the `loc` parameter is not shadowed
            exec(f_text, {}, loc_vars)
            f = loc_vars['f']
            args = [offset_var]
            arg_typs = (types.intp, )
        else:
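            # slice held in a variable: shift its bounds at runtime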

            def f(old_slice, offset):
                return slice(old_slice.start + offset, old_slice.stop + offset)

            args = [slice_var, offset_var]
            slice_type = self.typemap[slice_var.name]
            arg_typs = (
                slice_type,
                types.intp,
            )
        _globals = self.func_ir.func_id.func.__globals__
        f_ir = compile_to_numba_ir(f, _globals, self.typingctx, arg_typs,
                                   self.typemap, self.calltypes)
        _, block = f_ir.blocks.popitem()
        replace_arg_nodes(block, args)
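        # body[-2] is the cast assignment feeding the return; its operand
        # is the variable holding the new, offset slice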
        new_index = block.body[-2].value.value
        out_nodes.extend(block.body[:-2])  # ignore return nodes
        return new_index
Example #3
    def _get_stencil_start_ind(self, start_length, gen_nodes, scope, loc):
        if isinstance(start_length, int):
            return abs(min(start_length, 0))

        def get_start_ind(s_length):
            return abs(min(s_length, 0))

        f_ir = compile_to_numba_ir(get_start_ind, {}, self.typingctx,
                                   (types.intp,), self.typemap, self.calltypes)
        assert len(f_ir.blocks) == 1
        block = f_ir.blocks.popitem()[1]
        replace_arg_nodes(block, [start_length])
        gen_nodes += block.body[:-2]
        ret_var = block.body[-2].value.value
        return ret_var
Example #4
def get_column_read_nodes(c_type, cvar, arrow_readers_var, i):

    loc = cvar.loc

    func_text = 'def f(arrow_readers):\n'
    func_text += '  col_size = get_column_size_parquet(arrow_readers, {})\n'.format(
        i)
    # generate strings differently
    if c_type == string_type:
        # pass size for easier allocation and distributed analysis
        func_text += '  column = read_parquet_str(arrow_readers, {}, col_size)\n'.format(
            i)
    else:
        el_type = get_element_type(c_type)
        if el_type == repr(types.NPDatetime('ns')):
            func_text += '  column_tmp = np.empty(col_size, dtype=np.int64)\n'
            # TODO: fix alloc
            func_text += '  column = sdc.hiframes.api.ts_series_to_arr_typ(column_tmp)\n'
        else:
            func_text += '  column = np.empty(col_size, dtype=np.{})\n'.format(
                el_type)
        func_text += '  status = read_parquet(arrow_readers, {}, column, np.int32({}))\n'.format(
            i, _type_to_pq_dtype_number[el_type])

    loc_vars = {}
    exec(func_text, {'sdc': sdc, 'np': np}, loc_vars)
    size_func = loc_vars['f']
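    # compiled untyped (no typingctx or argument types), unlike the examples
    # above; typing happens later in the pipeline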
    _, f_block = compile_to_numba_ir(
        size_func, {
            'get_column_size_parquet': get_column_size_parquet,
            'read_parquet': read_parquet,
            'read_parquet_str': read_parquet_str,
            'np': np,
            'sdc': sdc,
            'StringArray': StringArray
        }).blocks.popitem()

    replace_arg_nodes(f_block, [arrow_readers_var])
    out_nodes = f_block.body[:-3]
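    # locate the last assignment to the generated column array and alias
    # its value to cvar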
    for stmt in reversed(out_nodes):
        if stmt.target.name.startswith("column"):
            assign = ir.Assign(stmt.target, cvar, loc)
            break

    out_nodes.append(assign)
    return out_nodes
Example #5
    def _gen_rebalances(self, rebalance_arrs, blocks):
        # replace assignments to arrays that need rebalancing: hold the
        # result in a temporary, then call rebalance_array() on it
        for block in blocks.values():
            new_body = []
            for inst in block.body:
                # TODO: handle hiframes filter etc.
                if isinstance(inst, Parfor):
                    self._gen_rebalances(rebalance_arrs, {0: inst.init_block})
                    self._gen_rebalances(rebalance_arrs, inst.loop_body)
                if isinstance(
                        inst,
                        ir.Assign) and inst.target.name in rebalance_arrs:
                    out_arr = inst.target
                    self.func_ir._definitions[out_arr.name].remove(inst.value)
                    # hold inst results in tmp array
                    tmp_arr = ir.Var(out_arr.scope,
                                     mk_unique_var("rebalance_tmp"),
                                     out_arr.loc)
                    self.typemap[tmp_arr.name] = self.typemap[out_arr.name]
                    inst.target = tmp_arr
                    nodes = [inst]

                    def f(in_arr):  # pragma: no cover
                        out_a = sdc.distributed_api.rebalance_array(in_arr)

                    f_block = compile_to_numba_ir(
                        f, {
                            'sdc': sdc
                        }, self.typingctx, (self.typemap[tmp_arr.name], ),
                        self.typemap, self.calltypes).blocks.popitem()[1]
                    replace_arg_nodes(f_block, [tmp_arr])
                    nodes += f_block.body[:-3]  # remove none return
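                    # retarget the rebalanced array to the original output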
                    nodes[-1].target = out_arr
                    # update definitions
                    dummy_block = ir.Block(out_arr.scope, out_arr.loc)
                    dummy_block.body = nodes
                    build_definitions({0: dummy_block},
                                      self.func_ir._definitions)
                    new_body += nodes
                else:
                    new_body.append(inst)

            block.body = new_body
Example #6
def _handle_np_fromfile(assign, lhs, rhs):
    """translate np.fromfile() to native
    """
    # TODO: dtype in kws
    if len(rhs.args) != 2:  # pragma: no cover
        raise ValueError("np.fromfile(): file name and dtype expected")

    # FIXME: import here since hio has hdf5 which might not be available
    from .. import hio
    from .. import transport_seq as transport

    import llvmlite.binding as ll
    ll.add_symbol('get_file_size', transport.get_file_size)
    ll.add_symbol('file_read', hio.file_read)
    ll.add_symbol('file_read_parallel', transport.file_read_parallel)
    _fname = rhs.args[0]
    _dtype = rhs.args[1]

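    # template whose last assignment (read_arr = A) is retargeted to lhs below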
    def fromfile_impl(fname, dtype):
        size = get_file_size(fname)
        dtype_size = get_dtype_size(dtype)
        A = np.empty(size // dtype_size, dtype=dtype)
        file_read(fname, A, size)
        read_arr = A

    f_block = compile_to_numba_ir(
        fromfile_impl, {
            'np': np,
            'get_file_size': get_file_size,
            'file_read': file_read,
            'get_dtype_size': get_dtype_size
        }).blocks.popitem()[1]
    replace_arg_nodes(f_block, [_fname, _dtype])
    nodes = f_block.body[:-3]  # remove none return
    nodes[-1].target = lhs
    return nodes
Example #7
    def get_flat_cfg(func):
        func_ir = ir_utils.compile_to_numba_ir(func, dict())
        flat_blocks = ir_utils.flatten_labels(func_ir.blocks)
        self.assertEqual(max(flat_blocks.keys()) + 1, len(func_ir.blocks))
        return ir_utils.compute_cfg_from_blocks(flat_blocks)
Example #8
    def gen_parquet_read(self, file_name, lhs):
        scope = file_name.scope
        loc = file_name.loc

        table_types = None
        # lhs is temporary and will possibly be assigned to user variable
        assert lhs.name.startswith('$')
        if (lhs.name in self.reverse_copies
                and self.reverse_copies[lhs.name] in self.locals):
            table_types = self.locals[self.reverse_copies[lhs.name]]
            self.locals.pop(self.reverse_copies[lhs.name])

        convert_types = {}
        # user-specified type conversion
        if (lhs.name in self.reverse_copies
                and (self.reverse_copies[lhs.name] + ':convert') in self.locals):
            convert_types = self.locals[self.reverse_copies[lhs.name] +
                                        ':convert']
            self.locals.pop(self.reverse_copies[lhs.name] + ':convert')

        if table_types is None:
            fname_def = guard(get_definition, self.func_ir, file_name)
            if (not isinstance(fname_def, (ir.Const, ir.Global, ir.FreeVar))
                    or not isinstance(fname_def.value, str)):
                raise ValueError("Parquet schema not available")
            file_name_str = fname_def.value
            col_names, col_types = parquet_file_schema(file_name_str)
            # remove Pandas index if exists
            # TODO: handle index properly when indices are supported
            _rm_pd_index(col_names, col_types)
        else:
            col_names = list(table_types.keys())
            col_types = list(table_types.values())

        out_nodes = []

        # get arrow readers once

        def init_arrow_readers(fname):
            arrow_readers = get_arrow_readers(unicode_to_char_ptr(fname))

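        # splice in everything but the None return; the last kept
        # assignment's target is the arrow readers handle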
        f_block = compile_to_numba_ir(
            init_arrow_readers, {
                'get_arrow_readers': _get_arrow_readers,
                'unicode_to_char_ptr': unicode_to_char_ptr,
            }).blocks.popitem()[1]

        replace_arg_nodes(f_block, [file_name])
        out_nodes += f_block.body[:-3]
        arrow_readers_var = out_nodes[-1].target

        col_arrs = []
        for i, cname in enumerate(col_names):
            # get column type from schema
            c_type = col_types[i]
            if cname in convert_types:
                c_type = convert_types[cname].dtype

            # create a variable for column and assign type
            varname = mk_unique_var(cname)
            #self.locals[varname] = c_type
            cvar = ir.Var(scope, varname, loc)
            col_arrs.append(cvar)

            out_nodes += get_column_read_nodes(c_type, cvar, arrow_readers_var,
                                               i)

        # delete arrow readers
        def cleanup_arrow_readers(readers):
            s = del_arrow_readers(readers)

        f_block = compile_to_numba_ir(cleanup_arrow_readers, {
            'del_arrow_readers': _del_arrow_readers,
        }).blocks.popitem()[1]
        replace_arg_nodes(f_block, [arrow_readers_var])
        out_nodes += f_block.body[:-3]
        return col_names, col_arrs, out_nodes