Example #1
    def generic(self, args, kws):
        from hpat.str_arr_ext import is_str_arr_typ
        assert not kws
        [va, vb] = args
        # if one of the inputs is a string series
        if is_str_series_typ(va) or is_str_series_typ(vb):
            # inputs should be either string array or string
            assert is_str_arr_typ(va) or va == string_type
            assert is_str_arr_typ(vb) or vb == string_type
            return signature(SeriesType(types.boolean), va, vb)

        if ((is_dt64_series_typ(va) and vb == string_type)
                or (is_dt64_series_typ(vb) and va == string_type)):
            return signature(SeriesType(types.boolean), va, vb)
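
For context, this generic method is the body of a Numba typing template. Below is a minimal sketch of how such a method is typically attached to an AbstractTemplate and registered, assuming the pre-0.47 numba.typing layout these examples use (newer Numba moved it under numba.core.typing.templates); series_cmp, SeriesCmpTyper, and the boolean-array result type are illustrative assumptions, not hpat's actual code:

from numba import types
from numba.typing.templates import AbstractTemplate, infer_global, signature

def series_cmp(a, b):  # hypothetical placeholder for the function being typed
    pass

@infer_global(series_cmp)
class SeriesCmpTyper(AbstractTemplate):
    def generic(self, args, kws):
        assert not kws
        va, vb = args
        # illustrative result: an element-wise comparison yields a 1D boolean array
        return signature(types.Array(types.boolean, 1, 'C'), va, vb)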
Example #2
def concat_overload(arr_list):
    # all string input case
    # TODO: handle numerics to string casting case
    if (isinstance(arr_list, types.UniTuple)
            and is_str_arr_typ(arr_list.dtype)):
        def string_concat_impl(in_arrs):
            # preallocate the output
            num_strs = 0
            num_chars = 0
            for A in in_arrs:
                arr = dummy_unbox_series(A)
                num_strs += len(arr)
                num_chars += hpat.str_arr_ext.num_total_chars(arr)
            out_arr = hpat.str_arr_ext.pre_alloc_string_array(num_strs, num_chars)
            # copy data to output
            curr_str_ind = 0
            curr_chars_ind = 0
            for A in in_arrs:
                arr = dummy_unbox_series(A)
                hpat.str_arr_ext.set_string_array_range(
                    out_arr, arr, curr_str_ind, curr_chars_ind)
                curr_str_ind += len(arr)
                curr_chars_ind += hpat.str_arr_ext.num_total_chars(arr)
            return out_arr

        return string_concat_impl
    for typ in arr_list:
        if not isinstance(typ, types.Array):
            raise ValueError("concat supports only numerical and string arrays")
    # numerical input
    return lambda a: np.concatenate(dummy_unbox_series(a))
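
concat_overload follows Numba's overload pattern: inspect the argument types, then return a concrete implementation (or raise for unsupported inputs). A self-contained sketch of that pattern, independent of hpat; my_concat, my_concat_overload, and use_concat are illustrative names:

import numpy as np
from numba import njit, types
from numba.extending import overload

def my_concat(arr_list):
    # pure-Python fallback used outside of jitted code
    return np.concatenate(arr_list)

@overload(my_concat)
def my_concat_overload(arr_list):
    # choose an implementation based on the tuple's element type
    if isinstance(arr_list, types.UniTuple) and isinstance(arr_list.dtype, types.Array):
        def impl(arr_list):
            return np.concatenate(arr_list)
        return impl
    raise ValueError("my_concat supports only tuples of arrays")

@njit
def use_concat(a, b):
    return my_concat((a, b))

# use_concat(np.ones(3), np.zeros(2)) -> array([1., 1., 1., 0., 0.])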
Example #3
    def _handle_string_array_expr(self, lhs, rhs, assign):
        # convert str_arr==str into parfor
        if (rhs.op == 'binop' and rhs.fn in ['==', '!=', '>=', '>', '<=', '<']
                and (is_str_arr_typ(self.typemap[rhs.lhs.name])
                     or is_str_arr_typ(self.typemap[rhs.rhs.name]))):
            arg1 = rhs.lhs
            arg2 = rhs.rhs
            arg1_access = 'A'
            arg2_access = 'B'
            len_call = 'len(A)'
            if is_str_arr_typ(self.typemap[arg1.name]):
                arg1_access = 'A[i]'
                # replace type now for correct typing of len, etc.
                self.typemap.pop(arg1.name)
                self.typemap[arg1.name] = string_array_type

            if is_str_arr_typ(self.typemap[arg2.name]):
                arg2_access = 'B[i]'
                len_call = 'len(B)'
                self.typemap.pop(arg2.name)
                self.typemap[arg2.name] = string_array_type

            func_text = 'def f(A, B):\n'
            func_text += '  l = {}\n'.format(len_call)
            func_text += '  S = np.empty(l, dtype=np.bool_)\n'
            func_text += '  for i in numba.parfor.internal_prange(l):\n'
            func_text += '    S[i] = {} {} {}\n'.format(
                arg1_access, rhs.fn, arg2_access)

            loc_vars = {}
            exec(func_text, {}, loc_vars)
            f = loc_vars['f']
            f_blocks = compile_to_numba_ir(
                f, {
                    'numba': numba,
                    'np': np
                }, self.typingctx,
                (if_series_to_array_type(self.typemap[arg1.name]),
                 if_series_to_array_type(self.typemap[arg2.name])),
                self.typemap, self.calltypes).blocks
            replace_arg_nodes(f_blocks[min(f_blocks.keys())], [arg1, arg2])
            # replace the comparison expression with the parfor's result (S)
            # S is the target of the last assignment in the 1st block of f,
            # i.e. body[-2], the statement just before the block terminator
            assign.value = f_blocks[min(f_blocks.keys())].body[-2].target
            return (f_blocks, [assign])

        return None
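
For concreteness, the function text this generates when the expression is str_arr == "abc" (arg1 a string array, arg2 a plain string, so only arg1_access becomes A[i]) is:

def f(A, B):
  l = len(A)
  S = np.empty(l, dtype=np.bool_)
  for i in numba.parfor.internal_prange(l):
    S[i] = A[i] == B

When both operands are string arrays, B is likewise indexed as B[i] and l is taken from len(B).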
Example #4
    def generic(self, args, kws):
        assert not kws
        assert len(args) == 1
        arr_list = args[0]
        if (isinstance(arr_list, types.UniTuple)
                and is_str_arr_typ(arr_list.dtype)):
            ret_typ = string_array_type
        else:
            # use typer of np.concatenate
            arr_list_to_arr = if_series_to_array_type(arr_list)
            ret_typ = numba.typing.npydecl.NdConcatenate(self.context).generic()(arr_list_to_arr)

        return signature(ret_typ, arr_list)
Example #5
def init_set_string_array(in_typ):
    if is_str_arr_typ(in_typ):

        def f(A):
            str_arr = dummy_unbox_series(A)
            str_set = init_set_string()
            n = len(str_arr)
            for i in range(n):
                str = str_arr[i]
                str_set.add(str)
                hpat.str_ext.del_str(str)
            return str_set

        return f
Example #6
def populate_str_arr_from_set(typingctx, in_set_typ, in_str_arr_typ=None):
    assert in_set_typ == set_string_type
    assert is_str_arr_typ(in_str_arr_typ)

    def codegen(context, builder, sig, args):
        in_set, in_str_arr = args

        string_array = context.make_helper(builder, string_array_type,
                                           in_str_arr)

        fnty = lir.FunctionType(lir.VoidType(), [
            lir.IntType(8).as_pointer(),
            lir.IntType(32).as_pointer(),
            lir.IntType(8).as_pointer(),
        ])
        fn_getitem = builder.module.get_or_insert_function(
            fnty, name="populate_str_arr_from_set")
        builder.call(fn_getitem,
                     [in_set, string_array.offsets, string_array.data])
        return context.get_dummy_value()

    return types.void(set_string_type, string_array_type), codegen
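
This shape, a typing function taking (typingctx, argument types) and returning a (signature, codegen) pair, is what Numba's @intrinsic decorator expects. A minimal, self-contained example of the same pattern, unrelated to hpat (identity_i64 simply passes an int64 through unchanged):

from numba import njit, types
from numba.extending import intrinsic

@intrinsic
def identity_i64(typingctx, val):
    assert val == types.int64

    def codegen(context, builder, sig, args):
        # no LLVM work needed: forward the incoming value unchanged
        return args[0]

    return types.int64(types.int64), codegen

@njit
def use_identity(x):
    return identity_i64(x)

# use_identity(7) -> 7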
Example #7
def build_set(A):
    if is_str_arr_typ(A):
        return _build_str_set_impl
    else:
        return lambda A: set(A)
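
build_set dispatches on the input type: string arrays get the dedicated implementation, everything else falls back to set(A). That fallback is also what the nunique code in Example #9 builds on; a small hedged sketch (numeric_nunique is an illustrative name, and it assumes, as the lambda above does, a Numba version where set(A) over a numeric array is supported):

import numpy as np
from numba import njit

@njit
def numeric_nunique(A):
    # build the set inside jitted code, as build_set's fallback does,
    # and return only its size to avoid relying on set reflection
    return len(set(A))

# numeric_nunique(np.array([1, 2, 2, 3])) would give 3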
Example #8
def init_set_string_array(A):
    if is_str_arr_typ(A):
        return _build_str_set_impl
Example #9
def nunique_overload_parallel(arr_typ):
    # TODO: extend to other types
    sum_op = hpat.distributed_api.Reduce_Type.Sum.value
    if is_str_arr_typ(arr_typ):
        int32_typ_enum = np.int32(_h5_typ_table[types.int32])
        char_typ_enum = np.int32(_h5_typ_table[types.uint8])

        def nunique_par_str(A):
            uniq_A = hpat.utils.to_array(set(A))
            n_strs = len(uniq_A)
            n_pes = hpat.distributed_api.get_size()
            # send recv counts for the number of strings
            send_counts, recv_counts = hpat.hiframes_join.send_recv_counts_new(
                uniq_A)
            send_disp = hpat.hiframes_join.calc_disp(send_counts)
            recv_disp = hpat.hiframes_join.calc_disp(recv_counts)
            recv_size = recv_counts.sum()
            # send recv counts for the number of chars
            send_chars_count, recv_chars_count = set_recv_counts_chars(uniq_A)
            send_disp_chars = hpat.hiframes_join.calc_disp(send_chars_count)
            recv_disp_chars = hpat.hiframes_join.calc_disp(recv_chars_count)
            recv_num_chars = recv_chars_count.sum()
            n_all_chars = hpat.str_arr_ext.num_total_chars(uniq_A)

            # allocate send recv arrays
            send_arr_lens = np.empty(n_strs,
                                     np.uint32)  # XXX offset type is uint32
            send_arr_chars = np.empty(n_all_chars, np.uint8)
            recv_arr = hpat.str_arr_ext.pre_alloc_string_array(
                recv_size, recv_num_chars)

            # populate send array
            tmp_offset = np.zeros(n_pes, dtype=np.int64)
            tmp_offset_chars = np.zeros(n_pes, dtype=np.int64)

            for i in range(n_strs):
                str = uniq_A[i]
                node_id = hash(str) % n_pes
                # lens
                ind = send_disp[node_id] + tmp_offset[node_id]
                send_arr_lens[ind] = len(str)
                tmp_offset[node_id] += 1
                # chars
                indc = send_disp_chars[node_id] + tmp_offset_chars[node_id]
                str_copy(send_arr_chars, indc, str.c_str(), len(str))
                tmp_offset_chars[node_id] += len(str)
                hpat.str_ext.del_str(str)

            # shuffle len values
            offset_ptr = hpat.str_arr_ext.get_offset_ptr(recv_arr)
            c_alltoallv(send_arr_lens.ctypes, offset_ptr, send_counts.ctypes,
                        recv_counts.ctypes, send_disp.ctypes, recv_disp.ctypes,
                        int32_typ_enum)
            data_ptr = hpat.str_arr_ext.get_data_ptr(recv_arr)
            # shuffle char values
            c_alltoallv(send_arr_chars.ctypes, data_ptr,
                        send_chars_count.ctypes, recv_chars_count.ctypes,
                        send_disp_chars.ctypes, recv_disp_chars.ctypes,
                        char_typ_enum)
            convert_len_arr_to_offset(offset_ptr, recv_size)
            loc_nuniq = len(set(recv_arr))
            return hpat.distributed_api.dist_reduce(loc_nuniq,
                                                    np.int32(sum_op))

        return nunique_par_str

    assert arr_typ == types.Array(types.int64, 1,
                                  'C'), "only int64 for parallel nunique"

    def nunique_par(A):
        uniq_A = hpat.utils.to_array(set(A))
        send_counts, recv_counts = hpat.hiframes_join.send_recv_counts_new(
            uniq_A)
        send_disp = hpat.hiframes_join.calc_disp(send_counts)
        recv_disp = hpat.hiframes_join.calc_disp(recv_counts)
        recv_size = recv_counts.sum()
        # (send_counts, recv_counts, send_disp, recv_disp,
        #  recv_size) = hpat.hiframes_join.get_sendrecv_counts(uniq_A)
        send_arr = np.empty_like(uniq_A)
        recv_arr = np.empty(recv_size, uniq_A.dtype)
        hpat.hiframes_join.shuffle_data(send_counts, recv_counts, send_disp,
                                        recv_disp, uniq_A, send_arr, recv_arr)
        loc_nuniq = len(set(recv_arr))
        return hpat.distributed_api.dist_reduce(loc_nuniq, np.int32(sum_op))

    return nunique_par
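
Both branches shuffle the locally de-duplicated values with MPI-alltoallv-style counts and displacements, after assigning each unique value to a rank via hash(...) % n_pes, so duplicates from different processes land on the same rank and the final dist_reduce of local counts is exact. A hedged sketch of what calc_disp is assumed to compute (exclusive prefix sums of the per-rank counts); hpat's actual helper may differ in detail:

import numpy as np

def calc_disp(counts):
    # displacement[i] = number of elements destined for ranks 0..i-1
    disp = np.empty_like(counts)
    disp[0] = 0
    np.cumsum(counts[:-1], out=disp[1:])
    return disp

# e.g. calc_disp(np.array([3, 1, 4])) -> array([0, 3, 4])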