Exemple #1
0
        def _sdc_take_list_str_impl(data, indexes):
            """
            Gather elements of ``data`` selected by a sequence of index sequences.

            The result is one flat array allocated with pre_alloc_string_array: first
            the elements selected by indexes[0], then those selected by indexes[1],
            and so on. Elements reported as NA by isna() are marked NA in the result.
            """
            num_chunks = len(indexes)

            # offsets[i] is the output position of the first element selected by
            # indexes[i]; offsets[num_chunks] is the total output length. Computing
            # the prefix sums once avoids slicing ``indexes`` and re-summing all the
            # previous chunk lengths inside every parallel iteration (quadratic work).
            offsets = numpy.zeros(num_chunks + 1, dtype=numpy.int64)
            for i in range(num_chunks):
                offsets[i + 1] = offsets[i] + len(indexes[i])
            res_size = offsets[num_chunks]

            # first pass: count the bytes to allocate and remember NA positions
            nan_mask = numpy.zeros(res_size, dtype=numpy.bool_)
            num_total_bytes = 0
            for i in numba.prange(num_chunks):
                current_pos = offsets[i]
                for j in range(len(indexes[i])):
                    num_total_bytes += get_utf8_size(data[indexes[i][j]])
                    if isna(data, indexes[i][j]):
                        nan_mask[current_pos] = True
                    current_pos += 1

            # second pass: copy the selected values and restore the NA flags
            res_arr = pre_alloc_string_array(res_size, num_total_bytes)
            for i in numba.prange(num_chunks):
                current_pos = offsets[i]
                for j in range(len(indexes[i])):
                    res_arr[current_pos] = data[indexes[i][j]]
                    if nan_mask[current_pos]:
                        str_arr_set_na(res_arr, current_pos)
                    current_pos += 1

            return res_arr
Exemple #2
0
        def _sdc_take_str_arr_impl(data, indexes):
            """
            Gather data[indexes[0]], data[indexes[1]], ... into a new array.

            The result is allocated with pre_alloc_string_array; elements reported
            as NA by isna() are marked NA in the result too.
            """
            n_taken = len(indexes)

            # pass 1 (prange): total number of bytes to allocate and NA flags
            total_bytes = 0
            na_flags = numpy.zeros(n_taken, dtype=numpy.bool_)
            for pos in numba.prange(n_taken):
                src = indexes[pos]
                total_bytes += get_utf8_size(data[src])
                if isna(data, src):
                    na_flags[pos] = True

            # pass 2: copy the values one by one (plain loop, not prange) and
            # re-apply the NA flags collected above
            taken = pre_alloc_string_array(n_taken, total_bytes)
            for pos in numpy.arange(n_taken):
                src = indexes[pos]
                taken[pos] = data[src]
                if na_flags[pos]:
                    str_arr_set_na(taken, pos)

            return taken
Exemple #3
0
def fill_str_array(data, size, push_back=True):
    """
    Pad a StringArrayType array with NA entries until it has ``size`` elements.

    With push_back=True the padding follows the values of ``data``, otherwise it
    precedes them. Elements of ``data`` that are NA stay NA in the result.
    """

    data_len = len(data)
    pad_len = size - data_len

    total_chars = sdc.str_arr_ext.num_total_chars(data)
    filled = sdc.str_arr_ext.pre_alloc_string_array(size, total_chars)

    # NA flags of the source array, captured before it is converted to a list
    source_na = numpy.array(
        [sdc.hiframes.api.isna(data, pos) for pos in range(data_len)])
    strings = sdc.str_arr_ext.to_string_list(data)
    padding = [''] * pad_len

    # Padding slots are first written as empty strings, then flagged NA below
    if push_back:
        ordered = strings + padding
    else:
        ordered = padding + strings
    cp_str_list_to_array(filled, ordered)

    # Position in the result at which the elements of ``data`` begin
    data_start = 0 if push_back else pad_len

    # Iterate in batches of 64 elements to avoid threads competition
    batch_size = 64
    for batch in numba.prange(size // batch_size + 1):
        first = batch * batch_size
        last = min(first + batch_size, size)
        for pos in range(first, last):
            src = pos - data_start
            # outside of the copied data -> padding; inside -> keep source NA flag
            if src < 0 or src >= data_len or source_na[src]:
                str_arr_set_na(filled, pos)

    return filled
Exemple #4
0
def setitem_arr_nan_overload(arr, ind):
    """
    Pick the implementation that writes a "missing value" marker to arr[ind].

    The marker depends on the array type: float arrays use setitem_arr_nan,
    datetime/timedelta arrays get NaT, string arrays are flagged through
    str_arr_set_na, bool arrays get False, categorical arrays get code -1 and
    integer arrays get 0. Any other array type gets a no-op implementation.
    """
    elem_type = arr.dtype

    if isinstance(elem_type, types.Float):
        return setitem_arr_nan

    if isinstance(elem_type, (types.NPDatetime, types.NPTimedelta)):
        nat = elem_type('NaT')

        def _setnan_impl(arr, ind):
            arr[ind] = nat

        return _setnan_impl

    if arr == string_array_type:

        def _setnan_str_impl(arr, ind):
            return str_arr_set_na(arr, ind)

        return _setnan_str_impl

    # TODO: support strings, bools, etc.
    # XXX: NA values in bool arrays are stored as False
    # FIXME: replace with proper NaN
    if elem_type == types.bool_:

        def b_set(arr, ind):
            arr[ind] = False

        return b_set

    if isinstance(arr, CategoricalArray):

        def setitem_arr_nan_cat(arr, ind):
            # write -1 into the integer view of the categorical array
            codes = sdc.hiframes.pd_categorical_ext.cat_array_to_int(arr)
            codes[ind] = -1

        return setitem_arr_nan_cat

    # XXX: integer NA is stored as 0 to avoid unexpected errors
    # TODO: convert integer to float if nan
    if isinstance(elem_type, types.Integer):

        def setitem_arr_nan_int(arr, ind):
            arr[ind] = 0

        return setitem_arr_nan_int

    # no NA representation for the remaining array types: do nothing
    def _setnan_noop_impl(arr, ind):
        return None

    return _setnan_noop_impl
Exemple #5
0
        def sdc_join_series_indexes_impl(left, right):
            """
            Outer-join two string indexes using a sort-merge approach.

            Both inputs are sorted with a stable mergesort (through pandas.Series,
            since argsort is not implemented for StringArrays) and merged. Values
            present on one side only produce one output row; runs of equal values
            on both sides produce every (left row, right row) combination.

            Returns a tuple ``(joined, lidx, ridx)``:
              joined - string array with the joined index values;
              lidx   - for each joined row, position of the value in ``left`` or -1;
              ridx   - for each joined row, position of the value in ``right`` or -1.

            NOTE(review): lidx and ridx are returned without trimming, only their
            first len(joined) entries are written by this function.
            """

            # allocate result arrays
            lsize = len(left)
            rsize = len(right)
            est_total_size = int(1.1 * (lsize + rsize))

            lidx = numpy.empty(est_total_size, numpy.int64)
            ridx = numpy.empty(est_total_size, numpy.int64)

            # use Series.sort_values since argsort for StringArrays not implemented
            original_left_series = pandas.Series(left)
            original_right_series = pandas.Series(right)

            # sort arrays saving the old positions
            left_series = original_left_series.sort_values(kind='mergesort')
            right_series = original_right_series.sort_values(kind='mergesort')
            sorted_left = left_series._index
            sorted_right = right_series._index

            # i, j - cursors in sorted left/right; k - number of joined rows so far
            i, j, k = 0, 0, 0
            while (i < lsize and j < rsize):
                lidx = _hpat_ensure_array_capacity(k + 1, lidx)
                ridx = _hpat_ensure_array_capacity(k + 1, ridx)

                left_index = left[sorted_left[i]]
                right_index = right[sorted_right[j]]

                if (left_index < right_index):
                    # value exists in left only
                    lidx[k] = sorted_left[i]
                    ridx[k] = -1
                    i += 1
                    k += 1
                elif (left_index > right_index):
                    # value exists in right only
                    lidx[k] = -1
                    ridx[k] = sorted_right[j]
                    j += 1
                    k += 1
                else:
                    # find ends of sequences of equal index values in left and right
                    ni, nj = i, j
                    while (ni < lsize and left[sorted_left[ni]] == left_index):
                        ni += 1
                    while (nj < rsize
                           and right[sorted_right[nj]] == right_index):
                        nj += 1

                    # the block of matching right positions is the same for every
                    # matching left row, so build it once instead of per left row
                    block_size = nj - j
                    to_ridx = numpy.array(
                        [sorted_right[r] for r in numpy.arange(j, nj, 1)],
                        numpy.int64)

                    # join the blocks found into results
                    for s in numpy.arange(i, ni, 1):
                        to_lidx = numpy.repeat(sorted_left[s], block_size)

                        lidx = _hpat_ensure_array_capacity(
                            k + block_size, lidx)
                        ridx = _hpat_ensure_array_capacity(
                            k + block_size, ridx)

                        lidx[k:k + block_size] = to_lidx
                        ridx[k:k + block_size] = to_ridx
                        k += block_size
                    i = ni
                    j = nj

            # fill the end of joined with remaining part of left or right
            if i < lsize:
                block_size = lsize - i
                lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                ridx[k:k + block_size] = numpy.repeat(-1, block_size)
                while i < lsize:
                    lidx[k] = sorted_left[i]
                    i += 1
                    k += 1

            elif j < rsize:
                block_size = rsize - j
                lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                lidx[k:k + block_size] = numpy.repeat(-1, block_size)
                while j < rsize:
                    ridx[k] = sorted_right[j]
                    j += 1
                    k += 1

            # count total number of characters and allocate joined array
            total_joined_size = k
            num_chars_in_joined = 0
            for i in numpy.arange(total_joined_size):
                if lidx[i] != -1:
                    num_chars_in_joined += len(left[lidx[i]])
                elif ridx[i] != -1:
                    num_chars_in_joined += len(right[ridx[i]])

            joined = pre_alloc_string_array(total_joined_size,
                                            num_chars_in_joined)

            # iterate over joined and fill it with indexes using lidx and ridx indexers
            for i in numpy.arange(total_joined_size):
                if lidx[i] != -1:
                    joined[i] = left[lidx[i]]
                    if (str_arr_is_na(left, lidx[i])):
                        str_arr_set_na(joined, i)
                elif ridx[i] != -1:
                    joined[i] = right[ridx[i]]
                    if (str_arr_is_na(right, ridx[i])):
                        str_arr_set_na(joined, i)
                else:
                    str_arr_set_na(joined, i)

            return joined, lidx, ridx