def _sdc_take_list_str_impl(data, indexes):
    """Gather strings of ``data`` selected by a sequence of index chunks.

    ``indexes`` is a sequence whose items are themselves index sequences.
    The result is one string array containing ``data[indexes[i][j]]`` for
    every chunk ``i`` and every position ``j`` inside it, chunk after chunk.
    Elements reported as NA by ``isna(data, idx)`` are marked NA in the
    result with ``str_arr_set_na``.
    """
    num_chunks = len(indexes)

    # chunk_starts[i] is the output position of the first element of chunk i
    # (exclusive prefix sum of the chunk lengths); the last entry is the
    # total output size. Computing it once replaces re-summing all preceding
    # chunk lengths (and slicing ``indexes``) in every parallel iteration.
    chunk_starts = numpy.zeros(num_chunks + 1, dtype=numpy.int64)
    for i in range(num_chunks):
        chunk_starts[i + 1] = chunk_starts[i] + len(indexes[i])
    res_size = chunk_starts[num_chunks]

    # First pass: collect the NA mask and the number of bytes to allocate.
    nan_mask = numpy.zeros(res_size, dtype=numpy.bool_)
    num_total_bytes = 0
    for i in numba.prange(num_chunks):
        current_pos = chunk_starts[i]
        for j in range(len(indexes[i])):
            num_total_bytes += get_utf8_size(data[indexes[i][j]])
            if isna(data, indexes[i][j]):
                nan_mask[current_pos] = True
            current_pos += 1

    # Second pass: copy the strings and restore the NA flags.
    # NOTE(review): the single-array variant of this routine fills its result
    # with a sequential loop; confirm that string-array item assignment is
    # safe under prange here.
    res_arr = pre_alloc_string_array(res_size, num_total_bytes)
    for i in numba.prange(num_chunks):
        current_pos = chunk_starts[i]
        for j in range(len(indexes[i])):
            res_arr[current_pos] = data[indexes[i][j]]
            if nan_mask[current_pos]:
                str_arr_set_na(res_arr, current_pos)
            current_pos += 1

    return res_arr
def _sdc_take_str_arr_impl(data, indexes):
    """Build a string array holding ``data[indexes[pos]]`` for every ``pos``.

    Elements reported as NA by ``isna(data, idx)`` are marked NA in the
    result with ``str_arr_set_na``.
    """
    res_size = len(indexes)
    nan_mask = numpy.zeros(res_size, dtype=numpy.bool_)

    # First pass (parallel): accumulate the byte size to allocate and record
    # which selected elements are NA.
    num_total_bytes = 0
    for pos in numba.prange(res_size):
        src = indexes[pos]
        num_total_bytes += get_utf8_size(data[src])
        if isna(data, src):
            nan_mask[pos] = True

    res_arr = pre_alloc_string_array(res_size, num_total_bytes)

    # Second pass (sequential): copy the strings and restore the NA flags.
    for pos in numpy.arange(res_size):
        src = indexes[pos]
        res_arr[pos] = data[src]
        if nan_mask[pos]:
            str_arr_set_na(res_arr, pos)

    return res_arr
def fill_str_array(data, size, push_back=True):
    """
    Pad a StringArrayType array with NaN entries until it holds ``size`` items.

    When ``push_back`` is true the padding follows the original values,
    otherwise it precedes them. Entries that are NaN in ``data`` are NaN in
    the result as well.
    """
    src_len = len(data)
    pad_len = size - src_len

    result_data = sdc.str_arr_ext.pre_alloc_string_array(
        size, sdc.str_arr_ext.num_total_chars(data))

    # Remember which source entries are NaN before the data is copied
    src_na_mask = numpy.array(
        [sdc.hiframes.api.isna(data, pos) for pos in range(src_len)])

    # Padding slots are written as empty strings first and flagged NaN below
    src_strings = sdc.str_arr_ext.to_string_list(data)
    padding = [''] * pad_len
    if push_back:
        result_list = src_strings + padding
    else:
        result_list = padding + src_strings
    cp_str_list_to_array(result_data, result_list)

    # Position of the first source entry inside the result
    shift = 0 if push_back else pad_len

    # Batch=64 iteration to avoid threads competition
    batch_size = 64
    for batch in numba.prange(size // batch_size + 1):
        batch_begin = batch * batch_size
        batch_end = min(batch_begin + batch_size, size)
        for j in range(batch_begin, batch_end):
            src_pos = j - shift
            # A slot is NaN when it is padding (outside the source range) or
            # when the source entry it came from was NaN
            if src_pos < 0 or src_pos >= src_len or src_na_mask[src_pos]:
                str_arr_set_na(result_data, j)

    return result_data
def setitem_arr_nan_overload(arr, ind):
    """Choose the implementation that stores a missing-value marker at ``arr[ind]``.

    The choice is made from the type of ``arr``; the checks run in the order
    written below and the first match wins. Types without a dedicated
    branch get an implementation that does nothing.
    """
    elem_type = arr.dtype

    if isinstance(elem_type, types.Float):
        return setitem_arr_nan

    if isinstance(elem_type, (types.NPDatetime, types.NPTimedelta)):
        nat = elem_type('NaT')

        def _setnan_impl(arr, ind):
            arr[ind] = nat

        return _setnan_impl

    if arr == string_array_type:
        def _setnan_str_impl(arr, ind):
            return str_arr_set_na(arr, ind)

        return _setnan_str_impl

    # XXX: bool arrays have no NA representation here, False is stored
    # FIXME: replace with proper NaN
    if elem_type == types.bool_:
        def _setnan_bool_impl(arr, ind):
            arr[ind] = False

        return _setnan_bool_impl

    if isinstance(arr, CategoricalArray):
        def _setnan_cat_impl(arr, ind):
            # -1 is written into the integer view of the categorical array
            codes = sdc.hiframes.pd_categorical_ext.cat_array_to_int(arr)
            codes[ind] = -1

        return _setnan_cat_impl

    # XXX set integer NA to 0 to avoid unexpected errors
    # TODO: convert integer to float if nan
    if isinstance(elem_type, types.Integer):
        def _setnan_int_impl(arr, ind):
            arr[ind] = 0

        return _setnan_int_impl

    # TODO: support the remaining array types
    def _setnan_noop_impl(arr, ind):
        return None

    return _setnan_noop_impl
def sdc_join_series_indexes_impl(left, right):
    """Outer-join two string indexes and return the joined index with indexers.

    Both inputs are sorted (keeping the original positions) and merged.
    Equal labels produce every left/right pairing of the two runs of equal
    values.

    Returns a tuple ``(joined, lidx, ridx)`` of equal length:
      * ``joined`` - string array with the joined labels;
      * ``lidx`` / ``ridx`` - int64 positions into ``left`` / ``right`` for
        each joined row, ``-1`` when the row has no counterpart on that side.
    """
    # allocate result arrays
    lsize = len(left)
    rsize = len(right)
    est_total_size = int(1.1 * (lsize + rsize))
    lidx = numpy.empty(est_total_size, numpy.int64)
    ridx = numpy.empty(est_total_size, numpy.int64)

    # use Series.sort_values since argsort for StringArrays not implemented
    original_left_series = pandas.Series(left)
    original_right_series = pandas.Series(right)

    # sort arrays saving the old positions: the index of each sorted Series
    # is used below as the list of original positions in sorted order
    left_series = original_left_series.sort_values(kind='mergesort')
    right_series = original_right_series.sort_values(kind='mergesort')
    sorted_left = left_series._index
    sorted_right = right_series._index

    # i, j walk the sorted inputs; k is the next free row of the result
    i, j, k = 0, 0, 0
    while i < lsize and j < rsize:
        lidx = _hpat_ensure_array_capacity(k + 1, lidx)
        ridx = _hpat_ensure_array_capacity(k + 1, ridx)

        left_index = left[sorted_left[i]]
        right_index = right[sorted_right[j]]

        if left_index < right_index:
            lidx[k] = sorted_left[i]
            ridx[k] = -1
            i += 1
            k += 1
        elif left_index > right_index:
            lidx[k] = -1
            ridx[k] = sorted_right[j]
            j += 1
            k += 1
        else:
            # find ends of sequences of equal index values in left and right
            ni, nj = i, j
            while ni < lsize and left[sorted_left[ni]] == left_index:
                ni += 1
            while nj < rsize and right[sorted_right[nj]] == right_index:
                nj += 1

            # The right-hand block is the same for every left element of the
            # run, so it is built once instead of once per left element. The
            # comprehension variable is deliberately not named ``k`` to keep
            # it apart from the output cursor.
            block_size = nj - j
            to_ridx = numpy.array(
                [sorted_right[pos] for pos in numpy.arange(j, nj, 1)],
                numpy.int64)

            # join the blocks found into results
            for s in numpy.arange(i, ni, 1):
                to_lidx = numpy.repeat(sorted_left[s], block_size)

                lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                ridx = _hpat_ensure_array_capacity(k + block_size, ridx)

                lidx[k:k + block_size] = to_lidx
                ridx[k:k + block_size] = to_ridx
                k += block_size
            i = ni
            j = nj

    # fill the end of joined with remaining part of left or right
    if i < lsize:
        block_size = lsize - i
        lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
        ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
        ridx[k:k + block_size] = numpy.repeat(-1, block_size)
        while i < lsize:
            lidx[k] = sorted_left[i]
            i += 1
            k += 1
    elif j < rsize:
        block_size = rsize - j
        lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
        ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
        lidx[k:k + block_size] = numpy.repeat(-1, block_size)
        while j < rsize:
            ridx[k] = sorted_right[j]
            j += 1
            k += 1

    # Count the storage needed for the joined labels and allocate the array.
    # The size is taken with get_utf8_size, the same helper the other
    # string-array builders in this file use to size pre_alloc_string_array.
    total_joined_size = k
    num_chars_in_joined = 0
    for i in numpy.arange(total_joined_size):
        if lidx[i] != -1:
            num_chars_in_joined += get_utf8_size(left[lidx[i]])
        elif ridx[i] != -1:
            num_chars_in_joined += get_utf8_size(right[ridx[i]])

    joined = pre_alloc_string_array(total_joined_size, num_chars_in_joined)

    # iterate over joined and fill it with indexes using lidx and ridx indexers
    for i in numpy.arange(total_joined_size):
        if lidx[i] != -1:
            joined[i] = left[lidx[i]]
            if str_arr_is_na(left, lidx[i]):
                str_arr_set_na(joined, i)
        elif ridx[i] != -1:
            joined[i] = right[ridx[i]]
            if str_arr_is_na(right, ridx[i]):
                str_arr_set_na(joined, i)
        else:
            str_arr_set_na(joined, i)

    # lidx/ridx are over-allocated work buffers; only the first
    # total_joined_size entries were written, so the uninitialised tail is
    # cut off to match the length of ``joined``.
    return joined, lidx[:total_joined_size], ridx[:total_joined_size]