def sdc_array_equal_str_arr_impl(A, B):
    """Return True when string arrays A and B are element-wise equal.

    The arrays are equal when they have the same length, the same
    ``num_total_chars`` value, and every position holds equal strings with
    matching ``str_arr_is_na`` flags.

    Args:
        A: first string array.
        B: second string array.

    Returns:
        bool: True if the arrays are equal, False otherwise.
    """
    size = len(A)
    # Bail out before touching any element: indexing B with positions taken
    # from a longer A would read past the end of B.
    if size != len(B):
        return False
    # Cheap whole-array check that lets us skip the element-wise scan.
    if num_total_chars(A) != num_total_chars(B):
        return False

    for i in range(size):
        if (A[i] != B[i]
                or str_arr_is_na(A, i) is not str_arr_is_na(B, i)):
            return False

    return True
def sdc_check_indexes_equal_string_impl(A, B):
    """Return True when string indexes A and B are element-wise equal.

    The indexes are equal when they have the same length, the same
    ``num_total_chars`` value, and every position holds equal strings with
    matching ``str_arr_is_na`` flags.

    Args:
        A: first string array.
        B: second string array.

    Returns:
        bool: True if the indexes are equal, False otherwise.
    """
    # TODO: replace with StringArrays comparison
    size = len(A)
    # Bail out before touching any element: indexing B with positions taken
    # from a longer A would read past the end of B.
    if size != len(B):
        return False
    # Cheap whole-array check that lets us skip the element-wise scan.
    if num_total_chars(A) != num_total_chars(B):
        return False

    for i in range(size):
        if (A[i] != B[i]
                or str_arr_is_na(A, i) is not str_arr_is_na(B, i)):
            return False

    return True
def _append_single_string_array_impl(A, B):
    """Concatenate string arrays A and B into a newly allocated string array.

    The result is pre-allocated for the combined length and the combined
    ``num_total_chars`` of both inputs, then filled with A followed by B.

    Returns:
        The newly allocated string array.
    """
    n_items = len(A) + len(B)
    n_chars = num_total_chars(A) + num_total_chars(B)
    joined = sdc.str_arr_ext.pre_alloc_string_array(n_items, n_chars)

    # The value returned for A is the position at which B is written.
    b_start = append_string_array_to(joined, 0, A)
    append_string_array_to(joined, b_start, B)
    return joined
def getitem_str_impl(arr, slice_index, start, count):
    """Build the string array for ``arr[:slice_index.stop]`` from local chunks.

    Each rank takes the leading part of its local chunk that lies before the
    slice stop, the parts are combined with ``gatherv``, the character counts
    are combined with ``dist_reduce``, and the result is passed to ``bcast``.

    Args:
        arr: local string array chunk.
        slice_index: slice object; only its ``stop`` attribute is read.
        start: compared against the slice stop to decide how many local
            elements take part (presumably the global index of this chunk's
            first element -- confirm against the caller).
        count: upper bound on the number of local elements taken
            (presumably the local chunk length -- confirm against the caller).

    Returns:
        The array passed to ``bcast``: the gathered array on rank 0, a
        pre-allocated array of ``k`` items on every other rank.
    """
    rank = sdc.distributed_api.get_rank()
    k = slice_index.stop
    # get total characters for allocation
    # NOTE(review): this initial value is overwritten below before it is read;
    # presumably kept to pin the variable's type under numba -- confirm.
    n_chars = np.uint64(0)
    if k > start:
        # if slice end is beyond the start of this subset we have to send our elements
        my_end = min(count, k - start)
        my_arr = arr[:my_end]
    else:
        # nothing from this chunk falls inside the slice: contribute an empty array
        my_arr = arr[:0]

    # get the total number of chars in our array, then gather all arrays into one
    # and compute total number of chars in all arrays
    n_chars = num_total_chars(my_arr)
    my_arr = sdc.distributed_api.gatherv(my_arr)
    # reduce_op is not defined in this function; it comes from an enclosing scope
    n_chars = sdc.distributed_api.dist_reduce(n_chars, np.int32(reduce_op))

    if rank != 0:
        # ranks other than 0 allocate a buffer of k items and n_chars characters
        out_arr = pre_alloc_string_array(k, n_chars)
    else:
        out_arr = my_arr

    # actual communication
    sdc.distributed_api.bcast(out_arr)
    return out_arr
def prealloc_impl(arr):
    """Return ``arr`` on MPI_ROOT and a freshly pre-allocated array elsewhere.

    The length and the ``num_total_chars`` value of ``arr`` are each passed
    through ``bcast_scalar``; ranks other than MPI_ROOT use the resulting
    values to size a new string array.
    """
    my_rank = sdc.distributed_api.get_rank()
    # Both scalars go through bcast_scalar on every rank, in this order.
    n_items = bcast_scalar(len(arr))
    n_chars = bcast_scalar(np.int64(num_total_chars(arr)))

    if my_rank == MPI_ROOT:
        return arr
    return pre_alloc_string_array(n_items, n_chars)
def _append_list_string_array_impl(A, B):
    """Concatenate string array A with every string array in B.

    A new string array is pre-allocated for the summed lengths and summed
    ``num_total_chars`` values of all inputs, then filled with A followed by
    the arrays of B in iteration order.

    Returns:
        The newly allocated string array.
    """
    all_arrays = [A] + list(B)
    item_counts = numpy.array([len(chunk) for chunk in all_arrays])
    char_counts = numpy.array([num_total_chars(chunk) for chunk in all_arrays])
    combined = sdc.str_arr_ext.pre_alloc_string_array(
        item_counts.sum(), char_counts.sum())

    # Each append reports how far it advanced; the next array starts there.
    write_at = 0
    write_at += append_string_array_to(combined, write_at, A)
    for chunk in B:
        write_at += append_string_array_to(combined, write_at, chunk)

    return combined
def gatherv_str_arr_impl(data):
    """Gather the local string array ``data`` into one array via ``c_gatherv``.

    Per-string lengths and raw character data are sent in two separate
    ``c_gatherv`` calls; the received lengths are then converted in place
    with ``convert_len_arr_to_offset``.

    Returns:
        On MPI_ROOT, a string array pre-allocated for the summed item and
        character counts and filled by the two ``c_gatherv`` calls. On other
        ranks, the dummy ``StringArray([''])``.
    """
    rank = sdc.distributed_api.get_rank()
    n_loc = len(data)
    n_all_chars = num_total_chars(data)

    # allocate send lens arrays
    send_arr_lens = np.empty(n_loc, np.uint32)  # XXX offset type is uint32
    send_data_ptr = get_data_ptr(data)

    # lengths (not offsets) are sent; they are turned into offsets after the gather
    for i in range(n_loc):
        _str = data[i]
        send_arr_lens[i] = len(_str)

    # per-rank item counts and character counts
    recv_counts = gather_scalar(np.int32(n_loc))
    recv_counts_char = gather_scalar(np.int32(n_all_chars))
    n_total = recv_counts.sum()
    n_total_char = recv_counts_char.sum()

    # displacements
    all_data = StringArray([''])  # dummy arrays on non-root PEs
    displs = np.empty(0, np.int32)
    displs_char = np.empty(0, np.int32)

    if rank == MPI_ROOT:
        # only the root allocates the real receive buffer and displacements
        all_data = pre_alloc_string_array(n_total, n_total_char)
        displs = sdc.hiframes.join.calc_disp(recv_counts)
        displs_char = sdc.hiframes.join.calc_disp(recv_counts_char)

    # NOTE(review): the calls below are placed outside the root-only branch,
    # i.e. executed by every rank -- presumably required because c_gatherv
    # is a collective operation; confirm.
    offset_ptr = get_offset_ptr(all_data)
    data_ptr = get_data_ptr(all_data)
    # first gather: string lengths, written into the offset buffer of all_data
    c_gatherv(
        send_arr_lens.ctypes,
        np.int32(n_loc),
        offset_ptr,
        recv_counts.ctypes,
        displs.ctypes,
        int32_typ_enum)
    # second gather: raw characters, written into the data buffer of all_data
    c_gatherv(
        send_data_ptr,
        np.int32(n_all_chars),
        data_ptr,
        recv_counts_char.ctypes,
        displs_char.ctypes,
        char_typ_enum)
    # the offset buffer currently holds lengths; convert them in place
    convert_len_arr_to_offset(offset_ptr, n_total)
    return all_data
def ensure_capacity_str(arr, new_size, n_chars):
    """Return a string array with room for ``new_size`` items and ``n_chars`` more characters.

    If ``arr`` already has enough item slots and character space it is
    returned unchanged. Otherwise a larger array is pre-allocated, the first
    ``new_size - 1`` entries are copied with ``copy_str_arr_slice``, and the
    new array is returned.

    Args:
        arr: string array being written to.
        new_size: required number of item slots (right after the write index).
        n_chars: number of characters about to be written at the write index.

    Returns:
        ``arr`` itself when it is large enough, otherwise the new array.
    """
    # new_size is right after write index
    new_arr = arr
    curr_len = len(arr)
    curr_num_chars = num_total_chars(arr)
    needed_total_chars = getitem_str_offset(arr, new_size - 1) + n_chars

    # TODO: corner case test
    if curr_len < new_size or needed_total_chars > curr_num_chars:
        # Double the item capacity, but never go below new_size: doubling
        # alone is insufficient when curr_len is 0 or new_size > 2 * curr_len.
        new_len = int(
            max(2 * curr_len, new_size) if curr_len < new_size else curr_len)
        # Kept as a single int(...) expression so both alternatives are
        # converted to the same integer type.
        new_num_chars = int(
            2 * curr_num_chars + n_chars
            if needed_total_chars > curr_num_chars else curr_num_chars)
        new_arr = pre_alloc_string_array(new_len, new_num_chars)
        copy_str_arr_slice(new_arr, arr, new_size - 1)

    return new_arr
def bcast_str_impl(data):
    """Broadcast the contents of string array ``data`` using ``c_bcast``.

    MPI_ROOT sends the per-string lengths and then the raw characters. Every
    other rank receives the lengths into its offset buffer, receives the
    characters into its data buffer, and finally converts the received
    lengths with ``convert_len_arr_to_offset``. Nothing is returned; ``data``
    is filled through its raw pointers.
    """
    my_rank = sdc.distributed_api.get_rank()
    num_items = len(data)
    num_chars = num_total_chars(data)
    # Counts are passed to c_bcast as np.int32, so they must stay below INT_MAX.
    assert num_items < INT_MAX
    assert num_chars < INT_MAX

    offsets = get_offset_ptr(data)
    chars = get_data_ptr(data)
    is_root = my_rank == MPI_ROOT

    if is_root:
        # The root sends lengths rather than its own offset buffer.
        lengths = np.empty(num_items, np.uint32)  # XXX offset type is uint32
        for idx in range(num_items):
            lengths[idx] = len(data[idx])
        c_bcast(lengths.ctypes, np.int32(num_items), int32_typ_enum)
    else:
        c_bcast(offsets, np.int32(num_items), int32_typ_enum)

    c_bcast(chars, np.int32(num_chars), char_typ_enum)

    if not is_root:
        # The offset buffer holds lengths at this point; convert in place.
        convert_len_arr_to_offset(offsets, num_items)
def empty_like_type_str_arr(n, arr):
    """Pre-allocate a string array of ``n`` items sized from ``arr``'s average.

    The character budget is ``n`` times the floor-divided average
    ``num_total_chars(arr) // len(arr)``. When ``arr`` is empty, a fixed
    guess of 20 characters per item is used instead.
    """
    # Fallback guess used when arr is empty and no average can be computed.
    chars_per_item = 20
    arr_len = len(arr)
    if arr_len != 0:
        chars_per_item = num_total_chars(arr) // arr_len

    return pre_alloc_string_array(n, n * chars_per_item)