def _slice_generic(offsets, data, valid_bits, valid_offset, start: int, end: int, step: int) -> StringArrayBuilder:
    """
    Apply ``[start:end:step]`` to every string of a string buffer.

    Positions are counted in UTF-8 characters, not bytes. This is the general
    code path: negative ``start``/``end`` (counted from the end of each
    string) and negative ``step`` (reverse traversal) are supported.

    Parameters
    ----------
    offsets
        Row boundaries; row ``i`` occupies ``data[offsets[i]:offsets[i + 1]]``,
        so there are ``len(offsets) - 1`` rows.
    data
        Byte buffer holding the concatenated row contents.
    valid_bits
        Validity bitmap. An empty bitmap means that every row is valid;
        otherwise row ``i`` is null when bit ``i + valid_offset`` is unset
        (bits are numbered from the least significant bit of each byte).
    valid_offset
        Bit position of row 0 inside ``valid_bits``.
    start
        Index of the first character to take.
    end
        Exclusive stop index, or ``None`` for "no bound in the direction of
        travel".
    step
        Stride between taken characters; must not be zero.

    Returns
    -------
    StringArrayBuilder
        Builder holding one entry (sliced value or null) per input row.

    Raises
    ------
    ValueError
        If ``step`` is zero.
    """
    # A zero step would never advance ``char_idx`` in the loops below, so they
    # would not terminate; reject it like Python's own slicing does.
    if step == 0:
        raise ValueError("slice step cannot be zero")

    builder = StringArrayBuilder(len(offsets) - 1)

    for i in prange(len(offsets) - 1):
        if len(valid_bits) > 0:
            byte_offset = (i + valid_offset) // 8
            bit_offset = (i + valid_offset) % 8
            mask = np.uint8(1 << bit_offset)
            valid = valid_bits[byte_offset] & mask
            if not valid:
                builder.append_null()
                continue

        # Split the row into one byte slice per character so that characters
        # can be addressed by index in either direction. ``get_utf8_size`` is
        # expected to give the byte length of the character that starts with
        # the given byte.
        str_len_bytes = offsets[i + 1] - offsets[i]
        char_bytes: List[bytes] = []
        byte_idx = 0
        while byte_idx < str_len_bytes:
            char_size = get_utf8_size(data[offsets[i] + byte_idx])
            char_bytes.append(data[offsets[i] + byte_idx:offsets[i] + byte_idx + char_size])
            byte_idx += char_size
        n_chars = len(char_bytes)

        include_bytes: List[bytes] = []  # type: ignore

        # Translate in-range negative indices into positions from the front.
        # Out-of-range values are left alone and handled by the clamping and
        # loop conditions below.
        char_idx = start
        if start >= -n_chars and start < 0:
            char_idx += n_chars

        # ``end`` may be None (open-ended); only normalise real integers so
        # that the comparison does not fail on None.
        true_end = end
        if end is not None:
            if end >= -n_chars and end < 0:
                true_end = end + n_chars

        # Positive step
        if step > 0:
            if char_idx < 0:
                char_idx = 0
            while (end is None or char_idx < true_end) and char_idx < n_chars:
                include_bytes.extend(char_bytes[char_idx])  # type: ignore
                char_idx += step

        # Negative step
        else:
            # Start from the last character when ``start`` lies past the end.
            # From here on ``char_idx`` only decreases, so it never exceeds
            # ``n_chars - 1`` again.
            if char_idx >= n_chars:
                char_idx = n_chars - 1
            while (end is None or char_idx > true_end) and char_idx >= 0:
                include_bytes.extend(char_bytes[char_idx])  # type: ignore
                char_idx += step

        builder.append_value(include_bytes, len(include_bytes))
    return builder
def _slice_pos_inputs_step(offsets, data, valid_bits, valid_offset, start: int, end: int, step: int) -> StringArrayBuilder:
    """
    Slice every row as ``[start:end:step]``, counted in characters.

    Specialised path with restricted inputs:

    * ``start >= 0``
    * ``end >= 0`` (``None`` is treated as "no upper bound")
    * ``step > 1``

    The row is scanned once from the front; no per-character list is built.
    """
    row_count = len(offsets) - 1
    result = StringArrayBuilder(row_count)

    for row in prange(row_count):
        # An empty bitmap means "no nulls"; otherwise test this row's bit.
        if len(valid_bits) > 0:
            bit_pos = row + valid_offset
            is_set = valid_bits[bit_pos // 8] & np.uint8(1 << (bit_pos % 8))
            if not is_set:
                result.append_null()
                continue

        cursor = offsets[row]       # absolute byte position inside ``data``
        row_end = offsets[row + 1]  # first byte after this row

        # Walk past the first ``start`` characters (or to the end of the row).
        chars_seen = 0
        while chars_seen < start and cursor < row_end:
            cursor += get_utf8_size(data[cursor])
            chars_seen += 1

        picked: List[bytes] = []
        # Number of characters still to pass over before the next one is
        # taken; zero means "take the current character".
        countdown = 0
        while (end is None or chars_seen < end) and cursor < row_end:
            width = get_utf8_size(data[cursor])
            if countdown == 0:
                picked.extend(data[cursor:cursor + width])
                countdown = step
            countdown -= 1
            cursor += width
            chars_seen += 1

        result.append_value(picked, len(picked))

    return result
def _slice_pos_inputs_nostep(offsets, data, valid_bits, valid_offset, start: int, end: int) -> StringArrayBuilder:
    """
    Slice every row as ``[start:end]``, counted in characters.

    Specialised path with restricted inputs:

    * ``start >= 0``
    * ``end >= 0`` (``None`` is treated as "no upper bound")
    * ``step == 1``

    With a step of one the selected characters are contiguous, so only the
    two byte boundaries are located and the bytes in between are handed to
    the builder as a single slice of ``data``.
    """
    row_count = len(offsets) - 1
    result = StringArrayBuilder(row_count)

    for row in prange(row_count):
        # An empty bitmap means "no nulls"; otherwise test this row's bit.
        if len(valid_bits) > 0:
            bit_pos = row + valid_offset
            is_set = valid_bits[bit_pos // 8] & np.uint8(1 << (bit_pos % 8))
            if not is_set:
                result.append_null()
                continue

        cursor = offsets[row]       # absolute byte position inside ``data``
        row_end = offsets[row + 1]  # first byte after this row

        # Locate the byte at which character number ``start`` begins.
        chars_seen = 0
        while chars_seen < start and cursor < row_end:
            cursor += get_utf8_size(data[cursor])
            chars_seen += 1
        slice_begin = cursor

        # Keep going until ``end`` characters were seen or the row is used up.
        while (end is None or chars_seen < end) and cursor < row_end:
            cursor += get_utf8_size(data[cursor])
            chars_seen += 1

        result.append_value(data[slice_begin:cursor], cursor - slice_begin)

    return result