def group_by_pk_hash_bucket(
    table: pa.Table, num_buckets: int, primary_keys: List[str]
) -> np.ndarray:
    # generate the primary key digest column
    all_pk_column_fields = []
    for pk_name in primary_keys:
        # casting a primary key column to numpy also ensures no nulls exist
        column_fields = table[pk_name].to_numpy()
        all_pk_column_fields.append(column_fields)
    hash_column_generator = hash_pk_bytes_generator(all_pk_column_fields)
    table = sc.append_pk_hash_column(table, hash_column_generator)

    # drop primary key columns to free up memory
    table = table.drop(primary_keys)

    # group hash bucket record indices
    hash_bucket_to_indices = np.empty([num_buckets], dtype="object")
    for record_index, digest in enumerate(sc.pk_hash_column_np(table)):
        hash_bucket = pk_digest_to_hash_bucket_index(digest, num_buckets)
        if hash_bucket_to_indices[hash_bucket] is None:
            hash_bucket_to_indices[hash_bucket] = []
        hash_bucket_to_indices[hash_bucket].append(record_index)

    # generate the ordered record number column
    hash_bucket_to_table = np.empty([num_buckets], dtype="object")
    for hash_bucket, indices in enumerate(hash_bucket_to_indices):
        if indices:
            hash_bucket_to_table[hash_bucket] = sc.append_record_idx_col(
                table.take(indices),
                indices,
            )
    return hash_bucket_to_table
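
# The helpers above (sc.*, hash_pk_bytes_generator, pk_digest_to_hash_bucket_index) are
# module-internal. As a minimal self-contained sketch of the same pattern -- bucket rows by a
# digest of their primary key, then materialize each bucket with table.take() -- the following
# uses only hashlib, numpy, and pyarrow. toy_group_by_pk_hash_bucket and the sha1-over-repr
# digest are illustrative assumptions, not the library's implementation.
import hashlib
from typing import List

import numpy as np
import pyarrow as pa


def toy_group_by_pk_hash_bucket(
    table: pa.Table, num_buckets: int, primary_keys: List[str]
) -> np.ndarray:
    # derive a bucket index for each row from a digest of its primary key values
    pk_columns = [table[pk].to_numpy() for pk in primary_keys]
    bucket_to_indices = np.empty([num_buckets], dtype="object")
    for row_idx, pk in enumerate(zip(*pk_columns)):
        digest = hashlib.sha1(repr(pk).encode()).digest()
        bucket = int.from_bytes(digest, "big") % num_buckets
        if bucket_to_indices[bucket] is None:
            bucket_to_indices[bucket] = []
        bucket_to_indices[bucket].append(row_idx)
    # materialize one sub-table per non-empty bucket via table.take
    bucket_to_table = np.empty([num_buckets], dtype="object")
    for bucket, indices in enumerate(bucket_to_indices):
        if indices:
            bucket_to_table[bucket] = table.take(indices)
    return bucket_to_table


# usage: rows with the same key hash always land in the same bucket
t = pa.table({"id": [1, 2, 3, 4], "v": ["a", "b", "c", "d"]})
for i, sub in enumerate(toy_group_by_pk_hash_bucket(t, 2, ["id"])):
    if sub is not None:
        print(f"bucket {i}: {sub.num_rows} rows")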
def drop_duplicates_by_primary_key_hash(table: pa.Table) -> pa.Table:
    # TODO: drop all primary key occurrences for DELETE delta types
    value_to_last_row_idx = {}
    row_idx = 0
    for chunk in sc.pk_hash_column(table).iterchunks():
        for val in chunk.to_numpy(zero_copy_only=False):
            value_to_last_row_idx[val] = row_idx
            row_idx += 1
    return table.take(list(value_to_last_row_idx.values()))
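
# The dedupe above keeps only the last row seen per primary-key hash (last-writer-wins):
# a dict maps each value to the index of its latest occurrence, and table.take() pulls
# exactly those rows. A minimal self-contained sketch of that pattern on a plain column;
# the column name "pk_hash" is an assumption for illustration.
import pyarrow as pa


def toy_drop_duplicates_last_wins(table: pa.Table, key: str = "pk_hash") -> pa.Table:
    # map each key value to the row index of its last occurrence, then take those rows
    value_to_last_row_idx = {}
    for row_idx, val in enumerate(table[key].to_pylist()):
        value_to_last_row_idx[val] = row_idx
    return table.take(list(value_to_last_row_idx.values()))


# usage: row 2 ("a", 3) wins over row 0 ("a", 1)
t = pa.table({"pk_hash": ["a", "b", "a"], "v": [1, 2, 3]})
print(toy_drop_duplicates_last_wins(t).to_pydict())  # {'pk_hash': ['a', 'b'], 'v': [3, 2]}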
def fill_gaps(
    arrow_df: pa.Table,
    gaps: List[Tuple[float, float]],
    sample_interval_micros: float,
    copy: bool = False,
) -> Tuple[pa.Table, List[Tuple[float, float]]]:
    """
    fills gaps in the table by synthesizing timestamps at the known sample interval;
    new data points are either np.nan or copies of the gap's edge values

    :param arrow_df: pyarrow table with data; first column is "timestamps"
    :param gaps: list of tuples of known non-inclusive start and end timestamps of the gaps
    :param sample_interval_micros: known sample interval of the data points
    :param copy: if True, copy the edge data points into the gap, otherwise fill with np.nan,
                 default False
    :return: table without gaps and the list of gaps
    """
    # extract the necessary information to compute gap size and gap timestamps
    data_time_stamps = arrow_df["timestamps"].to_numpy()
    if len(data_time_stamps) > 1:
        data_duration = data_time_stamps[-1] - data_time_stamps[0]
        expected_samples = (
            np.floor(data_duration / sample_interval_micros)
            + (
                1
                if data_duration % sample_interval_micros
                >= sample_interval_micros * DEFAULT_GAP_UPPER_LIMIT
                else 0
            )
        ) + 1
        if expected_samples > len(data_time_stamps):
            pcm = DataPointCreationMode["COPY"] if copy else DataPointCreationMode["NAN"]
            # make it safe to alter the gap values
            my_gaps = check_gap_list(gaps, data_time_stamps[0], data_time_stamps[-1])
            for gap in my_gaps:
                # if timestamps exist around a gap, snap the gap edges to those timestamps
                before_start = np.argwhere([t <= gap[0] for t in data_time_stamps])
                after_end = np.argwhere([t >= gap[1] for t in data_time_stamps])
                if len(before_start) > 0:
                    before_start = before_start[-1][0]
                    gap = (data_time_stamps[before_start], gap[1])
                else:
                    before_start = None
                if len(after_end) > 0:
                    after_end = after_end[0][0]
                    gap = (gap[0], data_time_stamps[after_end])
                else:
                    after_end = None
                num_new_points = int((gap[1] - gap[0]) / sample_interval_micros) - 1
                if before_start is not None:
                    # fill forward from the last point before the gap
                    arrow_df = add_data_points_to_df(
                        arrow_df, before_start, sample_interval_micros, num_new_points, pcm
                    )
                elif after_end is not None:
                    # fill backward from the first point after the gap
                    arrow_df = add_data_points_to_df(
                        arrow_df, after_end, -sample_interval_micros, num_new_points, pcm
                    )
            indic = pc.sort_indices(arrow_df, sort_keys=[("timestamps", "ascending")])
            return arrow_df.take(indic), gaps
    return arrow_df, gaps
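
# fill_gaps depends on redvox-internal helpers (check_gap_list, add_data_points_to_df,
# DataPointCreationMode, DEFAULT_GAP_UPPER_LIMIT). A minimal self-contained sketch of the
# core idea -- synthesize the missing timestamps strictly inside a (non-inclusive) gap,
# append NaN rows, then restore order with sort_indices + take -- assuming all
# non-timestamp columns are float. toy_fill_gap_with_nan is illustrative, not the library API.
from typing import Tuple

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc


def toy_fill_gap_with_nan(
    table: pa.Table, gap: Tuple[float, float], sample_interval_micros: float
) -> pa.Table:
    # number of sample points missing strictly between the gap's edges
    num_new = int((gap[1] - gap[0]) / sample_interval_micros) - 1
    new_ts = gap[0] + sample_interval_micros * np.arange(1, num_new + 1)
    # NaN rows for every non-timestamp column, keeping the original column order
    filler = pa.table({
        name: new_ts if name == "timestamps" else np.full(num_new, np.nan)
        for name in table.column_names
    })
    combined = pa.concat_tables([table, filler])
    order = pc.sort_indices(combined, sort_keys=[("timestamps", "ascending")])
    return combined.take(order)


# usage: three samples are missing between 1e6 and 5e6 microseconds
t = pa.table({"timestamps": [0.0, 1e6, 5e6], "value": [1.0, 2.0, 3.0]})
print(toy_fill_gap_with_nan(t, (1e6, 5e6), 1e6)["timestamps"].to_pylist())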
def _sort_table_on_real_then_date(table: pa.Table) -> pa.Table:
    indices = pc.sort_indices(
        table, sort_keys=[("REAL", "ascending"), ("DATE", "ascending")]
    )
    sorted_table = table.take(indices)
    return sorted_table
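
# usage sketch with fabricated values; assumes _sort_table_on_real_then_date above is in
# scope. Rows are ordered by REAL first, then DATE within each REAL.
import pyarrow as pa

t = pa.table({
    "REAL": [2, 1, 1],
    "DATE": ["2020-03-01", "2020-02-01", "2020-01-01"],
    "value": [30.0, 20.0, 10.0],
})
print(_sort_table_on_real_then_date(t).to_pydict()["value"])  # [10.0, 20.0, 30.0]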
def make_sorted_groups(sorting_table: pa.Table, input_table: pa.Table) -> SortedGroups:
    if not sorting_table.num_columns:
        # Exactly one output group, even for empty-table input
        return SortedGroups(
            sorted_groups=pa.table({"A": [None]}).select([]),  # 1-row, 0-col table
            sorted_input_table=input_table,  # everything is one group (maybe 0-row)
            group_splits=np.array([], dtype=np.int64),
        )

    # pyarrow 3.0.0 can't sort dictionary columns.
    # TODO make sort-dictionary work; nix this conversion
    sorting_table_without_dictionary = pa.table(
        [
            column.cast(pa.utf8()) if pa.types.is_dictionary(column.type) else column
            for column in sorting_table.columns
        ],
        schema=pa.schema(
            [
                pa.field(field.name, pa.utf8())
                if pa.types.is_dictionary(field.type)
                else field
                for field in [
                    sorting_table.schema.field(i)
                    for i in range(len(sorting_table.schema.names))
                ]
            ]
        ),
    )
    indices = pa.compute.sort_indices(
        sorting_table_without_dictionary,
        sort_keys=[
            (c, "ascending") for c in sorting_table_without_dictionary.column_names
        ],
    )

    sorted_groups_with_dups_and_nulls = sorting_table.take(indices)

    # Behavior we ought to DEPRECATE: to mimic Pandas, we drop all groups that
    # contain NULL. This is mathematically sound for Pandas' "NA" (because if
    # all these unknown things are the same thing, doesn't that mean we know
    # something about them? -- reductio ad absurdum, QED). But Workbench's NULL
    # is a bit closer to SQL NULL, which means "whatever you say, pal".
    #
    # This null-dropping is for backwards compat. TODO make it optional ... and
    # eventually nix the option and always output NULL groups.
    nonnull_indices = indices.filter(
        find_nonnull_table_mask(sorted_groups_with_dups_and_nulls)
    )

    if input_table.num_columns:
        sorted_input_table = input_table.take(nonnull_indices)
    else:
        # Don't .take() on a zero-column Arrow table: its .num_rows would change
        #
        # All rows are identical, so .slice() gives the table we want
        sorted_input_table = input_table.slice(0, len(nonnull_indices))

    sorted_groups_with_dups = sorting_table.take(nonnull_indices)

    # "is_dup": find each row in sorted_groups_with_dups that is _equal_ to
    # the row before it. (The first value compares the first and second row.)
    #
    # We start assuming all are equal; then we search for inequality
    if len(sorted_groups_with_dups):
        is_dup = pa.array(np.ones(len(sorted_groups_with_dups) - 1), pa.bool_())
        for column in sorted_groups_with_dups.itercolumns():
            chunk = column.chunks[0]
            if pa.types.is_dictionary(chunk.type):
                chunk = chunk.indices
            first = chunk.slice(0, len(column) - 1)
            second = chunk.slice(1)
            # TODO when we support NULL groups:
            # both_null = pa.compute.and_(first.is_null(), second.is_null())
            # both_equal_if_not_null = pa.compute.equal(first, second)
            # both_equal = pa.compute.fill_null(both_equal_if_not_null, False)
            # value_is_dup = pa.compute.or_(both_null, both_equal)
            # ... and for now, it's simply:
            value_is_dup = pa.compute.equal(first, second)
            is_dup = pa.compute.and_(is_dup, value_is_dup)

        group_splits = np.where(~(is_dup.to_numpy(zero_copy_only=False)))[0] + 1

        sorted_groups = reencode_dictionaries(
            sorted_groups_with_dups.take(np.insert(group_splits, 0, 0))
        )
    else:
        sorted_groups = sorted_groups_with_dups
        group_splits = np.array([], dtype=np.int64)

    return SortedGroups(
        sorted_groups=sorted_groups,
        sorted_input_table=sorted_input_table,
        group_splits=group_splits,
    )
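
# The heart of make_sorted_groups is the shifted self-comparison that finds group
# boundaries: slice the sorted keys against themselves, mark rows equal to their
# predecessor, and split where equality breaks. A self-contained sketch of just that
# step on a single pre-sorted column:
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

sorted_keys = pa.array(["a", "a", "b", "b", "b", "c"])
first = sorted_keys.slice(0, len(sorted_keys) - 1)  # rows 0..n-2
second = sorted_keys.slice(1)                       # rows 1..n-1
is_dup = pc.equal(first, second)                    # True where a row equals its predecessor
group_splits = np.where(~is_dup.to_numpy(zero_copy_only=False))[0] + 1
print(group_splits)  # [2 5] -> groups are rows [0:2], [2:5], [5:6]
# first row of each group, via the same take(np.insert(...)) trick as above
print(sorted_keys.take(np.insert(group_splits, 0, 0)))  # ["a", "b", "c"]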