def test_sort_indices_array():
    arr = pa.array([1, 2, None, 0])
    result = pc.sort_indices(arr)
    assert result.to_pylist() == [3, 0, 1, 2]
    result = pc.sort_indices(arr, sort_keys=[("dummy", "ascending")])
    assert result.to_pylist() == [3, 0, 1, 2]
    result = pc.sort_indices(arr, sort_keys=[("dummy", "descending")])
    assert result.to_pylist() == [1, 0, 3, 2]
    result = pc.sort_indices(
        arr, options=pc.SortOptions(sort_keys=[("dummy", "descending")]))
    assert result.to_pylist() == [1, 0, 3, 2]
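# A minimal companion sketch (not part of the test above): applying the
# indices from pc.sort_indices with take() yields the sorted values, and
# nulls sort to the end by default.
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 2, None, 0])
assert arr.take(pc.sort_indices(arr)).to_pylist() == [0, 1, 2, None]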
def sort_and_partition(self, boundaries: List[T], key: SortKeyT,
                       descending: bool) -> List["Block[T]"]:
    if len(key) > 1:
        raise NotImplementedError(
            "sorting by multiple columns is not supported yet")
    import pyarrow.compute as pac
    indices = pac.sort_indices(self._table, sort_keys=key)
    table = self._table.take(indices)
    if len(boundaries) == 0:
        return [table]
    # For each boundary value, count the number of items that are less
    # than it. Since the block is sorted, these counts partition the items
    # such that boundaries[i] <= x < boundaries[i + 1] for each x in
    # partition[i]. If `descending` is true, `boundaries` would also be
    # in descending order and we only need to count the number of items
    # *greater than* the boundary value instead.
    col, _ = key[0]
    comp_fn = pac.greater if descending else pac.less
    boundary_indices = [
        pac.sum(comp_fn(table[col], b)).as_py() for b in boundaries
    ]
    ret = []
    prev_i = 0
    for i in boundary_indices:
        ret.append(table.slice(prev_i, i - prev_i))
        prev_i = i
    ret.append(table.slice(prev_i))
    return ret
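# A standalone worked example of the boundary-counting step above (the
# data here is illustrative, not from the original block API):
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"a": [5, 1, 3, 2, 4]})
table = table.take(pc.sort_indices(table, sort_keys=[("a", "ascending")]))
boundaries = [2, 4]
cuts = [pc.sum(pc.less(table["a"], b)).as_py() for b in boundaries]
assert cuts == [1, 3]  # partitions: [1], [2, 3], [4, 5]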
def create_timestamps(self) -> pa.Table:
    """
    :return: the audio metadata converted into a pyarrow Table, sorted by timestamp
    """
    result_array = [[], [], []]
    for m in self.metadata:
        timestamps = calc_evenly_sampled_timestamps(
            m[0], m[1].num_rows, self.sample_interval_micros)
        result_array[0].extend(timestamps)
        result_array[1].extend(timestamps)
        result_array[2].extend(m[1]["microphone"].to_numpy())
    for gs, ge in self.gaps:
        num_samples = int((ge - gs) / self.sample_interval_micros) - 1
        timestamps = calc_evenly_sampled_timestamps(
            gs + self.sample_interval_micros, num_samples,
            self.sample_interval_micros)
        gap_array = [timestamps, np.full(len(timestamps), np.nan)]
        result_array[0].extend(gap_array[0])
        result_array[1].extend(gap_array[0])
        result_array[2].extend(gap_array[1])
    ptable = pa.Table.from_pydict(dict(zip(AUDIO_DF_COLUMNS, result_array)))
    return pc.take(
        ptable,
        pc.sort_indices(ptable, sort_keys=[("timestamps", "ascending")]))
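# calc_evenly_sampled_timestamps is not shown here; it is assumed to
# behave roughly like this hypothetical sketch (start timestamp plus
# integer multiples of the sample interval):
import numpy as np

def calc_evenly_sampled_timestamps(start: float, count: int,
                                   interval_micros: float) -> np.ndarray:
    # illustrative reimplementation only, not the original helper
    return start + np.arange(count) * interval_micros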
def test_roundtrip_multi_partitioned(tmp_path: pathlib.Path,
                                     sample_data: pa.Table):
    write_deltalake(str(tmp_path), sample_data,
                    partition_by=["int32", "bool"])

    delta_table = DeltaTable(str(tmp_path))
    assert delta_table.pyarrow_schema() == sample_data.schema

    table = delta_table.to_pyarrow_table()
    table = table.take(pc.sort_indices(table["int64"]))
    assert table == sample_data
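# The take(sort_indices(...)) line above canonicalizes row order before
# comparison, since a partitioned read can return rows in any order. A
# minimal illustration of the pattern:
import pyarrow as pa
import pyarrow.compute as pc

t = pa.table({"int64": [2, 0, 1], "bool": [True, False, True]})
t = t.take(pc.sort_indices(t["int64"]))
assert t["int64"].to_pylist() == [0, 1, 2]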
def sort_and_partition(self, boundaries: List[T], key: SortKeyT,
                       descending: bool) -> List["Block[T]"]:
    if len(key) > 1:
        raise NotImplementedError(
            "sorting by multiple columns is not supported yet")

    if self._table.num_rows == 0:
        # If the pyarrow table is empty we may not have a schema, so
        # calling sort_indices() would raise an error.
        return [
            pyarrow.Table.from_pydict({})
            for _ in range(len(boundaries) + 1)
        ]

    import pyarrow.compute as pac
    indices = pac.sort_indices(self._table, sort_keys=key)
    table = self._table.take(indices)
    if len(boundaries) == 0:
        return [table]

    # For each boundary value, count the number of items that are less
    # than it. Since the block is sorted, these counts partition the items
    # such that boundaries[i] <= x < boundaries[i + 1] for each x in
    # partition[i]. If `descending` is true, `boundaries` would also be
    # in descending order and we only need to count the number of items
    # *greater than* the boundary value instead.
    col, _ = key[0]
    comp_fn = pac.greater if descending else pac.less

    # TODO(ekl): this is O(n^2), but in practice it is much faster than
    # the O(n) scan below; it could still be optimized.
    boundary_indices = [
        pac.sum(comp_fn(table[col], b)).as_py() for b in boundaries
    ]

    ### Compute the boundary indices in O(n) time via scan.  # noqa
    # boundary_indices = []
    # remaining = boundaries.copy()
    # values = table[col]
    # for i, x in enumerate(values):
    #     while remaining and not comp_fn(x, remaining[0]).as_py():
    #         remaining.pop(0)
    #         boundary_indices.append(i)
    # for _ in remaining:
    #     boundary_indices.append(len(values))

    ret = []
    prev_i = 0
    for i in boundary_indices:
        # Slices need to be copied to avoid including the base table
        # during serialization.
        ret.append(_copy_table(table.slice(prev_i, i - prev_i)))
        prev_i = i
    ret.append(_copy_table(table.slice(prev_i)))
    return ret
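# _copy_table is not shown here; the sketch below captures the assumed
# intent (force fresh buffers so a slice does not pin its parent table in
# memory when serialized). The from_pydict round-trip is illustrative,
# not the original implementation:
import pyarrow

def _copy_table(table: "pyarrow.Table") -> "pyarrow.Table":
    # to_pydict()/from_pydict() rebuilds every column into new buffers
    return pyarrow.Table.from_pydict(table.to_pydict(), schema=table.schema)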
def test_sort_indices_table():
    table = pa.table({"a": [1, 1, 0], "b": [1, 0, 1]})

    result = pc.sort_indices(table, sort_keys=[("a", "ascending")])
    assert result.to_pylist() == [2, 0, 1]

    result = pc.sort_indices(
        table, sort_keys=[("a", "ascending"), ("b", "ascending")])
    assert result.to_pylist() == [2, 1, 0]

    with pytest.raises(ValueError,
                       match="Must specify one or more sort keys"):
        pc.sort_indices(table)

    with pytest.raises(ValueError, match="Nonexistent sort key column"):
        pc.sort_indices(table, sort_keys=[("unknown", "ascending")])

    with pytest.raises(ValueError, match="not a valid order"):
        pc.sort_indices(table, sort_keys=[("a", "nonscending")])
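# The options= form exercised for arrays in test_sort_indices_array also
# works for tables; a self-contained check in the same spirit:
import pyarrow as pa
import pyarrow.compute as pc

t = pa.table({"a": [1, 1, 0], "b": [1, 0, 1]})
result = pc.sort_indices(
    t, options=pc.SortOptions(sort_keys=[("a", "ascending")]))
assert result.to_pylist() == [2, 0, 1]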
table = table.select(
    ["ix", "x", "y", "title", "first_author_name", "date", "language"])

# truncate the title after 101 characters (matching display logic)
truncated_title = pc.utf8_replace_slice(
    table.column("title"), start=101, stop=1000, replacement="")
table = table.set_column(
    table.schema.get_field_index("title"), "title", truncated_title)

# ensure all dictionaries in the file use the same key/value mappings
table = table.unify_dictionaries()

# filter out non-numeric dates (e.g. null, "1850-1853")
# matches the hack in index.js:37
mask = pc.invert(pc.is_null(table.column("date")))
table = table.filter(mask)

# sorting by the date improves the loading aesthetics
# comment this out to exactly match the original appearance
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = pc.take(table, indices)

# after sorting, replace ix with an accurate row index: sort_indices on
# the already-sorted table yields 0..n-1
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = table.set_column(
    table.schema.get_field_index("ix"), "ix", pc.cast(indices, pa.uint32()))

temp_path.unlink()
local = fs.LocalFileSystem()
with local.open_output_stream(str(target_path)) as file:
    with pa.RecordBatchStreamWriter(file, table.schema) as writer:
        writer.write_table(table, 10000)
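# The row-index trick above relies on the sort being stable: re-sorting an
# already-sorted table returns the identity permutation. An equivalent,
# more direct way to build the index column (illustrative alternative):
import numpy as np
import pyarrow as pa

row_index = pa.array(np.arange(table.num_rows, dtype=np.uint32))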
def fill_gaps(
        arrow_df: pa.Table,
        gaps: List[Tuple[float, float]],
        sample_interval_micros: float,
        copy: bool = False) -> Tuple[pa.Table, List[Tuple[float, float]]]:
    """
    fills gaps in the table with np.nan or interpolated values by
    interpolating timestamps based on the calculated sample interval

    :param arrow_df: pyarrow table with data.  first column is "timestamps"
    :param gaps: list of tuples of known non-inclusive start and end
                 timestamps of the gaps
    :param sample_interval_micros: known sample interval of the data points
    :param copy: if True, copy the data points, otherwise interpolate from
                 edges, default False
    :return: table without gaps and the list of gaps
    """
    # extract the necessary information to compute gap size and gap timestamps
    data_time_stamps = arrow_df["timestamps"].to_numpy()
    if len(data_time_stamps) > 1:
        data_duration = data_time_stamps[-1] - data_time_stamps[0]
        expected_samples = (
            np.floor(data_duration / sample_interval_micros)
            + (1 if data_duration % sample_interval_micros
               >= sample_interval_micros * DEFAULT_GAP_UPPER_LIMIT
               else 0)) + 1
        if expected_samples > len(data_time_stamps):
            if copy:
                pcm = DataPointCreationMode["COPY"]
            else:
                pcm = DataPointCreationMode["NAN"]
            # make it safe to alter the gap values
            my_gaps = check_gap_list(gaps, data_time_stamps[0],
                                     data_time_stamps[-1])
            for gap in my_gaps:
                # if timestamps are around gaps, we have to update the values
                before_start = np.argwhere(
                    [t <= gap[0] for t in data_time_stamps])
                after_end = np.argwhere(
                    [t >= gap[1] for t in data_time_stamps])
                if len(before_start) > 0:
                    before_start = before_start[-1][0]
                    # sim = gap[0] - data_time_stamps[before_start]
                    # result_df = add_data_points_to_df(
                    #     result_df, before_start, sim,
                    #     point_creation_mode=pcm)
                    gap = (data_time_stamps[before_start], gap[1])
                else:
                    before_start = None
                if len(after_end) > 0:
                    after_end = after_end[0][0]
                    # sim = gap[1] - data_time_stamps[after_end]
                    gap = (gap[0], data_time_stamps[after_end])
                else:
                    after_end = None
                num_new_points = int(
                    (gap[1] - gap[0]) / sample_interval_micros) - 1
                if before_start is not None:
                    arrow_df = add_data_points_to_df(
                        arrow_df, before_start, sample_interval_micros,
                        num_new_points, pcm)
                elif after_end is not None:
                    arrow_df = add_data_points_to_df(
                        arrow_df, after_end, -sample_interval_micros,
                        num_new_points, pcm)
            indic = pc.sort_indices(
                arrow_df, sort_keys=[("timestamps", "ascending")])
            return arrow_df.take(indic), gaps
    return arrow_df, gaps
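# A quick check of the interior-point arithmetic used above: gap bounds
# are non-inclusive, so only samples strictly inside the gap are created.
gap = (0.0, 1000.0)
sample_interval_micros = 100.0
num_new_points = int((gap[1] - gap[0]) / sample_interval_micros) - 1
assert num_new_points == 9  # timestamps 100.0, 200.0, ..., 900.0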
def _sort_table_on_real_then_date(table: pa.Table) -> pa.Table:
    indices = pc.sort_indices(
        table, sort_keys=[("REAL", "ascending"), ("DATE", "ascending")])
    sorted_table = table.take(indices)
    return sorted_table
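# Usage sketch with made-up data (column names follow the helper above):
import pyarrow as pa

t = pa.table({"REAL": [1, 0, 0], "DATE": ["b", "b", "a"]})
assert _sort_table_on_real_then_date(t)["DATE"].to_pylist() == ["a", "b", "b"]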
def dedupe(compaction_artifact_s3_bucket: str,
           compacted_partition_locator: Dict[str, Any],
           new_compacted_partition_locator: Dict[str, Any],
           object_ids: List[Any],
           sort_keys: List[Tuple[str, str]],
           max_records_per_index_file: int,
           max_records_per_materialized_file: int,
           num_materialize_buckets: int,
           dedupe_task_index: int,
           record_counts_pending_materialize,
           prev_compacted_delta_stream_position: Optional[int],
           pk_index_version: int):
    logger.info(f"Starting dedupe task...")
    # TODO: there is a risk of running out of memory here in cases of severe
    # skew of primary key updates
    object_refs = [cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]
    logger.info(f"Getting delta file envelope groups object refs...")
    delta_file_envelope_groups_list = ray.get(object_refs)
    hb_index_to_delta_file_envelopes_list = defaultdict(list)
    for delta_file_envelope_groups in delta_file_envelope_groups_list:
        for hb_idx in range(len(delta_file_envelope_groups)):
            dfes = delta_file_envelope_groups[hb_idx]
            if dfes is not None:
                hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
    src_file_id_to_row_indices = defaultdict(list)
    deduped_tables = []
    logger.info(f"Running {len(hb_index_to_delta_file_envelopes_list)} "
                f"dedupe rounds...")
    for hb_idx, dfe_list in hb_index_to_delta_file_envelopes_list.items():
        table = union_primary_key_indices(
            compaction_artifact_s3_bucket,
            compacted_partition_locator,
            prev_compacted_delta_stream_position,
            hb_idx,
            dfe_list,
            pk_index_version,
        )
        logger.info(f"Dedupe round input record count: {len(table)}")

        # sort by sort keys
        if len(sort_keys):
            # TODO: convert to O(N) dedupe w/ sort keys
            sort_keys.extend([
                (sc._PARTITION_STREAM_POSITION_COLUMN_NAME, "ascending"),
                (sc._ORDERED_FILE_IDX_COLUMN_NAME, "ascending"),
            ])
            table = table.take(pc.sort_indices(table, sort_keys=sort_keys))

        # drop duplicates by primary key hash column
        table = drop_duplicates_by_primary_key_hash(table)
        table = table.drop([sc._DELTA_TYPE_COLUMN_NAME])
        logger.info(f"Dedupe round output record count: {len(table)}")

        deduped_tables.append((hb_idx, table))

        stream_position_col = sc.stream_position_column_np(table)
        file_idx_col = sc.file_index_column_np(table)
        row_idx_col = sc.record_index_column_np(table)
        is_source_col = sc.is_source_column_np(table)
        for row_idx in range(len(table)):
            src_file_id = (
                is_source_col[row_idx],
                stream_position_col[row_idx],
                file_idx_col[row_idx],
            )
            # TODO(pdames): merge contiguous record number ranges
            src_file_id_to_row_indices[src_file_id].append(
                row_idx_col[row_idx])

    logger.info(f"Finished all dedupe rounds...")
    mat_bucket_to_src_file_record_count = defaultdict(dict)
    mat_bucket_to_src_file_records = defaultdict(dict)
    for src_file_id, src_row_indices in src_file_id_to_row_indices.items():
        mat_bucket = file_id_to_mat_bucket_index(
            src_file_id,
            num_materialize_buckets,
        )
        mat_bucket_to_src_file_records[mat_bucket][src_file_id] = np.array(
            src_row_indices,
        )
        mat_bucket_to_src_file_record_count[mat_bucket][src_file_id] = \
            len(src_row_indices)

    mat_bucket_to_dd_idx_obj_id = {}
    object_refs = []
    for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
        object_ref = ray.put(src_file_records)
        object_refs.append(object_ref)
        mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
            dedupe_task_index,
            cloudpickle.dumps(object_ref),
        )
    logger.info(f"Count of materialize buckets with object refs: "
                f"{len(mat_bucket_to_dd_idx_obj_id)}")

    record_counts_pending_materialize.add_record_counts(
        dedupe_task_index,
        mat_bucket_to_src_file_record_count,
    )
    # wait for all dedupe tasks to reach this point before continuing
    logger.info(
        f"Waiting for all dedupe tasks to finish writing record counts...")
    finalized = False
    while not finalized:
        finalized = ray.get(
            record_counts_pending_materialize.is_finalized.remote())
        time.sleep(0.25)
    logger.info(f"Writing destination primary key index...")
    dedupe_result = write_dest_primary_key_index(
        compaction_artifact_s3_bucket,
        new_compacted_partition_locator,
        max_records_per_index_file,
        max_records_per_materialized_file,
        num_materialize_buckets,
        dedupe_task_index,
        deduped_tables,
        record_counts_pending_materialize.get_record_counts(),
        1,
    )
    logger.info(f"Finished dedupe task...")
    return mat_bucket_to_dd_idx_obj_id, object_refs, dedupe_result
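# drop_duplicates_by_primary_key_hash is not shown; below is a hedged
# sketch of the assumed behavior (keep the last row per primary-key hash,
# relying on the sort above placing the newest record last within each
# key group). The helper and column names here are illustrative:
import numpy as np
import pyarrow as pa

def drop_duplicates_keep_last(table: pa.Table, pk_hash_col: str) -> pa.Table:
    hashes = table[pk_hash_col].to_numpy()
    # np.unique returns first-occurrence indices; scanning the reversed
    # array makes "first in reverse" equal to "last in original order"
    _, first_in_reverse = np.unique(hashes[::-1], return_index=True)
    keep = len(hashes) - 1 - first_in_reverse
    return table.take(np.sort(keep))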
def sort(table: "pyarrow.Table", key: "SortKeyT",
         descending: bool) -> "pyarrow.Table":
    import pyarrow.compute as pac
    indices = pac.sort_indices(table, sort_keys=key)
    return table.take(indices)
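# Usage sketch: note the sort direction travels inside `key`, so the
# `descending` flag is effectively unused by this helper.
import pyarrow

t = pyarrow.table({"a": [3, 1, 2]})
assert sort(t, [("a", "descending")], True)["a"].to_pylist() == [3, 2, 1]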
type(metadata)
print(parquet_file.schema)

group = metadata.row_group(0)
print(group)
dir(group)

vendor_col = group.column(0)
print(vendor_col)
tip_col = group.column(13)
print(tip_col)

pq.write_table(table, "202001_std.parquet", compression="ZSTD")

print(len(table["tip_amount"].unique()))

silly_table = pa.Table.from_arrays([
    table["VendorID"],
    table["VendorID"].take(pc.sort_indices(table["VendorID"]))
], ["unordered", "ordered"])
pq.write_table(silly_table, "silly.parquet")

silly = pq.ParquetFile("silly.parquet")
silly_group = silly.metadata.row_group(0)
print(silly_group.column(0))
print(silly_group.column(1))

silly_table["ordered"].unique()
tp["VendorID"].value_counts(dropna=False)

# reduce time from ms to s
# partitioned datasets
# row groups (maybe with a single dataset)
len(tp.fare_amount.unique())
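# Hedged follow-up to the unordered-vs-ordered experiment above: the
# row-group column metadata exposes compressed sizes, so the effect of
# sorting on the encoding can be read off directly.
print(silly_group.column(0).total_compressed_size)  # unordered
print(silly_group.column(1).total_compressed_size)  # ordered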
def materialize(
        source_partition_locator: Dict[str, Any],
        delta_staging_area: Dict[str, Any],
        mat_bucket_index: int,
        dedupe_task_idx_and_obj_id_tuples: List[Tuple[int, Any]],
        max_records_per_output_file: int,
        compacted_file_content_type: ContentType,
        deltacat_storage=unimplemented_deltacat_storage):
    logger.info(f"Starting materialize task...")
    dest_partition_locator = dsa.get_partition_locator(delta_staging_area)
    dedupe_task_idx_and_obj_ref_tuples = [
        (t[0], cloudpickle.loads(t[1]))
        for t in dedupe_task_idx_and_obj_id_tuples
    ]
    logger.info(f"Resolved materialize task obj refs...")
    dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
    # this depends on `ray.get` result order matching input order, as per the
    # contract established in: https://github.com/ray-project/ray/pull/16763
    src_file_records_list = ray.get(obj_refs)
    all_src_file_records = defaultdict(list)
    for i in range(len(src_file_records_list)):
        dedupe_task_idx = dedupe_task_indices[i]
        src_file_records = src_file_records_list[i]
        for src_file_id, record_numbers in src_file_records.items():
            all_src_file_records[src_file_id].append(
                (record_numbers, repeat(dedupe_task_idx, len(record_numbers)))
            )
    manifest_cache = {}
    compacted_tables = []
    for src_file_id in sorted(all_src_file_records.keys()):
        record_numbers_dd_task_idx_tpl_list = all_src_file_records[src_file_id]
        record_numbers_list, dedupe_task_idx_iterator = zip(
            *record_numbers_dd_task_idx_tpl_list
        )
        is_src_partition_file = src_file_id[0]
        src_file_position = src_file_id[1]
        src_file_idx = src_file_id[2]
        src_file_partition_locator = source_partition_locator \
            if is_src_partition_file \
            else dest_partition_locator
        delta_locator = dl.of(
            src_file_partition_locator,
            src_file_position,
        )
        dl_hexdigest = dl.hexdigest(delta_locator)
        manifest = manifest_cache.setdefault(
            dl_hexdigest,
            deltacat_storage.get_manifest(delta_locator),
        )
        pa_table = deltacat_storage.download_manifest_entry(
            delta_locator,
            manifest,
            src_file_idx,
        )
        mask_pylist = list(repeat(False, len(pa_table)))
        record_numbers = chain.from_iterable(record_numbers_list)
        for record_number in record_numbers:
            mask_pylist[record_number] = True
        mask = pa.array(mask_pylist)
        compacted_table = pa_table.filter(mask)

        # appending, sorting, taking, and dropping has 2-3X the latency of
        # a single filter on average, but it is still much cheaper than
        # repeatedly filtering the table in dedupe task index order
        dedupe_task_indices = chain.from_iterable(dedupe_task_idx_iterator)
        compacted_table = sc.append_dedupe_task_idx_col(
            compacted_table,
            dedupe_task_indices,
        )
        pa_sort_keys = [(sc._DEDUPE_TASK_IDX_COLUMN_NAME, "ascending")]
        compacted_table = compacted_table.take(
            pc.sort_indices(compacted_table, sort_keys=pa_sort_keys),
        )
        compacted_table = compacted_table.drop(
            [sc._DEDUPE_TASK_IDX_COLUMN_NAME]
        )
        compacted_tables.append(compacted_table)

    # TODO (pdames): save memory by writing parquet files eagerly whenever
    # len(compacted_table) >= max_records_per_output_file
    compacted_table = pa.concat_tables(compacted_tables)
    delta_manifest = deltacat_storage.stage_delta(
        compacted_table,
        delta_staging_area,
        max_records_per_entry=max_records_per_output_file,
        content_type=compacted_file_content_type,
    )
    manifest = dm.get_manifest(delta_manifest)
    manifest_records = rsmm.get_record_count(manifest)
    assert manifest_records == len(compacted_table), \
        f"Unexpected Error: Materialized delta manifest record count " \
        f"({manifest_records}) does not equal compacted table record count " \
        f"({len(compacted_table)})"
    return mr.of(
        delta_manifest,
        mat_bucket_index,
        len(rsm.get_entries(manifest)),
        compacted_table.nbytes,
        rsmm.get_content_length(manifest),
        len(compacted_table),
    )
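# Minimal standalone sketch of the mask-based row selection used in the
# materialize loop above, with made-up data:
import pyarrow as pa
from itertools import repeat

pa_table = pa.table({"v": list(range(6))})
record_numbers = [1, 4, 5]
mask_pylist = list(repeat(False, len(pa_table)))
for record_number in record_numbers:
    mask_pylist[record_number] = True
assert pa_table.filter(pa.array(mask_pylist))["v"].to_pylist() == [1, 4, 5]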