Example #1
def test_sort_indices_array():
    arr = pa.array([1, 2, None, 0])
    result = pc.sort_indices(arr)
    assert result.to_pylist() == [3, 0, 1, 2]
    result = pc.sort_indices(arr, sort_keys=[("dummy", "ascending")])
    assert result.to_pylist() == [3, 0, 1, 2]
    result = pc.sort_indices(arr, sort_keys=[("dummy", "descending")])
    assert result.to_pylist() == [1, 0, 3, 2]
    result = pc.sort_indices(
        arr, options=pc.SortOptions(sort_keys=[("dummy", "descending")]))
    assert result.to_pylist() == [1, 0, 3, 2]
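A hedged follow-up to the test above (assumes a pyarrow release whose SortOptions accepts null_placement, e.g. 7.0+): nulls sort to the end by default, as the asserts show, but they can be placed first instead.

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 2, None, 0])
result = pc.sort_indices(
    arr, options=pc.SortOptions(sort_keys=[("dummy", "ascending")],
                                null_placement="at_start"))
# expected: [2, 3, 0, 1] -- the null's index first, then the ascending values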
Example #2
    def sort_and_partition(self, boundaries: List[T], key: SortKeyT,
                           descending: bool) -> List["Block[T]"]:
        if len(key) > 1:
            raise NotImplementedError(
                "sorting by multiple columns is not supported yet")

        import pyarrow.compute as pac

        indices = pac.sort_indices(self._table, sort_keys=key)
        table = self._table.take(indices)
        if len(boundaries) == 0:
            return [table]

        # For each boundary value, count the number of items that are less
        # than it. Since the block is sorted, these counts partition the items
        # such that boundaries[i] <= x < boundaries[i + 1] for each x in
        # partition[i]. If `descending` is true, `boundaries` would also be
        # in descending order and we only need to count the number of items
        # *greater than* the boundary value instead.
        col, _ = key[0]
        comp_fn = pac.greater if descending else pac.less
        boundary_indices = [
            pac.sum(comp_fn(table[col], b)).as_py() for b in boundaries
        ]
        ret = []
        prev_i = 0
        for i in boundary_indices:
            ret.append(table.slice(prev_i, i - prev_i))
            prev_i = i
        ret.append(table.slice(prev_i))
        return ret
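The boundary-counting comment above can be illustrated with a small, self-contained sketch (the values are made up, not taken from Ray):

import pyarrow as pa
import pyarrow.compute as pac

sorted_col = pa.array([0, 1, 1, 3, 5, 8])   # a block already sorted ascending
boundaries = [1, 4, 9]                      # ascending boundary values
counts = [pac.sum(pac.less(sorted_col, b)).as_py() for b in boundaries]
# counts == [1, 4, 6]; slicing at these offsets yields the partitions
# [0], [1, 1, 3], [5, 8], [] described by the comment above.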
Example #3
    def create_timestamps(self) -> pa.Table:
        """
        :return: the audio metadata converted into a pyarrow data table
        """
        result_array = [[], [], []]
        for m in self.metadata:
            timestamps = calc_evenly_sampled_timestamps(
                m[0], m[1].num_rows, self.sample_interval_micros)
            result_array[0].extend(timestamps)
            result_array[1].extend(timestamps)
            result_array[2].extend(m[1]["microphone"].to_numpy())
        for gs, ge in self.gaps:
            num_samples = int((ge - gs) / self.sample_interval_micros) - 1
            timestamps = calc_evenly_sampled_timestamps(
                gs + self.sample_interval_micros, num_samples,
                self.sample_interval_micros)
            gap_array = [timestamps, np.full(len(timestamps), np.nan)]
            result_array[0].extend(gap_array[0])
            result_array[1].extend(gap_array[0])
            result_array[2].extend(gap_array[1])
        ptable = pa.Table.from_pydict(dict(zip(AUDIO_DF_COLUMNS,
                                               result_array)))
        return pc.take(
            ptable,
            pc.sort_indices(ptable, sort_keys=[("timestamps", "ascending")]))
Example #4
def test_roundtrip_multi_partitioned(tmp_path: pathlib.Path,
                                     sample_data: pa.Table):
    write_deltalake(str(tmp_path), sample_data, partition_by=["int32", "bool"])

    delta_table = DeltaTable(str(tmp_path))
    assert delta_table.pyarrow_schema() == sample_data.schema

    table = delta_table.to_pyarrow_table()
    table = table.take(pc.sort_indices(table["int64"]))
    assert table == sample_data
Example #5
    def sort_and_partition(self, boundaries: List[T], key: SortKeyT,
                           descending: bool) -> List["Block[T]"]:
        if len(key) > 1:
            raise NotImplementedError(
                "sorting by multiple columns is not supported yet")

        if self._table.num_rows == 0:
            # If the pyarrow table is empty, we may not have a schema,
            # so calling sort_indices() would raise an error.
            return [
                pyarrow.Table.from_pydict({})
                for _ in range(len(boundaries) + 1)
            ]

        import pyarrow.compute as pac

        indices = pac.sort_indices(self._table, sort_keys=key)
        table = self._table.take(indices)
        if len(boundaries) == 0:
            return [table]

        # For each boundary value, count the number of items that are less
        # than it. Since the block is sorted, these counts partition the items
        # such that boundaries[i] <= x < boundaries[i + 1] for each x in
        # partition[i]. If `descending` is true, `boundaries` would also be
        # in descending order and we only need to count the number of items
        # *greater than* the boundary value instead.
        col, _ = key[0]
        comp_fn = pac.greater if descending else pac.less

        # TODO(ekl) this is O(n^2) but in practice it's much faster than the
        # O(n) algorithm, could be optimized.
        boundary_indices = [
            pac.sum(comp_fn(table[col], b)).as_py() for b in boundaries
        ]
        ### Compute the boundary indices in O(n) time via scan.  # noqa
        # boundary_indices = []
        # remaining = boundaries.copy()
        # values = table[col]
        # for i, x in enumerate(values):
        #     while remaining and not comp_fn(x, remaining[0]).as_py():
        #         remaining.pop(0)
        #         boundary_indices.append(i)
        # for _ in remaining:
        #     boundary_indices.append(len(values))

        ret = []
        prev_i = 0
        for i in boundary_indices:
            # Slices need to be copied to avoid including the base table
            # during serialization.
            ret.append(_copy_table(table.slice(prev_i, i - prev_i)))
            prev_i = i
        ret.append(_copy_table(table.slice(prev_i)))
        return ret
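As the TODO above notes, the counting step can be done without a per-boundary pass; one hedged alternative (assuming a single ascending sort key and no nulls) is numpy's searchsorted over the sorted column:

import numpy as np
import pyarrow as pa

sorted_col = pa.array([0, 1, 1, 3, 5, 8]).to_numpy()   # illustrative values
boundaries = [1, 4, 9]
boundary_indices = np.searchsorted(sorted_col, boundaries, side="left")
# array([1, 4, 6]) -- the same counts as pac.sum(comp_fn(...)) produces above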
Example #6
def test_sort_indices_table():
    table = pa.table({"a": [1, 1, 0], "b": [1, 0, 1]})

    result = pc.sort_indices(table, sort_keys=[("a", "ascending")])
    assert result.to_pylist() == [2, 0, 1]

    result = pc.sort_indices(table,
                             sort_keys=[("a", "ascending"),
                                        ("b", "ascending")])
    assert result.to_pylist() == [2, 1, 0]

    with pytest.raises(ValueError, match="Must specify one or more sort keys"):
        pc.sort_indices(table)

    with pytest.raises(ValueError, match="Nonexistent sort key column"):
        pc.sort_indices(table, sort_keys=[("unknown", "ascending")])

    with pytest.raises(ValueError, match="not a valid order"):
        pc.sort_indices(table, sort_keys=[("a", "nonscending")])
Example #7
table = table.select(["ix", "x", "y", "title", "first_author_name", "date", "language"])

# truncate the title after 101 characters (matching display logic)
truncated_title = pc.utf8_replace_slice(table.column("title"), start=101, stop=1000, replacement="")
table = table.set_column(table.schema.get_field_index("title"), "title", truncated_title)

# ensure all dictionaries in the file use the same key/value mappings
table = table.unify_dictionaries()

# filter out non-numeric dates (e.g. null, "1850-1853")
# matches the hack in index.js:37
mask = pc.invert(pc.is_null(table.column("date")))
table = table.filter(mask)

# sorting by the date improves the loading aesthetics
# comment this out to exactly match the original appearance
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = pc.take(table, indices)

# after sorting replace ix with an accurate row index
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = table.set_column(table.schema.get_field_index("ix"), "ix", pc.cast(indices, pa.uint32()))
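# NOTE (illustrative aside, not in the original script): since the table was
# just sorted by "date", these indices amount to a plain 0..n-1 row index
# (assuming unique dates or a stable sort), so the same column could be built
# directly with e.g. pa.array(np.arange(table.num_rows), pa.uint32()) without
# re-sorting.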

temp_path.unlink()

local = fs.LocalFileSystem()

with local.open_output_stream(str(target_path)) as file:
    with pa.RecordBatchStreamWriter(file, table.schema) as writer:
        writer.write_table(table, 10000)
Example #8
def fill_gaps(
        arrow_df: pa.Table,
        gaps: List[Tuple[float, float]],
        sample_interval_micros: float,
        copy: bool = False) -> Tuple[pa.Table, List[Tuple[float, float]]]:
    """
    fills gaps in the table with np.nan or interpolated values by interpolating timestamps based on the
    calculated sample interval

    :param arrow_df: pyarrow table with data.  first column is "timestamps"
    :param gaps: list of tuples of known non-inclusive start and end timestamps of the gaps
    :param sample_interval_micros: known sample interval of the data points
    :param copy: if True, copy the data points, otherwise interpolate from edges, default False
    :return: table without gaps and the list of gaps
    """
    # extract the necessary information to compute gap size and gap timestamps
    data_time_stamps = arrow_df["timestamps"].to_numpy()
    if len(data_time_stamps) > 1:
        data_duration = data_time_stamps[-1] - data_time_stamps[0]
        expected_samples = (
            np.floor(data_duration / sample_interval_micros) +
            (1 if data_duration % sample_interval_micros >=
             sample_interval_micros * DEFAULT_GAP_UPPER_LIMIT else 0)) + 1
        if expected_samples > len(data_time_stamps):
            if copy:
                pcm = DataPointCreationMode["COPY"]
            else:
                pcm = DataPointCreationMode["NAN"]
            # make it safe to alter the gap values
            my_gaps = check_gap_list(gaps, data_time_stamps[0],
                                     data_time_stamps[-1])
            for gap in my_gaps:
                # if timestamps are around gaps, we have to update the values
                before_start = np.argwhere(
                    [t <= gap[0] for t in data_time_stamps])
                after_end = np.argwhere(
                    [t >= gap[1] for t in data_time_stamps])
                if len(before_start) > 0:
                    before_start = before_start[-1][0]
                    # sim = gap[0] - data_time_stamps[before_start]
                    # result_df = add_data_points_to_df(result_df, before_start, sim, point_creation_mode=pcm)
                    gap = (data_time_stamps[before_start], gap[1])
                else:
                    before_start = None
                if len(after_end) > 0:
                    after_end = after_end[0][0]
                    # sim = gap[1] - data_time_stamps[after_end]
                    gap = (gap[0], data_time_stamps[after_end])
                else:
                    after_end = None
                num_new_points = int(
                    (gap[1] - gap[0]) / sample_interval_micros) - 1
                if before_start is not None:
                    arrow_df = add_data_points_to_df(arrow_df, before_start,
                                                     sample_interval_micros,
                                                     num_new_points, pcm)
                elif after_end is not None:
                    arrow_df = add_data_points_to_df(arrow_df, after_end,
                                                     -sample_interval_micros,
                                                     num_new_points, pcm)
        indic = pc.sort_indices(arrow_df,
                                sort_keys=[("timestamps", "ascending")])
        return arrow_df.take(indic), gaps
    return arrow_df, gaps
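A worked example of the interior-point count computed above (illustrative numbers only):

gap_start, gap_end = 10_000.0, 15_000.0        # non-inclusive gap bounds in µs
sample_interval_micros = 1_000.0
num_new_points = int((gap_end - gap_start) / sample_interval_micros) - 1
# num_new_points == 4, i.e. new timestamps at 11_000, 12_000, 13_000, 14_000 µs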
Example #9
def _sort_table_on_real_then_date(table: pa.Table) -> pa.Table:
    indices = pc.sort_indices(
        table, sort_keys=[("REAL", "ascending"), ("DATE", "ascending")]
    )
    sorted_table = table.take(indices)
    return sorted_table
Example #10
def dedupe(compaction_artifact_s3_bucket: str,
           compacted_partition_locator: Dict[str, Any],
           new_compacted_partition_locator: Dict[str,
                                                 Any], object_ids: List[Any],
           sort_keys: List[Tuple[str, str]], max_records_per_index_file: int,
           max_records_per_materialized_file: int,
           num_materialize_buckets: int, dedupe_task_index: int,
           record_counts_pending_materialize,
           prev_compacted_delta_stream_position: Optional[int],
           pk_index_version: int):

    logger.info(f"Starting dedupe task...")
    # TODO: there is a risk of running out of memory here in cases of severe
    #  skew of primary key updates
    object_refs = [cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]
    logger.info(f"Getting delta file envelope groups object refs...")
    delta_file_envelope_groups_list = ray.get(object_refs)
    hb_index_to_delta_file_envelopes_list = defaultdict(list)
    for delta_file_envelope_groups in delta_file_envelope_groups_list:
        for hb_idx in range(len(delta_file_envelope_groups)):
            dfes = delta_file_envelope_groups[hb_idx]
            if dfes is not None:
                hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
    src_file_id_to_row_indices = defaultdict(list)
    deduped_tables = []
    logger.info(f"Running {len(hb_index_to_delta_file_envelopes_list)} "
                f"dedupe rounds...")
    for hb_idx, dfe_list in hb_index_to_delta_file_envelopes_list.items():
        table = union_primary_key_indices(
            compaction_artifact_s3_bucket,
            compacted_partition_locator,
            prev_compacted_delta_stream_position,
            hb_idx,
            dfe_list,
            pk_index_version,
        )
        logger.info("Dedupe round input record count: ", len(table))

        # sort by sort keys
        if len(sort_keys):
            # TODO: convert to O(N) dedupe w/ sort keys
            sort_keys.extend([
                (sc._PARTITION_STREAM_POSITION_COLUMN_NAME, "ascending"),
                (sc._ORDERED_FILE_IDX_COLUMN_NAME, "ascending"),
            ])
            table = table.take(pc.sort_indices(table, sort_keys=sort_keys))

        # drop duplicates by primary key hash column
        table = drop_duplicates_by_primary_key_hash(table)
        table = table.drop([sc._DELTA_TYPE_COLUMN_NAME])
        logger.info("Dedupe round output record count: ", len(table))

        deduped_tables.append((hb_idx, table))

        stream_position_col = sc.stream_position_column_np(table)
        file_idx_col = sc.file_index_column_np(table)
        row_idx_col = sc.record_index_column_np(table)
        is_source_col = sc.is_source_column_np(table)
        for row_idx in range(len(table)):
            src_file_id = (
                is_source_col[row_idx],
                stream_position_col[row_idx],
                file_idx_col[row_idx],
            )
            # TODO(pdames): merge contiguous record number ranges
            src_file_id_to_row_indices[src_file_id].append(
                row_idx_col[row_idx])

    logger.info(f"Finished all dedupe rounds...")
    mat_bucket_to_src_file_record_count = defaultdict(dict)
    mat_bucket_to_src_file_records = defaultdict(dict)
    for src_file_id, src_row_indices in src_file_id_to_row_indices.items():
        mat_bucket = file_id_to_mat_bucket_index(
            src_file_id,
            num_materialize_buckets,
        )
        mat_bucket_to_src_file_records[mat_bucket][src_file_id] = np.array(
            src_row_indices, )
        mat_bucket_to_src_file_record_count[mat_bucket][src_file_id] = \
            len(src_row_indices)

    mat_bucket_to_dd_idx_obj_id = {}
    object_refs = []
    for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
        object_ref = ray.put(src_file_records)
        object_refs.append(object_ref)
        mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
            dedupe_task_index,
            cloudpickle.dumps(object_ref),
        )
    logger.info(f"Count of materialize buckets with object refs: "
                f"{len(mat_bucket_to_dd_idx_obj_id)}")

    record_counts_pending_materialize.add_record_counts(
        dedupe_task_index,
        mat_bucket_to_src_file_record_count,
    )

    # wait for all dedupe tasks to reach this point before continuing
    logger.info(
        f"Waiting for all dedupe tasks to finish writing record counts...")
    finalized = False
    while not finalized:
        finalized = ray.get(
            record_counts_pending_materialize.is_finalized.remote())
        time.sleep(0.25)
    logger.info(f"Writing destination primary key index...")
    dedupe_result = write_dest_primary_key_index(
        compaction_artifact_s3_bucket,
        new_compacted_partition_locator,
        max_records_per_index_file,
        max_records_per_materialized_file,
        num_materialize_buckets,
        dedupe_task_index,
        deduped_tables,
        record_counts_pending_materialize.get_record_counts(),
        1,
    )
    logger.info(f"Finished dedupe task...")
    return mat_bucket_to_dd_idx_obj_id, object_refs, dedupe_result
Example #11
def sort(table: "pyarrow.Table", key: "SortKeyT", descending: bool) -> "pyarrow.Table":
    import pyarrow.compute as pac

    indices = pac.sort_indices(table, sort_keys=key)
    return table.take(indices)
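A hypothetical usage of the helper above (the table and values are made up); note that the sort order is carried by the key tuples themselves:

import pyarrow as pa

t = pa.table({"a": [3, 1, 2]})
sort(t, key=[("a", "descending")], descending=True)   # rows reordered to 3, 2, 1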
Example #12
type(metadata)
print(parquet_file.schema)
group = metadata.row_group(0)
print(group)
dir(group)
vendor_col = group.column(0)
print(vendor_col)
tip_col = group.column(13)
print(tip_col)
pq.write_table(table, "202001_std.parquet", compression="ZSTD")

print(len(table["tip_amount"].unique()))

silly_table = pa.Table.from_arrays([
    table["VendorID"], table["VendorID"].take(
        pc.sort_indices(table["VendorID"]))
], ["unordered", "ordered"])

pq.write_table(silly_table, "silly.parquet")
silly = pq.ParquetFile("silly.parquet")
silly_group = silly.metadata.row_group(0)
print(silly_group.column(0))
print(silly_group.column(1))
silly_table["ordered"].unique()
tp["VendorID"].value_counts(dropna=False)

# reduce time from ms to s
# partitioned datasets
# rows groups (maybe with a single dataset)

len(tp.fare_amount.unique())
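A hedged follow-up to the session above (not in the original): the per-column chunk metadata exposes compressed sizes, which makes the effect of sorting on encoding directly visible.

print(silly_group.column(0).total_compressed_size)   # "unordered" VendorID
print(silly_group.column(1).total_compressed_size)   # "ordered" VendorID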
Example #13
def materialize(
        source_partition_locator: Dict[str, Any],
        delta_staging_area: Dict[str, Any],
        mat_bucket_index: int,
        dedupe_task_idx_and_obj_id_tuples: List[Tuple[int, Any]],
        max_records_per_output_file: int,
        compacted_file_content_type: ContentType,
        deltacat_storage=unimplemented_deltacat_storage):

    logger.info(f"Starting materialize task...")
    dest_partition_locator = dsa.get_partition_locator(delta_staging_area)
    dedupe_task_idx_and_obj_ref_tuples = [
        (
            t[0],
            cloudpickle.loads(t[1]),
        ) for t in dedupe_task_idx_and_obj_id_tuples
    ]
    logger.info(f"Resolved materialize task obj refs...")
    dedupe_task_indices, obj_refs = zip(
        *dedupe_task_idx_and_obj_ref_tuples
    )
    # this depends on `ray.get` result order matching input order, as per the
    # contract established in: https://github.com/ray-project/ray/pull/16763
    src_file_records_list = ray.get(obj_refs)
    all_src_file_records = defaultdict(list)
    for i in range(len(src_file_records_list)):
        dedupe_task_idx = dedupe_task_indices[i]
        src_file_records = src_file_records_list[i]
        for src_file_id, record_numbers in src_file_records.items():
            all_src_file_records[src_file_id].append(
                (record_numbers, repeat(dedupe_task_idx, len(record_numbers)))
            )
    manifest_cache = {}
    compacted_tables = []
    for src_file_id in sorted(all_src_file_records.keys()):
        record_numbers_dd_task_idx_tpl_list = all_src_file_records[src_file_id]
        record_numbers_list, dedupe_task_idx_iterator = zip(
            *record_numbers_dd_task_idx_tpl_list
        )
        is_src_partition_file = src_file_id[0]
        src_file_position = src_file_id[1]
        src_file_idx = src_file_id[2]
        src_file_partition_locator = source_partition_locator \
            if is_src_partition_file \
            else dest_partition_locator
        delta_locator = dl.of(
            src_file_partition_locator,
            src_file_position,
        )
        dl_hexdigest = dl.hexdigest(delta_locator)
        manifest = manifest_cache.setdefault(
            dl_hexdigest,
            deltacat_storage.get_manifest(delta_locator),
        )
        pa_table = deltacat_storage.download_manifest_entry(
            delta_locator,
            manifest,
            src_file_idx,
        )
        mask_pylist = list(repeat(False, len(pa_table)))
        record_numbers = chain.from_iterable(record_numbers_list)
        for record_number in record_numbers:
            mask_pylist[record_number] = True
        mask = pa.array(mask_pylist)
        compacted_table = pa_table.filter(mask)

        # Appending, sorting, taking, and dropping together cost only 2-3X the
        # latency of a single filter on average, which is much cheaper than
        # repeatedly filtering the table in dedupe task index order.
        dedupe_task_indices = chain.from_iterable(dedupe_task_idx_iterator)
        compacted_table = sc.append_dedupe_task_idx_col(
            compacted_table,
            dedupe_task_indices,
        )
        pa_sort_keys = [(sc._DEDUPE_TASK_IDX_COLUMN_NAME, "ascending")]
        compacted_table = compacted_table.take(
            pc.sort_indices(compacted_table, sort_keys=pa_sort_keys),
        )
        compacted_table = compacted_table.drop(
            [sc._DEDUPE_TASK_IDX_COLUMN_NAME]
        )
        compacted_tables.append(compacted_table)

    # TODO (pdames): save memory by writing parquet files eagerly whenever
    #  len(compacted_table) >= max_records_per_output_file
    compacted_table = pa.concat_tables(compacted_tables)
    delta_manifest = deltacat_storage.stage_delta(
        compacted_table,
        delta_staging_area,
        max_records_per_entry=max_records_per_output_file,
        content_type=compacted_file_content_type,
    )

    manifest = dm.get_manifest(delta_manifest)
    manifest_records = rsmm.get_record_count(manifest)
    assert manifest_records == len(compacted_table), (
        f"Unexpected Error: Materialized delta manifest record count "
        f"({manifest_records}) does not equal compacted table record count "
        f"({len(compacted_table)})"
    )

    return mr.of(
        delta_manifest,
        mat_bucket_index,
        len(rsm.get_entries(manifest)),
        compacted_table.nbytes,
        rsmm.get_content_length(manifest),
        len(compacted_table),
    )
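A minimal, self-contained sketch of the boolean-mask row selection used in the materialize step above (illustrative data only):

import pyarrow as pa

tbl = pa.table({"v": ["a", "b", "c", "d"]})
keep = {0, 2}                                    # record numbers to keep
mask = pa.array([i in keep for i in range(len(tbl))])
tbl.filter(mask)                                 # keeps rows "a" and "c"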