Example 1
def _download_manifest_entries_in_order(
        manifest: Dict[str, Any],
        token_holder: Optional[Dict[str, Any]] = None,
        table_type: TableType = TableType.PYARROW,
        file_reader_kwargs: Optional[Dict[str, Any]] = None) \
        -> List[Union[pa.Table, pd.DataFrame, np.ndarray]]:

    return [
        download_manifest_entry(e, token_holder, table_type,
                                file_reader_kwargs)
        for e in rsm.get_entries(manifest)
    ]
Example 2
def from_delta_manifest(delta_manifest: Dict[str, Any]) -> Dict[str, Any]:
    """
    Returns an annotated delta manifest built from the input delta manifest,
    which saves all delta manifest properties for each manifest entry. All
    operations performed on the annotated delta manifest by this module will
    preserve a mapping back to the original delta manifest entry indices and
    properties.
    """
    delta_manifest_annotated = {}
    delta_manifest_annotated.update(delta_manifest)
    entries = rsm.get_entries(dm.get_manifest(delta_manifest))
    annotations = []
    if entries:
        dtype = dm.get_delta_type(delta_manifest)
        pos = dl.get_stream_position(dm.get_delta_locator(delta_manifest))
        annotations = [_annotation(i, dtype, pos) for i in range(len(entries))]
    set_annotations(delta_manifest_annotated, annotations)
    return delta_manifest_annotated
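The annotation tuples set above are consumed later by this module (Example 5
types each one as Tuple[int, Optional[DeltaType], Optional[int]]). The sketch
below is an assumption about what _annotation packs, using an illustrative
string in place of the real DeltaType enum:

from typing import Optional, Tuple

DeltaType = str  # illustrative stand-in for the project's DeltaType enum


def _annotation_sketch(entry_index: int,
                       delta_type: Optional[DeltaType],
                       stream_position: Optional[int]
                       ) -> Tuple[int, Optional[DeltaType], Optional[int]]:
    # Pack the original entry index plus the parent delta's type and stream
    # position so they survive later regrouping of manifest entries.
    return entry_index, delta_type, stream_position


# Three entries from an "upsert" delta at stream position 42:
annotations = [_annotation_sketch(i, "upsert", 42) for i in range(3)]
assert annotations == [(0, "upsert", 42), (1, "upsert", 42), (2, "upsert", 42)]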
Example 3
def _download_manifest_entries_parallel(
        manifest: Dict[str, Any],
        token_holder: Optional[Dict[str, Any]] = None,
        table_type: TableType = TableType.PYARROW,
        max_parallelism: int = 1,
        file_reader_kwargs: Optional[Dict[str, Any]] = None) \
        -> List[Union[pa.Table, pd.DataFrame, np.ndarray]]:

    pool = multiprocessing.Pool(max_parallelism)
    downloader = partial(
        download_manifest_entry,
        token_holder=token_holder,
        table_type=table_type,
        file_reader_kwargs=file_reader_kwargs,
    )
    return pool.map(downloader, rsm.get_entries(manifest))
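The pool above is never closed; if the worker processes are only needed for
this one call, a context-managed variant (a sketch under that assumption, not
the project's actual implementation) would release them deterministically:

import multiprocessing
from functools import partial


def _download_entries_parallel_sketch(entries, downloader_fn,
                                      max_parallelism=1, **downloader_kwargs):
    # Sketch only: downloader_fn stands in for download_manifest_entry and
    # entries for rsm.get_entries(manifest).
    downloader = partial(downloader_fn, **downloader_kwargs)
    with multiprocessing.Pool(max_parallelism) as pool:
        # the pool is terminated when the with-block exits
        return pool.map(downloader, entries)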
Example 4
def size_limited_groups(annotated_delta_manifests: List[Dict[str, Any]],
                        min_sublist_bytes: int) -> List[Dict[str, Any]]:
    """
    Simple greedy algorithm to group 1 or more annotated delta manifests into
    size-limited annotated delta manifests. All ordered entries in the input
    annotated delta manifests are appended to an annotated delta manifest group
    until group_size_bytes >= min_sublist_bytes, then a new group is started.
    Note that byte size is measured in terms of manifest entry content length,
    which is expected to be equal to the number of bytes at rest in S3 for the
    associated object. Returns the list of annotated delta manifest groups.
    """
    groups = []
    dma_group = {}
    dma_group_bytes = 0
    dma_group_entry_count = 0
    for src_dma in annotated_delta_manifests:
        src_dma_annotations = get_annotations(src_dma)
        src_dma_entries = rsm.get_entries(src_dma)
        assert len(src_dma_annotations) == len(src_dma_entries), (
            f"Unexpected Error: Length of delta manifest annotations "
            f"({len(src_dma_annotations)}) doesn't match the length of delta "
            f"manifest entries ({len(src_dma_entries)}).")
        for i in range(len(src_dma_entries)):
            src_entry = src_dma_entries[i]
            _append_annotated_entry(src_dma, dma_group, src_entry,
                                    src_dma_annotations[i])
            # TODO: Fetch s3_obj["Size"] if entry content length undefined?
            dma_group_bytes += rsmm.get_content_length(
                rsme.get_meta(src_entry))
            dma_group_entry_count += 1
            if dma_group_bytes >= min_sublist_bytes:
                logger.info(
                    f"Appending group of {dma_group_entry_count} elements and "
                    f"{dma_group_bytes} bytes.")
                groups.append(dma_group)
                dma_group = {}
                dma_group_bytes = 0
                dma_group_entry_count = 0
    if dma_group:
        groups.append(dma_group)
    return groups
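To make the grouping rule concrete, here is a self-contained sketch of the
same greedy cut-off applied to plain byte counts (the sizes and threshold
below are made up for illustration):

def greedy_size_groups(entry_bytes, min_group_bytes):
    # Append entries to the current group until its running total reaches
    # the threshold, then start a new group; a trailing undersized group is
    # still emitted.
    groups, current, current_bytes = [], [], 0
    for size in entry_bytes:
        current.append(size)
        current_bytes += size
        if current_bytes >= min_group_bytes:
            groups.append(current)
            current, current_bytes = [], 0
    if current:
        groups.append(current)
    return groups


# With a 100-byte threshold, the first group closes at 110 bytes, the second
# at 120, and the final 10-byte entry forms an undersized trailing group.
assert greedy_size_groups([40, 70, 30, 90, 10], 100) == [[40, 70], [30, 90], [10]]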
Example 5
def _append_annotated_entry(src_indexed_delta_manifest: Dict[str, Any],
                            dst_indexed_delta_manifest: Dict[str, Any],
                            src_entry: Dict[str, Any],
                            src_annotation: Tuple[int, Optional[DeltaType],
                                                  Optional[int]]):

    if not dst_indexed_delta_manifest:
        # copy all extended properties from the source delta manifest impl
        dst_indexed_delta_manifest.update(src_indexed_delta_manifest)
        dm.set_manifest(dst_indexed_delta_manifest, rsm.of([src_entry]))
        set_annotations(dst_indexed_delta_manifest, [src_annotation])
    else:
        entries = rsm.get_entries(dm.get_manifest(dst_indexed_delta_manifest))
        src_delta_locator = dm.get_delta_locator(src_indexed_delta_manifest)
        dst_delta_locator = dm.get_delta_locator(dst_indexed_delta_manifest)
        # remove delta type and stream position if there is a conflict
        if dm.get_delta_type(src_indexed_delta_manifest) \
                != dm.get_delta_type(dst_indexed_delta_manifest):
            dm.set_delta_type(dst_indexed_delta_manifest, None)
        if dl.get_stream_position(src_delta_locator) \
                != dl.get_stream_position(dst_delta_locator):
            dl.set_stream_position(dst_delta_locator, None)
        entries.append(src_entry)
        get_annotations(dst_indexed_delta_manifest).append(src_annotation)
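The conflict handling above (clearing the delta type and stream position when
merged sources disagree) can be illustrated with a stripped-down sketch over
plain dicts; the key names here are assumptions, not the module's real
property names:

def merge_group_properties(group, src):
    # Keep properties shared by every source manifest in the group; clear any
    # property that differs across sources to None.
    for key in ("delta_type", "stream_position"):
        if group.get(key) != src.get(key):
            group[key] = None


group = {"delta_type": "upsert", "stream_position": 7}
merge_group_properties(group, {"delta_type": "upsert", "stream_position": 9})
assert group == {"delta_type": "upsert", "stream_position": None}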
Example 6
def materialize(
        source_partition_locator: Dict[str, Any],
        delta_staging_area: Dict[str, Any],
        mat_bucket_index: int,
        dedupe_task_idx_and_obj_id_tuples: List[Tuple[int, Any]],
        max_records_per_output_file: int,
        compacted_file_content_type: ContentType,
        deltacat_storage=unimplemented_deltacat_storage):

    logger.info(f"Starting materialize task...")
    dest_partition_locator = dsa.get_partition_locator(delta_staging_area)
    dedupe_task_idx_and_obj_ref_tuples = [
        (t[0], cloudpickle.loads(t[1]))
        for t in dedupe_task_idx_and_obj_id_tuples
    ]
    logger.info(f"Resolved materialize task obj refs...")
    dedupe_task_indices, obj_refs = zip(
        *dedupe_task_idx_and_obj_ref_tuples
    )
    # this depends on `ray.get` result order matching input order, as per the
    # contract established in: https://github.com/ray-project/ray/pull/16763
    src_file_records_list = ray.get(obj_refs)
    all_src_file_records = defaultdict(list)
    for i in range(len(src_file_records_list)):
        dedupe_task_idx = dedupe_task_indices[i]
        src_file_records = src_file_records_list[i]
        for src_file_id, record_numbers in src_file_records.items():
            all_src_file_records[src_file_id].append(
                (record_numbers, repeat(dedupe_task_idx, len(record_numbers)))
            )
    manifest_cache = {}
    compacted_tables = []
    for src_file_id in sorted(all_src_file_records.keys()):
        record_numbers_dd_task_idx_tpl_list = all_src_file_records[src_file_id]
        record_numbers_list, dedupe_task_idx_iterator = zip(
            *record_numbers_dd_task_idx_tpl_list
        )
        is_src_partition_file = src_file_id[0]
        src_file_position = src_file_id[1]
        src_file_idx = src_file_id[2]
        src_file_partition_locator = source_partition_locator \
            if is_src_partition_file \
            else dest_partition_locator
        delta_locator = dl.of(
            src_file_partition_locator,
            src_file_position,
        )
        dl_hexdigest = dl.hexdigest(delta_locator)
        manifest = manifest_cache.setdefault(
            dl_hexdigest,
            deltacat_storage.get_manifest(delta_locator),
        )
        pa_table = deltacat_storage.download_manifest_entry(
            delta_locator,
            manifest,
            src_file_idx,
        )
        mask_pylist = list(repeat(False, len(pa_table)))
        record_numbers = chain.from_iterable(record_numbers_list)
        for record_number in record_numbers:
            mask_pylist[record_number] = True
        mask = pa.array(mask_pylist)
        compacted_table = pa_table.filter(mask)

        # appending, sorting, taking, and dropping has 2-3X the latency of a
        # single filter on average, and thus provides much better performance
        # than repeatedly filtering the table in dedupe task index order
        dedupe_task_indices = chain.from_iterable(dedupe_task_idx_iterator)
        compacted_table = sc.append_dedupe_task_idx_col(
            compacted_table,
            dedupe_task_indices,
        )
        pa_sort_keys = [(sc._DEDUPE_TASK_IDX_COLUMN_NAME, "ascending")]
        compacted_table = compacted_table.take(
            pc.sort_indices(compacted_table, sort_keys=pa_sort_keys),
        )
        compacted_table = compacted_table.drop(
            [sc._DEDUPE_TASK_IDX_COLUMN_NAME]
        )
        compacted_tables.append(compacted_table)

    # TODO (pdames): save memory by writing parquet files eagerly whenever
    #  len(compacted_table) >= max_records_per_output_file
    compacted_table = pa.concat_tables(compacted_tables)
    delta_manifest = deltacat_storage.stage_delta(
        compacted_table,
        delta_staging_area,
        max_records_per_entry=max_records_per_output_file,
        content_type=compacted_file_content_type,
    )

    manifest = dm.get_manifest(delta_manifest)
    manifest_records = rsmm.get_record_count(manifest)
    assert manifest_records == len(compacted_table), (
        f"Unexpected Error: Materialized delta manifest record count "
        f"({manifest_records}) does not equal compacted table record count "
        f"({len(compacted_table)})")

    return mr.of(
        delta_manifest,
        mat_bucket_index,
        len(rsm.get_entries(manifest)),
        compacted_table.nbytes,
        rsmm.get_content_length(manifest),
        len(compacted_table),
    )
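The record selection in the materialize step above builds a boolean mask the
length of the source table and filters on it. A self-contained sketch of that
pattern on a toy pyarrow table (the column name and record numbers are made
up for illustration):

import pyarrow as pa

pa_table = pa.table({"pk": ["a", "b", "c", "d", "e"]})
surviving_record_numbers = [0, 2, 4]  # records kept after dedupe

mask_pylist = [False] * len(pa_table)
for record_number in surviving_record_numbers:
    mask_pylist[record_number] = True
compacted_table = pa_table.filter(pa.array(mask_pylist))

assert compacted_table.column("pk").to_pylist() == ["a", "c", "e"]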
Example 7
def limit_input_deltas(input_deltas: List[Dict[str, Any]],
                       cluster_resources: Dict[str, float],
                       user_hash_bucket_count: int,
                       user_hash_bucket_chunk_size: int,
                       round_completion_info: Dict[str, Any],
                       deltacat_storage=unimplemented_deltacat_storage):

    # TODO (pdames): when row counts are available in metadata, use them
    #  instead of bytes - memory consumption depends more on number of
    #  input delta records than bytes.

    # Inflation multiplier from snappy-compressed parquet to pyarrow.
    # This should be kept larger than actual average inflation multipliers.
    # Note that this is a very rough guess since actual observed pyarrow
    # inflation multiplier for snappy-compressed parquet is about 5.45X for
    # all rows, but here we're trying to guess the inflation multiplier for just
    # a primary key SHA1 digest and sort key columns (which could be all columns
    # of the table in the worst case, but here we're assuming that they
    # represent no more than ~1/4th of the total table bytes)
    PYARROW_INFLATION_MULTIPLIER = 1.5

    # we assume here that we're running on a fixed-size cluster
    # this assumption could be removed, but we'd still need to know the max
    # resources we COULD get for this cluster, and the amount of memory
    # available per CPU should remain fixed across the cluster.
    worker_cpus = int(cluster_resources["CPU"])
    worker_obj_store_mem = ray_constants.from_memory_units(
        cluster_resources["object_store_memory"])
    logger.info(f"Total worker object store memory: {worker_obj_store_mem}")
    worker_obj_store_mem_per_task = worker_obj_store_mem / worker_cpus
    logger.info(f"Worker object store memory/task: "
                f"{worker_obj_store_mem_per_task}")
    worker_task_mem = ray_constants.from_memory_units(
        cluster_resources["memory"])
    logger.info(f"Total worker memory: {worker_task_mem}")
    # TODO (pdames): ensure fixed memory per CPU in heterogenous clusters
    worker_mem_per_task = worker_task_mem / worker_cpus
    logger.info(f"Cluster worker memory/task: {worker_mem_per_task}")

    hash_bucket_count = 0
    if round_completion_info:
        hash_bucket_count = round_completion_info["hash_buckets"]
    logger.info(f"Prior hash bucket count: {hash_bucket_count}")

    if not hash_bucket_count:
        hash_bucket_count = user_hash_bucket_count
    elif user_hash_bucket_count and hash_bucket_count != user_hash_bucket_count:
        raise ValueError(f"Given hash bucket count ({user_hash_bucket_count}) "
                         f"does not match the existing compacted hash bucket "
                         f"count ({hash_bucket_count}). To resolve this "
                         f"problem either omit a hash bucket count when "
                         f"running compaction or rehash your existing "
                         f"compacted dataset.")

    delta_bytes = 0
    delta_bytes_pyarrow = 0
    latest_stream_position = -1
    limited_input_delta_manifests = []
    for delta in input_deltas:
        delta_manifest = deltacat_storage.get_delta_manifest(delta)
        # TODO (pdames): ensure pyarrow object fits in per-task obj store mem
        position = dl.get_stream_position(dm.get_delta_locator(delta_manifest))
        manifest_entries = rsm.get_entries(dm.get_manifest(delta_manifest))
        for entry in manifest_entries:
            # TODO: Fetch s3_obj["Size"] if entry content length undefined?
            delta_bytes += rsmm.get_content_length(rsme.get_meta(entry))
            delta_bytes_pyarrow = delta_bytes * PYARROW_INFLATION_MULTIPLIER
            latest_stream_position = max(position, latest_stream_position)
        if delta_bytes_pyarrow > worker_obj_store_mem:
            logger.info(
                f"Input delta manifests limited to "
                f"{len(limited_input_delta_manifests)} by object store mem "
                f"({delta_bytes_pyarrow} > {worker_obj_store_mem})")
            break
        limited_input_delta_manifests.append(
            dma.from_delta_manifest(delta_manifest))

    logger.info(f"Input delta manifests to compact this round: "
                f"{len(limited_input_delta_manifests)}")
    logger.info(f"Input delta manifest bytes to compact: {delta_bytes}")
    logger.info(
        f"Latest input delta stream position: {latest_stream_position}")

    if not limited_input_delta_manifests:
        raise RuntimeError("No input deltas to compact!")

    # TODO (pdames): determine min hash buckets from size of all deltas
    #  (not just deltas for this round)
    min_hash_bucket_count = math.ceil(delta_bytes_pyarrow /
                                      worker_obj_store_mem_per_task)
    logger.info("Minimum recommended hash buckets: ", min_hash_bucket_count)

    if hash_bucket_count <= 0:
        # TODO (pdames): calc default hash buckets from table growth rate... as
        #  this stands, we don't know whether we're provisioning insufficient
        #  hash buckets for the next 5 minutes of deltas or more than enough
        #  for the next 10 years
        hash_bucket_count = min_hash_bucket_count
        logger.info(f"Default hash buckets: {hash_bucket_count}")

    if hash_bucket_count < min_hash_bucket_count:
        logger.warning(
            f"Provided hash bucket count ({hash_bucket_count}) "
            f"is less than the min recommended ({min_hash_bucket_count}). "
            f"This compaction job run may run out of memory. To resolve this "
            f"problem either specify a larger number of hash buckets when "
            f"running compaction, omit a custom hash bucket count when "
            f"running compaction, or provision workers with more task "
            f"memory per CPU.")

    hash_bucket_chunk_size = user_hash_bucket_chunk_size
    max_hash_bucket_chunk_size = math.ceil(worker_obj_store_mem_per_task /
                                           PYARROW_INFLATION_MULTIPLIER)
    logger.info(f"Max hash bucket chunk size: {max_hash_bucket_chunk_size}")
    if hash_bucket_chunk_size > max_hash_bucket_chunk_size:
        # TODO (pdames): note type of memory to increase (task or object store)
        logger.warning(
            f"Provided hash bucket chunk size "
            f"({user_hash_bucket_chunk_size}) is greater than the max "
            f"recommended ({max_hash_bucket_chunk_size}). This compaction "
            f"job may run out of memory. To resolve this problem either "
            f"specify a smaller hash bucket chunk size when running "
            f"compaction, omit a custom hash bucket chunk size when running "
            f"compaction, or provision workers with more task and object "
            f"store memory per CPU.")
    elif not hash_bucket_chunk_size:
        hash_bucket_chunk_size = math.ceil(max_hash_bucket_chunk_size)
        logger.info(
            f"Default hash bucket chunk size: {hash_bucket_chunk_size}")

    sized_delta_manifests = dma.size_limited_groups(
        limited_input_delta_manifests,
        hash_bucket_chunk_size,
    )

    logger.info(f"Hash bucket chunk size: {hash_bucket_chunk_size}")
    logger.info(f"Hash bucket count: {hash_bucket_count}")
    logger.info(f"Input delta manifest count: {len(sized_delta_manifests)}")

    return sized_delta_manifests, hash_bucket_count, latest_stream_position
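The memory arithmetic above can be made concrete with a small sketch; the
cluster and delta sizes below are purely illustrative and not drawn from any
real deployment:

import math

PYARROW_INFLATION_MULTIPLIER = 1.5
worker_cpus = 32
worker_obj_store_mem = 64 * 2**30  # 64 GiB of shared object store memory
worker_obj_store_mem_per_task = worker_obj_store_mem / worker_cpus  # 2 GiB
delta_bytes_pyarrow = 100 * 2**30  # estimated pyarrow bytes to compact

# Each hash bucket must fit within one task's share of object store memory,
# so the minimum bucket count is the estimated pyarrow bytes divided by that
# per-task share.
min_hash_bucket_count = math.ceil(
    delta_bytes_pyarrow / worker_obj_store_mem_per_task)
assert min_hash_bucket_count == 50

# The largest safe chunk of at-rest bytes per bucket is the per-task share
# deflated by the pyarrow inflation multiplier.
max_hash_bucket_chunk_size = math.ceil(
    worker_obj_store_mem_per_task / PYARROW_INFLATION_MULTIPLIER)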