Example #1
0
def _empty_partition_indices(
    partition_keys: List[str], table_meta: TableMetaType, default_dtype: pa.DataType
):
    """Return one empty ``PartitionIndex`` per partition key.

    Parameters
    ----------
    partition_keys
        Column names to create an index for.
    table_meta
        Passed to ``_get_type_from_meta`` to look up each column's type.
    default_dtype
        Also passed to ``_get_type_from_meta``; presumably the fallback type
        when the column is not found -- confirm against that helper.

    Returns
    -------
    dict
        Maps each column name to a ``PartitionIndex`` whose ``index_dct`` is
        a fresh empty dict and whose ``dtype`` is the resolved type.
    """
    return {
        key: PartitionIndex(
            column=key,
            index_dct={},
            dtype=_get_type_from_meta(table_meta, key, default_dtype),
        )
        for key in partition_keys
    }
def _empty_partition_indices(
    partition_keys: List[str],
    schema: Optional[SchemaWrapper],
    default_dtype: pa.DataType,
):
    """Return one empty ``PartitionIndex`` per partition key.

    Parameters
    ----------
    partition_keys
        Column names to create an index for.
    schema
        Passed to ``_get_type_from_meta`` to look up each column's type;
        may be ``None`` according to the annotation.
    default_dtype
        Also passed to ``_get_type_from_meta``; presumably the fallback type
        when the column cannot be resolved -- confirm against that helper.

    Returns
    -------
    dict
        Maps each column name to a ``PartitionIndex`` whose ``index_dct`` is
        a fresh empty dict and whose ``dtype`` is the resolved type.
    """

    def _blank_index(key):
        # Resolve the type first, then wrap it in an index without entries.
        resolved_type = _get_type_from_meta(schema, key, default_dtype)
        return PartitionIndex(column=key, index_dct={}, dtype=resolved_type)

    return {key: _blank_index(key) for key in partition_keys}
Example #3
0
def _construct_dynamic_index_from_partitions(
    partitions: Dict[str, Partition],
    table_meta: TableMetaType,
    default_dtype: pa.DataType,
    partition_keys: List[str],
) -> Dict[str, PartitionIndex]:
    """Derive partition indices from the storage keys of ``partitions``.

    For every partition, the storage key of one reference table is decoded
    with ``decode_key``. Each ``(column, value)`` pair found in a qualifying
    key is recorded together with the label of the partition it came from.

    Parameters
    ----------
    partitions
        Maps partition labels to partitions. A partition is either a dict
        with a ``"files"`` entry or an object with a ``files`` attribute.
    table_meta
        Passed to ``_get_type_from_meta`` to resolve each column's type.
    default_dtype
        Also passed to ``_get_type_from_meta``.
    partition_keys
        Only used to build the empty result when there is nothing to scan.

    Returns
    -------
    Dict[str, PartitionIndex]
        One index per column seen in the decoded keys, mapping each value to
        the list of partition labels carrying it. If ``partitions`` is empty
        or the first partition has no files, the result of
        ``_empty_partition_indices`` is returned instead.
    """
    if len(partitions) == 0:
        return _empty_partition_indices(partition_keys, table_meta, default_dtype)

    def _files_of(part):
        # Partitions may be plain dicts or objects exposing ``files``.
        return part["files"] if isinstance(part, dict) else part.files

    # We exploit the fact that all tables are partitioned equally, so a single
    # table taken from the first partition is inspected for every partition.
    # ``partitions`` is NOT empty here, see the guard above.
    probe_files = _files_of(next(iter(partitions.values())))
    if not probe_files:
        return _empty_partition_indices(partition_keys, table_meta, default_dtype)
    reference_table = next(iter(probe_files.keys()))

    collected: Dict[str, Dict[str, Set[str]]] = defaultdict(_get_empty_index)
    seen_depth = None
    for label, part in partitions.items():
        storage_key = _files_of(part)[reference_table]
        _, _, key_indices, filename = decode_key(storage_key)

        # Only keys with a file component that name a parquet file, and are
        # not an external index file, contribute to the result.
        if (
            filename is None
            or not storage_key.endswith(PARQUET_FILE_SUFFIX)
            or storage_key.endswith(EXTERNAL_INDEX_SUFFIX)
        ):
            continue

        # NOTE(review): presumably verifies that every key carries the same
        # index columns/depth -- confirm against ``_check_index_depth``.
        seen_depth = _check_index_depth(key_indices, seen_depth)
        for column, value in key_indices:
            collected[column][value].add(label)

    result = {}
    for column, labels_by_value in collected.items():
        column_type = _get_type_from_meta(table_meta, column, default_dtype)
        # Convert the nested defaultdict-of-sets into a plain dict of lists.
        plain_mapping = {
            value: list(labels) for value, labels in labels_by_value.items()
        }
        result[column] = PartitionIndex(
            column=column, index_dct=plain_mapping, dtype=column_type
        )
    return result