def _empty_partition_indices( partition_keys: List[str], table_meta: TableMetaType, default_dtype: pa.DataType ): indices = {} for col in partition_keys: arrow_type = _get_type_from_meta(table_meta, col, default_dtype) indices[col] = PartitionIndex(column=col, index_dct={}, dtype=arrow_type) return indices
def _empty_partition_indices( partition_keys: List[str], schema: Optional[SchemaWrapper], default_dtype: pa.DataType, ): indices = {} for col in partition_keys: arrow_type = _get_type_from_meta(schema, col, default_dtype) indices[col] = PartitionIndex(column=col, index_dct={}, dtype=arrow_type) return indices
def _construct_dynamic_index_from_partitions(
    partitions: Dict[str, Partition],
    table_meta: TableMetaType,
    default_dtype: pa.DataType,
    partition_keys: List[str],
) -> Dict[str, PartitionIndex]:
    """
    Derive partition indices from the storage keys of the given partitions.

    One storage key per partition is inspected: the one belonging to the
    first table listed in the first partition's files. Each such key is
    split with ``decode_key`` and the ``(column, value)`` pairs it yields are
    collected into a ``column -> value -> partition labels`` mapping.

    Parameters
    ----------
    partitions
        Mapping of partition label to partition. A partition may be a plain
        dict (files under the ``"files"`` key) or an object exposing a
        ``files`` attribute.
    table_meta
        Passed through to ``_get_type_from_meta`` to resolve column dtypes.
    default_dtype
        Passed through to ``_get_type_from_meta``.
    partition_keys
        Only used when there is nothing to inspect (no partitions, or the
        first partition has no files); empty indices are then created for
        exactly these columns.

    Returns
    -------
    Dict[str, PartitionIndex]
        One index per column found in the qualifying storage keys. If
        partitions exist but no key qualifies, the result is an empty dict.
    """
    if len(partitions) == 0:
        return _empty_partition_indices(partition_keys, table_meta, default_dtype)

    def _get_files(part):
        # Partitions come either as plain dicts or as objects with `.files`.
        return part["files"] if isinstance(part, dict) else part.files

    # We exploit the fact that all tables are partitioned equally, so looking
    # at a single table per partition is enough. `partitions` is known to be
    # non-empty here because of the guard above.
    reference_partition = next(iter(partitions.values()))
    reference_files = _get_files(reference_partition)
    if not reference_files:
        return _empty_partition_indices(partition_keys, table_meta, default_dtype)
    key_table = next(iter(reference_files.keys()))

    labels_by_column: Dict[str, Dict[str, Set[str]]] = defaultdict(_get_empty_index)
    depth_indices = None

    for partition_label, part in partitions.items():
        storage_key = _get_files(part)[key_table]
        _, _, indices, file_ = decode_key(storage_key)

        if file_ is None:
            continue
        # Only parquet data files contribute; external index files are skipped.
        if not storage_key.endswith(PARQUET_FILE_SUFFIX) or storage_key.endswith(
            EXTERNAL_INDEX_SUFFIX
        ):
            continue

        # The previous depth is threaded through so the helper can compare
        # keys with each other (presumably to enforce a uniform depth;
        # confirm against `_check_index_depth`).
        depth_indices = _check_index_depth(indices, depth_indices)
        for column, value in indices:
            labels_by_column[column][value].add(partition_label)

    result = {}
    for column, labels_by_value in labels_by_column.items():
        column_dtype = _get_type_from_meta(table_meta, column, default_dtype)
        # Turn the nested defaultdict-of-sets into a plain dict of lists.
        plain_index = {value: list(labels) for value, labels in labels_by_value.items()}
        result[column] = PartitionIndex(
            column=column,
            index_dct=plain_index,
            dtype=column_dtype,
        )
    return result