def _get_partition_keys_from_partitions(partitions):
    if len(partitions):
        part = next(iter(partitions.values()))
        files_dct = part["files"]
        if files_dct:
            key = next(iter(files_dct.values()))
            _, _, indices, _ = decode_key(key)
            if indices:
                return [tup[0] for tup in indices]
    return None
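

# Illustrative usage sketch (hypothetical UUID, table and values; assumes ``decode_key``
# parses keys of the form "<uuid>/<table>/<col>=<value>/.../<label>.parquet"):
#
#   partitions = {
#       "location=paris/part_0": {
#           "files": {"table": "my_uuid/table/location=paris/part_0.parquet"}
#       }
#   }
#   _get_partition_keys_from_partitions(partitions)  # expected: ["location"]
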
def _partition_data(self, partition_on):
    existing_indices, base_label = decode_key(
        "uuid/table/{}".format(self.label)
    )[2:]
    dct = dict()
    df = self.data
    # Check that data sizes do not change. This might happen if the
    # groupby below drops data, e.g. nulls
    size_after = 0
    size_before = len(df)

    # Implementation from pyarrow
    # See https://github.com/apache/arrow/blob/b33dfd9c6bd800308bb1619b237dbf24dea159be/python/pyarrow/parquet.py#L1030  # noqa: E501

    # column sanity checks
    data_cols = set(df.columns).difference(partition_on)
    missing_po_cols = set(partition_on).difference(df.columns)
    if missing_po_cols:
        raise ValueError(
            "Partition column(s) missing: {}".format(", ".join(sorted(missing_po_cols)))
        )
    if len(data_cols) == 0:
        raise ValueError("No data left to save outside partition columns")

    # To be aligned with open source tooling we drop the index columns and recreate
    # them upon reading as it is done by fastparquet and pyarrow
    partition_keys = [df[col] for col in partition_on]

    # # The handling of empty dfs is not part of the arrow implementation
    # if df.empty:
    #     return {}

    data_df = df.drop(partition_on, axis="columns")
    for value, group in data_df.groupby(by=partition_keys, sort=False):
        partitioning_info = []

        if pd.api.types.is_scalar(value):
            value = [value]

        if existing_indices:
            partitioning_info.extend(quote_indices(existing_indices))
        partitioning_info.extend(quote_indices(zip(partition_on, value)))
        partitioning_info.append(base_label)
        new_label = "/".join(partitioning_info)

        if new_label not in dct:
            dct[new_label] = {}
        dct[new_label] = group
        size_after += len(group)

    if size_before != size_after:
        raise ValueError(
            f"Original dataframe size ({size_before} rows) does not "
            f"match new dataframe size ({size_after} rows). "
            f"Hint: you may see this if you are trying to use "
            f"`partition_on` on a column with null values."
        )

    return dct
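

# Illustrative sketch (hypothetical label and data): partitioning a dataframe with a
# "location" column on ``["location"]`` yields one dataframe per distinct value, keyed
# by the quoted "<column>=<value>/<label>" path fragment, e.g. for label "part_0":
#
#   {"location=oslo/part_0": <rows with location == "oslo">,
#    "location=rome/part_0": <rows with location == "rome">}
#
# A null in "location" would drop rows in the groupby and trigger the size check above.
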
def _load_partitions_from_filenames(store, storage_keys, metadata_version):
    partitions = defaultdict(_get_empty_partition)
    depth_indices = None
    for key in storage_keys:
        dataset_uuid, table, indices, file_ = decode_key(key)
        if file_ is not None and file_.endswith(PARQUET_FILE_SUFFIX):
            # valid key example:
            # <uuid>/<table>/<column_0>=<value_0>/.../<column_n>=<value_n>/part_label.parquet
            depth_indices = _check_index_depth(indices, depth_indices)
            partition_label = _get_partition_label(indices, file_, metadata_version)
            partitions[partition_label]["files"][table] = key
    return partitions
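

# Usage sketch (hypothetical keys): only keys ending in ".parquet" are considered; they
# are grouped by partition label, keeping one file reference per table:
#
#   storage_keys = [
#       "my_uuid/table/location=oslo/part_0.parquet",
#       "my_uuid/table/location=rome/part_0.parquet",
#   ]
#   _load_partitions_from_filenames(store, storage_keys, metadata_version=4)
#   # expected shape:
#   # {"location=oslo/part_0": {"files": {"table": "my_uuid/table/location=oslo/part_0.parquet"}},
#   #  "location=rome/part_0": {"files": {"table": "my_uuid/table/location=rome/part_0.parquet"}}}
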
def _construct_dynamic_index_from_partitions(
    partitions: Dict[str, Partition],
    table_meta: TableMetaType,
    default_dtype: pa.DataType,
    partition_keys: List[str],
) -> Dict[str, PartitionIndex]:
    if len(partitions) == 0:
        return _empty_partition_indices(partition_keys, table_meta, default_dtype)

    def _get_files(part):
        if isinstance(part, dict):
            return part["files"]
        else:
            return part.files

    # We exploit the fact that all tables are partitioned equally.
    first_partition = next(
        iter(partitions.values())
    )  # partitions is NOT empty here, see check above
    first_partition_files = _get_files(first_partition)
    if not first_partition_files:
        return _empty_partition_indices(partition_keys, table_meta, default_dtype)
    key_table = next(iter(first_partition_files.keys()))
    storage_keys = (
        (key, _get_files(part)[key_table]) for key, part in partitions.items()
    )

    _key_indices: Dict[str, Dict[str, Set[str]]] = defaultdict(_get_empty_index)
    depth_indices = None
    for partition_label, key in storage_keys:
        _, _, indices, file_ = decode_key(key)
        if (
            file_ is not None
            and key.endswith(PARQUET_FILE_SUFFIX)
            and not key.endswith(EXTERNAL_INDEX_SUFFIX)
        ):
            depth_indices = _check_index_depth(indices, depth_indices)
            for column, value in indices:
                _key_indices[column][value].add(partition_label)

    new_indices = {}
    for col, index_dct in _key_indices.items():
        arrow_type = _get_type_from_meta(table_meta, col, default_dtype)

        # convert defaultdicts into dicts
        new_indices[col] = PartitionIndex(
            column=col,
            index_dct={k1: list(v1) for k1, v1 in index_dct.items()},
            dtype=arrow_type,
        )
    return new_indices
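

# Illustrative sketch (hypothetical partitions and dtype): the dynamic index inverts the
# "<column>=<value>" fragments of the file keys into one ``PartitionIndex`` per column,
# mapping each observed value to the labels of the partitions that contain it, e.g.:
#
#   _construct_dynamic_index_from_partitions(
#       partitions, table_meta, default_dtype=pa.string(), partition_keys=["location"]
#   )
#   # expected shape:
#   # {"location": PartitionIndex(
#   #      column="location",
#   #      index_dct={"oslo": ["location=oslo/part_0"], "rome": ["location=rome/part_0"]},
#   #      dtype=pa.string(),
#   #  )}
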
def load_dataframes(
    self,
    store: KeyValueStore,
    columns: Optional[Sequence[str]] = None,
    predicate_pushdown_to_io: bool = True,
    categoricals: Optional[Sequence[str]] = None,
    dates_as_object: bool = True,
    predicates: PredicatesType = None,
) -> "MetaPartition":
    """
    Load the dataframes of the partitions from store into memory.

    Parameters
    ----------
    columns
        If a list is supplied, only the given columns are loaded. Columns
        which are part of the partition keys are reconstructed from the
        storage key; requesting a column that cannot be found raises a
        ``ValueError``.

    Examples
    --------
    .. code::

        >>> part = MetaPartition(
        ...     label='part_label',
        ...     files={
        ...         'core': 'core_key_in_store',
        ...         'helper': 'helper_key_in_store'
        ...     }
        ... )
        >>> part.data
        {}
        >>> part = part.load_dataframes(store, ['core'])
        >>> part.data
        {
            'core': pd.DataFrame()
        }
    """
    if categoricals is None:
        categoricals = []
    if not dates_as_object:
        warnings.warn(
            "The argument `dates_as_object` is set to False. This argument will be "
            "deprecated and the future behaviour will be as if the parameter was "
            "set to `True`. Please migrate your code accordingly ahead of time.",
            DeprecationWarning,
        )

    LOGGER.debug("Loading internal dataframes of %s", self.label)
    if not self.file:
        # This used to raise, but the specs do not require this, so simply do a no-op
        LOGGER.debug("Partition %s is empty and has no data.", self.label)
        return self

    predicates = _combine_predicates(predicates, self.logical_conjunction)
    predicates = _predicates_to_named(predicates)

    dataset_uuid, _, indices, _ = decode_key(self.file)

    # In case the columns only refer to the partition indices, we need to load at
    # least a single column to determine the length of the required dataframe.
    table_columns_to_io = columns

    filtered_predicates = predicates

    self = self.load_schema(dataset_uuid=dataset_uuid, store=store)

    # Filter predicates that would apply to this partition and remove the partition columns
    if predicates:
        # Check if there are predicates that match the partition columns.
        # For these we need to check if the partition columns already falsify
        # the condition.
        #
        # We separate these predicates into their index and their Parquet part.
        (
            split_predicates,
            has_index_condition,
        ) = self._split_predicates_in_index_and_content(predicates)

        filtered_predicates = []
        if has_index_condition:
            filtered_predicates = self._apply_partition_key_predicates(
                indices, split_predicates
            )
        else:
            filtered_predicates = [pred.content_part for pred in split_predicates]

    # Remove partition_keys from table_columns_to_io
    if self.partition_keys and table_columns_to_io is not None:
        keys_to_remove = set(self.partition_keys) & set(table_columns_to_io)
        # This is done to not change the ordering of the list
        table_columns_to_io = [c for c in table_columns_to_io if c not in keys_to_remove]

    start = time.time()
    df = DataFrameSerializer.restore_dataframe(
        key=self.file,
        store=store,
        columns=table_columns_to_io,
        categories=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        predicates=filtered_predicates,
        date_as_object=dates_as_object,
    )
    LOGGER.debug("Loaded dataframe %s in %s seconds.", self.file, time.time() - start)

    # Metadata version >= 4 parses the index columns and adds them back to the dataframe
    df = self._reconstruct_index_columns(
        df=df,
        key_indices=indices,
        columns=columns,
        categories=categoricals,
        date_as_object=dates_as_object,
    )

    df.columns = df.columns.map(ensure_string_type)
    if columns is not None:
        # TODO: When the write-path ensures that all partitions have the same
        # column set, this check can be moved before
        # `DataFrameSerializer.restore_dataframe`.
        # At the position of the current check we may want to double check the
        # columns of the loaded DF and raise an exception indicating an
        # inconsistent dataset state instead.
        missing_cols = set(columns).difference(df.columns)
        if missing_cols:
            raise ValueError(
                "Columns cannot be found in stored dataframe: {}".format(
                    ", ".join(sorted(missing_cols))
                )
            )

        if list(df.columns) != columns:
            df = df.reindex(columns=columns, copy=False)

    return self.copy(data=df)
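

# Usage sketch (hypothetical store and column names): restrict IO to the requested
# columns and push a predicate down to the Parquet reader; partition-key columns are
# reconstructed from the storage key rather than read from the file:
#
#   mp = mp.load_dataframes(
#       store=store,
#       columns=["location", "x"],
#       predicates=[[("location", "==", "oslo")]],
#   )
#   mp.data  # pandas DataFrame containing only "location" and "x"
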