def get_indices_as_dataframe(
    self,
    columns: Optional[List[str]] = None,
    date_as_object: bool = True,
    predicates: PredicatesType = None,
):
    """
    Converts the dataset indices to a pandas dataframe and filter relevant
    indices by `predicates`.

    For a dataset with indices on columns `column_a` and `column_b` and
    three partitions, the dataset output may look like

    .. code::

                column_a column_b
        part_1         1        A
        part_2         2        B
        part_3         3     None

    Parameters
    ----------
    columns
        Index columns to include. ``None`` selects every loaded index;
        an empty list returns a frame carrying only the partition labels.
    date_as_object
        Forwarded to the conjunction evaluation.
    predicates
        Disjunction of conjunctions used to filter the index entries.
    """
    # Ensure partition-key indices are available before evaluating.
    # Skipped for `columns == []` since that path never touches indices.
    if not self.primary_indices_loaded and columns != []:
        # self.load_partition_indices is not inplace
        dm = self.load_partition_indices()
    else:
        dm = self

    if columns is None:
        columns = sorted(dm.indices.keys())
    if columns == []:
        # No index columns requested: only the partition labels remain.
        return pd.DataFrame(index=dm.partitions)

    if predicates:
        # Columns referenced by predicates must also be scanned so the
        # filter can be applied, even if the caller did not request them.
        # NOTE(review): the intersection uses `self.indices`, not
        # `dm.indices` — confirm this is intended when indices were
        # (re)loaded into `dm` above.
        predicate_columns = columns_in_predicates(predicates)
        columns_to_scan = sorted(
            (predicate_columns & self.indices.keys()) | set(columns)
        )
        # Evaluate each conjunction lazily and concatenate the results
        # (a disjunction is the union of its conjunctions' rows).
        dfs = (
            dm._evaluate_conjunction(
                columns=columns_to_scan,
                predicates=[conjunction],
                date_as_object=date_as_object,
            )
            for conjunction in predicates
        )
        df = pd.concat(dfs)
        # Different conjunctions may produce overlapping rows; reduce to
        # the requested columns and deduplicate while keeping the index.
        index_name = df.index.name
        df = (
            df.loc[:, columns].reset_index().drop_duplicates().set_index(index_name)
        )
    else:
        df = dm._evaluate_conjunction(
            columns=columns,
            predicates=None,
            date_as_object=date_as_object,
        )
    return df
def dispatch_metapartitions_from_factory(
    dataset_factory,
    label_filter=None,
    concat_partitions_on_primary_index=False,
    predicates=None,
    store=None,
    dispatch_by=None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """
    Yield :class:`MetaPartition` objects for the dataset behind
    ``dataset_factory``.

    If ``dispatch_by`` is given, partitions sharing the same values for the
    listed (indexed) columns are yielded together as a list; otherwise one
    MetaPartition is yielded per partition label.

    ``store`` is accepted but not referenced in this function body.
    """
    if not callable(dataset_factory) and not isinstance(
        dataset_factory, DatasetFactory
    ):
        raise TypeError("Need to supply a dataset factory!")

    # `concat_partitions_on_primary_index` is the deprecated spelling of
    # `dispatch_by=partition_keys`; supplying both is ambiguous.
    if dispatch_by and concat_partitions_on_primary_index:
        raise ValueError(
            "Both `dispatch_by` and `concat_partitions_on_primary_index` are provided, "
            "`concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. "
            "Please only provide the `dispatch_by` argument. "
        )
    if concat_partitions_on_primary_index:
        warnings.warn(
            "The keyword `concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. Use `dispatch_by=dataset_factory.partition_keys` to achieve the same behavior instead.",
            DeprecationWarning,
        )
        dispatch_by = dataset_factory.partition_keys

    # Grouping is only possible on columns that carry an index.
    if dispatch_by and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)
    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(
            dataset_factory.index_columns
        )
        index_cols |= predicate_index_cols
    for col in index_cols:
        dataset_factory.load_index(col)

    # One row per (partition, index value), filtered by the predicates.
    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if label_filter:
        # Keep only partition labels accepted by the user callback.
        base_df = base_df[base_df.index.map(label_filter)]

    # Ship secondary indices in unloaded form so workers can lazily load them.
    indices_to_dispatch = {
        name: ix.unload()
        for name, ix in dataset_factory.indices.items()
        if isinstance(ix, ExplicitSecondaryIndex)
    }

    if dispatch_by:
        base_df = cast(pd.DataFrame, base_df)

        # Group the resulting MetaParitions by partition keys or a subset of those keys
        merged_partitions = base_df.groupby(
            by=list(dispatch_by), sort=False, as_index=False
        )
        for group_name, group in merged_partitions:
            # pandas returns a scalar for single-column groupbys.
            if not isinstance(group_name, tuple):
                group_name = (group_name,)
            mps = []
            # Record the group's defining equalities so downstream code
            # knows which predicate this batch satisfies.
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        dataset_metadata=dataset_factory.metadata,
                        indices=indices_to_dispatch,
                        metadata_version=dataset_factory.metadata_version,
                        table_meta=dataset_factory.table_meta,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]
            yield MetaPartition.from_partition(
                partition=part,
                dataset_metadata=dataset_factory.metadata,
                indices=indices_to_dispatch,
                metadata_version=dataset_factory.metadata_version,
                table_meta=dataset_factory.table_meta,
                partition_keys=dataset_factory.partition_keys,
            )
def dispatch_metapartitions_from_factory(
    dataset_factory: DatasetFactory,
    label_filter: Optional[Callable] = None,
    concat_partitions_on_primary_index: bool = False,
    predicates: PredicatesType = None,
    store: Optional[StoreInput] = None,
    dispatch_by: Optional[List[str]] = None,
    dispatch_metadata: bool = False,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """
    Yield :class:`MetaPartition` objects for the dataset behind
    ``dataset_factory``.

    If ``dispatch_by`` is given, partitions sharing the same values for the
    listed (indexed) columns are yielded together as a list; otherwise one
    MetaPartition is yielded per partition label. When ``dispatch_metadata``
    is false, dataset metadata and secondary indices are not attached to the
    emitted MetaPartitions.

    ``store`` is accepted but not referenced in this function body.

    :meta private:
    """
    # `concat_partitions_on_primary_index` is the deprecated spelling of
    # `dispatch_by=partition_keys`; supplying both is ambiguous.
    if dispatch_by is not None and concat_partitions_on_primary_index:
        raise ValueError(
            "Both `dispatch_by` and `concat_partitions_on_primary_index` are provided, "
            "`concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. "
            "Please only provide the `dispatch_by` argument. "
        )
    if concat_partitions_on_primary_index:
        dispatch_by = dataset_factory.partition_keys

    # Grouping is only possible on columns that carry an index.
    if dispatch_by is not None and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)
    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(dataset_factory.index_columns)
        index_cols |= predicate_index_cols
    for col in index_cols:
        dataset_factory.load_index(col)

    # One row per (partition, index value), filtered by the predicates.
    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if label_filter:
        # Keep only partition labels accepted by the user callback.
        base_df = base_df[base_df.index.map(label_filter)]

    # Ship secondary indices in unloaded form so workers can lazily load them.
    indices_to_dispatch = {
        name: ix.unload()
        for name, ix in dataset_factory.indices.items()
        if isinstance(ix, ExplicitSecondaryIndex)
    }

    if dispatch_by is not None:
        base_df = cast(pd.DataFrame, base_df)

        if len(dispatch_by) == 0:
            # `dispatch_by=[]` means "everything in one group".
            merged_partitions = [((""), base_df)]
        else:
            # Group the resulting MetaParitions by partition keys or a subset of those keys
            merged_partitions = base_df.groupby(
                by=list(dispatch_by), sort=True, as_index=False
            )
        for group_name, group in merged_partitions:
            # pandas returns a scalar for single-column groupbys.
            if not isinstance(group_name, tuple):
                group_name = (group_name,)  # type: ignore
            mps = []
            # Record the group's defining equalities so downstream code
            # knows which predicate this batch satisfies.
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        dataset_metadata=dataset_factory.metadata
                        if dispatch_metadata
                        else None,
                        indices=indices_to_dispatch if dispatch_metadata else None,
                        metadata_version=dataset_factory.metadata_version,
                        table_meta=dataset_factory.table_meta,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]
            yield MetaPartition.from_partition(
                partition=part,
                dataset_metadata=dataset_factory.metadata
                if dispatch_metadata
                else None,
                indices=indices_to_dispatch if dispatch_metadata else None,
                metadata_version=dataset_factory.metadata_version,
                table_meta=dataset_factory.table_meta,
                partition_keys=dataset_factory.partition_keys,
            )
def get_indices_as_dataframe(
    self,
    columns: Optional[List[str]] = None,
    date_as_object: bool = True,
    predicates: "PredicatesType" = None,
):
    """
    Converts the dataset indices to a pandas dataframe.

    For a dataset with indices on columns `column_a` and `column_b` and
    three partitions, the dataset output may look like

    .. code::

                column_a column_b
        part_1         1        A
        part_2         2        B
        part_3         3     None

    Parameters
    ----------
    columns
        Index columns to include. ``None`` selects every loaded index;
        an empty selection returns a frame carrying only the partition
        labels as index.
    date_as_object
        Forwarded to ``as_flat_series`` of each index.
    predicates
        Optional predicates used to pre-filter the index entries; columns
        referenced by predicates are scanned in addition to `columns`.

    Returns
    -------
    pd.DataFrame
        One row per (partition, index-value) combination, indexed by
        partition label.

    Raises
    ------
    RuntimeError
        If a requested column is a partition key whose index has not been
        loaded yet.
    ValueError
        If a requested column has no index at all.
    """
    if columns is None:
        columns = sorted(self.indices.keys())
    # Robustness fix: check for emptiness *after* the None-default is
    # resolved. Previously, `columns=None` on a dataset without indices
    # fell through and crashed on `sorted_dfs.pop(0)` below.
    if not columns:
        return pd.DataFrame(index=self.partitions)

    # Predicate columns must be scanned as well so the filter can be
    # applied. Don't use set logic to preserve order.
    columns_to_scan = list(columns)
    if predicates:
        for col in columns_in_predicates(predicates):
            if col not in columns_to_scan and col in self.indices:
                columns_to_scan.append(col)

    dfs = []
    for col in columns_to_scan:
        if col not in self.indices:
            if col in self.partition_keys:
                raise RuntimeError(
                    "Partition indices not loaded. Please call `DatasetMetadata.load_partition_keys` first."
                )
            # Bug fix: the column name was never substituted into the
            # message (the `{}` placeholder was emitted verbatim).
            raise ValueError(f"Index `{col}` unknown.")
        dfs.append(
            pd.DataFrame(
                self.indices[col].as_flat_series(
                    partitions_as_index=True,
                    date_as_object=date_as_object,
                    predicates=predicates,
                )
            )
        )

    # Start joining with the small frames to keep intermediates small.
    sorted_dfs = sorted(dfs, key=len)
    result = sorted_dfs.pop(0)
    for df in sorted_dfs:
        result = result.merge(df, left_index=True, right_index=True, copy=False)

    if predicates:
        # Predicate scanning may have added extra columns and duplicate
        # rows; reduce to the requested columns and deduplicate while
        # keeping the partition-label index.
        index_name = result.index.name
        result = (
            result.loc[:, columns].reset_index().drop_duplicates().set_index(index_name)
        )
    return result
def dispatch_metapartitions_from_factory(
    dataset_factory: DatasetFactory,
    predicates: PredicatesType = None,
    dispatch_by: Optional[List[str]] = None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """
    :meta private:
    """
    # Grouping via `dispatch_by` requires every listed column to be indexed.
    if dispatch_by is not None and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded: the grouping columns plus
    # every predicate column that actually carries an index.
    required_indices: Set[str] = set(dispatch_by) if dispatch_by else set()
    if predicates:
        required_indices |= set(columns_in_predicates(predicates)) & set(
            dataset_factory.index_columns
        )
    for index_col in required_indices:
        dataset_factory.load_index(index_col)

    # One row per (partition, index value), already filtered by predicates.
    base_df = dataset_factory.get_indices_as_dataframe(
        list(required_indices), predicates=predicates
    )

    if dispatch_by is None:
        # Plain dispatch: one MetaPartition per partition label.
        for label in base_df.index.unique():
            yield MetaPartition.from_partition(
                partition=dataset_factory.partitions[label],
                metadata_version=dataset_factory.metadata_version,
                schema=dataset_factory.schema,
                partition_keys=dataset_factory.partition_keys,
                table_name=dataset_factory.table_name,
            )
        return

    base_df = cast(pd.DataFrame, base_df)
    if dispatch_by:
        # Group the resulting MetaPartitions by the partition keys or a
        # subset of those keys.
        groups = base_df.groupby(by=list(dispatch_by), sort=True, as_index=False)
    else:
        # An explicitly empty `dispatch_by` collapses everything into a
        # single group.
        groups = [("", base_df)]

    for group_key, group_frame in groups:
        # pandas hands back a scalar for single-column groupbys; normalize
        # to a tuple so zipping below works either way.
        if not isinstance(group_key, tuple):
            group_key = (group_key,)  # type: ignore
        # Equalities that define this group; downstream consumers use them
        # as the batch's logical predicate.
        group_conjunction = [
            (column, "==", value) for column, value in zip(dispatch_by, group_key)
        ]
        yield [
            MetaPartition.from_partition(
                partition=dataset_factory.partitions[label],
                metadata_version=dataset_factory.metadata_version,
                schema=dataset_factory.schema,
                partition_keys=dataset_factory.partition_keys,
                logical_conjunction=group_conjunction,
                table_name=dataset_factory.table_name,
            )
            for label in group_frame.index.unique()
        ]