def _validate_dtype(self, df: ddDataFrame) -> None:
    """
    Validating setter method for self._external_file
    Checks input is of type dd.DataFrame
    """
    if not isinstance(df, ddDataFrame):
        raise DatasetError("Dask Datasets must be of type `dd.DataFrame`")
def _get(dataframe: pd.DataFrame, columns: List[str], split: str) -> pd.DataFrame:
    """
    Internal method to extract data subsets from a dataframe

    :param dataframe: the dataframe to subset from
    :param columns: List of columns to slice from the dataframe
    :param split: row identifiers to slice rows (in internal column mapped to `DATAFRAME_SPLIT_COLUMN`)
    """
    if split is not None:
        # Filter rows to the requested split; when split is None the full
        # dataset (all splits) is returned unchanged.
        # query automatically returns a copy with a weakref
        if DATAFRAME_SPLIT_COLUMN not in dataframe.columns:
            raise DatasetError(
                f"Cannot retrieve dataset split `{split}` from dataframe without `{DATAFRAME_SPLIT_COLUMN}` column"
            )
        dataframe = dataframe.query(f"{DATAFRAME_SPLIT_COLUMN}=='{split}'")

    # inplace drop extra columns
    drop_columns = [col for col in dataframe.columns if col not in columns]
    if drop_columns:
        dataframe.drop(drop_columns, axis=1, inplace=True)

    # Last check in case any of the operations created a view or weakref copy
    if (hasattr(dataframe, "_is_view") and dataframe._is_view) or (
            hasattr(dataframe, "_is_copy") and dataframe._is_copy is not None):
        dataframe = dataframe.copy()

    return dataframe
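# --- Illustrative sketch (not library code): the row/column subsetting performed by
# `_get` above, on a toy frame. Assumes the split column is literally named
# "DATASET_SPLIT"; in the library the name comes from `DATAFRAME_SPLIT_COLUMN`.
import pandas as pd

toy = pd.DataFrame({
    "feature": [1, 2, 3],
    "label": [0, 1, 0],
    "DATASET_SPLIT": ["TRAIN", "TEST", "TRAIN"],
})
subset = toy.query("DATASET_SPLIT=='TRAIN'")              # rows for the requested split
subset = subset.drop(["DATASET_SPLIT", "label"], axis=1)  # keep only the requested columns
print(subset)  # -> the TRAIN rows with just the `feature` column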
def save(self, **kwargs):
    '''
    Extend parent function with a few additional save routines
    '''
    if self.pipeline is None:
        raise DatasetError('Must set dataset pipeline before saving')

    super(BaseProcessedDataset, self).save(**kwargs)

    # Sqlalchemy updates relationship references after save so reload class
    self.pipeline.load(load_externals=False)
def build_dataframe(self) -> None:
    """
    Transform raw dataset via dataset pipeline for production ready dataset

    Overwrite this method to disable raw dataset requirement
    """
    if self.pipeline is None:
        raise DatasetError("Must set pipeline before building dataframe")

    split_names = self.pipeline.get_split_names()
    self.dataframe = {
        split_name: self.pipeline.transform(X=None, split=split_name)
        for split_name in split_names
    }
def __init__(self,
             filepath: str,
             format: str,
             reader_params: Optional[Dict] = None,
             **kwargs):
    super().__init__(**kwargs)
    if format not in DASK_READER_MAP:
        raise DatasetError(
            f"No reader configured for provided file format: {format}")
    self.config.update({
        "filepath": filepath,
        "format": format,
        "reader_params": reader_params or {},
    })
def build_dataframe(self):
    '''
    Transform raw dataset via dataset pipeline for production ready dataset
    '''
    if self.pipeline is None:
        raise DatasetError('Must set pipeline before building dataframe')

    X, y = self.pipeline.transform(X=None, return_y=True)
    if y is None:
        y = pd.DataFrame()

    self.config['label_columns'] = y.columns.tolist()
    self._external_file = pd.concat([X, y], axis=1)
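# --- Illustrative sketch (not library code): how the transformed X and y frames are
# recorded and merged column-wise above, shown with toy data.
import pandas as pd

X = pd.DataFrame({"feature": [1.0, 2.0]})
y = pd.DataFrame({"label": [0, 1]})
label_columns = y.columns.tolist()         # -> ["label"], stored in the dataset config
external_file = pd.concat([X, y], axis=1)  # features and labels side by side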
def __init__(
    self,
    has_external_files: bool = True,
    label_columns: Optional[List[str]] = None,
    other_named_split_sections: Optional[Dict[str, List[str]]] = None,
    pipeline_id: Optional[Union[str, uuid.UUID]] = None,
    **kwargs,
):
    """
    :param label_columns: Optional list of column names to register as the "y" split section
    :param other_named_split_sections: Optional map of section names to lists of column names
        for other arbitrary split sections -- must match expected consumer signatures
        (e.g. sample_weights) because they are passed through untouched downstream
        (e.g. sklearn.fit(**split))

    All other columns in the dataframe will automatically be referenced as "X"
    """
    # If no save patterns are set, specify a default for disk_pickled
    if "save_patterns" not in kwargs:
        kwargs["save_patterns"] = {"dataset": ["disk_pickled"]}
    super().__init__(has_external_files=has_external_files, **kwargs)

    # Split sections are an optional set of inputs to register split references
    # for later use. Defaults to just `X` and `y`, but arbitrary inputs can
    # be passed (e.g. sample_weights, etc)

    # Validate input
    if other_named_split_sections is None:
        other_named_split_sections = {}
    else:
        for k, v in other_named_split_sections.items():
            if not isinstance(v, (list, tuple)):
                raise DatasetError(
                    f"Split sections must be a map of section reference (e.g. `y`) to a list of columns. {k}: {v} passed instead"
                )

    self.config["split_section_map"] = {
        # y maps to label columns (by default assume unsupervised, so no targets)
        "y": label_columns or [],
        # arbitrary passed others
        **other_named_split_sections,
        # everything else automatically becomes "X"
    }

    # Initialize pipeline reference (may be null)
    self.pipeline_id = pipeline_id
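# --- Illustrative sketch (not library code): how the split section map above is
# assembled from the constructor arguments. The argument values are hypothetical.
label_columns = ["label"]
other_named_split_sections = {"sample_weights": ["weight"]}

split_section_map = {
    # y maps to label columns (empty list when unsupervised)
    "y": label_columns or [],
    # arbitrary extra sections pass through untouched
    **other_named_split_sections,
    # every remaining dataframe column is treated as "X" downstream
}
assert split_section_map == {"y": ["label"], "sample_weights": ["weight"]}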
def build_dataframe(self):
    '''
    Transform raw dataset via dataset pipeline for production ready dataset

    Overwrite this method to disable raw dataset requirement
    '''
    if self.pipeline is None:
        raise DatasetError('Must set pipeline before building dataframe')

    split_names = self.pipeline.get_split_names()
    splits = [self.pipeline.transform(X=None, split=split_name)
              for split_name in split_names]
    merged_splits = [self.merge_split(split) for split in splits]

    if splits[0].y is not None and not self.config['label_columns']:
        # Propagate old labels to new dataset
        self.config['label_columns'] = splits[0].y.columns.tolist()

    if len(merged_splits) > 1:
        # Combine multiple splits
        # Join row wise - drop index in case duplicates exist
        self._external_file = pd.concat(merged_splits, axis=0, ignore_index=True)
    else:
        self._external_file = merged_splits[0]
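# --- Illustrative sketch (not library code): combining per-split frames row-wise as
# above, dropping the index so duplicate index labels from separate splits don't collide.
import pandas as pd

train = pd.DataFrame({"feature": [1, 2]})
test = pd.DataFrame({"feature": [3]})
combined = pd.concat([train, test], axis=0, ignore_index=True)  # fresh index 0..2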
def build_dataframe(self) -> None:
    """
    Transform raw dataset via dataset pipeline for production ready dataset
    """
    if self.pipeline is None:
        raise DatasetError("Must set pipeline before building dataframe")

    split_names = self.pipeline.get_split_names()
    splits = [
        self.pipeline.transform(X=None, split=split_name)
        for split_name in split_names
    ]
    merged_splits = [self.merge_split(split) for split in splits]

    if len(merged_splits) > 1:
        # Combine multiple splits
        # Join row wise - drop index in case duplicates exist
        self.dataframe = self.concatenate_dataframes(merged_splits, split_names)
    else:
        self.dataframe = merged_splits[0]
def _get(dataframe: ddDataFrame, columns: List[str], split: str) -> ddDataFrame:
    """
    Internal method to extract data subsets from a dataframe

    :param dataframe: the dataframe to subset from
    :param columns: List of columns to slice from the dataframe
    :param split: row identifiers to slice rows (in internal column mapped to `DATAFRAME_SPLIT_COLUMN`)
    """
    if split is not None:
        # Filter rows to the requested split; when split is None the full
        # dataset (all splits) is returned unchanged.
        # query automatically returns a copy with a weakref
        if DATAFRAME_SPLIT_COLUMN not in dataframe.columns:
            raise DatasetError(
                f"Cannot retrieve dataset split `{split}` from dataframe without `{DATAFRAME_SPLIT_COLUMN}` column"
            )
        dataframe = dataframe.query(f"{DATAFRAME_SPLIT_COLUMN}=='{split}'")

    # drop extra columns
    drop_columns = [col for col in dataframe.columns if col not in columns]
    if drop_columns:
        dataframe = dataframe.drop(drop_columns, axis=1)

    return dataframe
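# --- Illustrative sketch (not library code): the same subsetting on a dask dataframe.
# Assumes the split column is literally named "DATASET_SPLIT"; the library uses
# `DATAFRAME_SPLIT_COLUMN`. Requires dask[dataframe] to be installed.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({
    "feature": [1, 2, 3],
    "DATASET_SPLIT": ["TRAIN", "TEST", "TRAIN"],
})
ddf = dd.from_pandas(pdf, npartitions=1)
subset = ddf.query("DATASET_SPLIT=='TRAIN'").drop(["DATASET_SPLIT"], axis=1)
print(subset.compute())  # lazily filtered, materialized only on compute()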
def build_dataframe(self):
    '''
    Transform raw dataset via dataset pipeline for production ready dataset

    Overwrite this method to disable raw dataset requirement
    '''
    if self.pipeline is None:
        raise DatasetError('Must set pipeline before building dataframe')

    split_names = self.pipeline.get_split_names()
    splits = [(split_name, self.pipeline.transform(X=None, split=split_name))
              for split_name in split_names]

    if splits[0][1].y is not None and not self.config['label_columns']:
        # If there is a Y, explicitly label it
        self.config['label_columns'] = ['y']

    y_label = self.config['label_columns'][0]

    # Assume propagating logic since there is no clear way to join
    self._external_file = {
        split_name: {
            'X': split.X,
            y_label: split.y
        }
        for split_name, split in splits
    }