Example #1
 def _validate_dtype(self, df: ddDataFrame) -> None:
     """
     Validating setter method for self._external_file
     Checks that the input is of type dd.DataFrame
     """
     if not isinstance(df, ddDataFrame):
         raise DatasetError("Dask Datasets must be of type `dd.DataFrame`")
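A minimal self-contained sketch of the same validation pattern (the `DatasetError` class below is a stand-in for the library's exception; dask is assumed to be installed):

    import pandas as pd
    import dask.dataframe as dd
    from dask.dataframe import DataFrame as ddDataFrame

    class DatasetError(Exception):
        """Stand-in for the library's DatasetError"""

    def _validate_dtype(df) -> None:
        if not isinstance(df, ddDataFrame):
            raise DatasetError("Dask Datasets must be of type `dd.DataFrame`")

    ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2]}), npartitions=1)
    _validate_dtype(ddf)                # passes silently
    # _validate_dtype(pd.DataFrame())   # would raise DatasetError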
Example #2
    def _get(dataframe: pd.DataFrame, columns: List[str],
             split: str) -> pd.DataFrame:
        """
        Internal method to extract data subsets from a dataframe

        :param dataframe: the dataframe to subset from
        :param columns: List of columns to slice from the dataframe
        :param split: row identifiers to slice rows (in internal column mapped to `DATAFRAME_SPLIT_COLUMN`)
        """
        if split is not None:  # Filter the dataframe down to the requested split
            # query returns a copy with a weakref to the original
            if DATAFRAME_SPLIT_COLUMN not in dataframe.columns:
                raise DatasetError(
                    "Cannot retrieve dataset split `{split}` from dataframe without `{DATAFRAME_SPLIT_COLUMN}` column"
                )
            dataframe = dataframe.query("{}=='{}'".format(
                DATAFRAME_SPLIT_COLUMN, split))

        # inplace drop extra columns
        drop_columns = [col for col in dataframe.columns if col not in columns]
        if drop_columns:
            dataframe.drop(drop_columns, axis=1, inplace=True)

        # Last check in case any of the operations created a view or weakref copy
        if (hasattr(dataframe, "_is_view") and dataframe._is_view) or (hasattr(
                dataframe, "_is_copy") and dataframe._is_copy is not None):
            dataframe = dataframe.copy()

        return dataframe
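To make the slicing contract concrete, here is a self-contained sketch of the same query-then-drop flow on a toy frame (the `DATAFRAME_SPLIT_COLUMN` value is an assumption; the examples never show it):

    import pandas as pd

    DATAFRAME_SPLIT_COLUMN = "DATASET_SPLIT"  # assumed sentinel column name

    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [4, 5, 6],
        DATAFRAME_SPLIT_COLUMN: ["TRAIN", "TRAIN", "TEST"],
    })

    # filter rows to one split, then keep only the requested columns
    subset = df.query("{}=='{}'".format(DATAFRAME_SPLIT_COLUMN, "TRAIN"))
    subset = subset.drop([c for c in subset.columns if c not in ["a"]], axis=1)
    print(subset)  # rows 0-1, column `a` only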
Example #3
    def save(self, **kwargs):
        '''
        Extend parent function with a few additional save routines
        '''
        if self.pipeline is None:
            raise DatasetError('Must set dataset pipeline before saving')

        super(BaseProcessedDataset, self).save(**kwargs)

        # SQLAlchemy updates relationship references after save, so reload the class
        self.pipeline.load(load_externals=False)
Example #4
    def build_dataframe(self) -> None:
        """
        Transform raw dataset via dataset pipeline for production ready dataset
        Overwrite this method to disable raw dataset requirement
        """
        if self.pipeline is None:
            raise DatasetError("Must set pipeline before building dataframe")

        split_names = self.pipeline.get_split_names()
        self.dataframe = {
            split_name: self.pipeline.transform(X=None, split=split_name)
            for split_name in split_names
        }
Example #5
 def __init__(self,
              filepath: str,
              format: str,
              reader_params: Optional[Dict] = None,
              **kwargs):
     super().__init__(**kwargs)
     if format not in DASK_READER_MAP:
         raise DatasetError(
             f"No reader configured for provided file format: {format}")
     self.config.update({
         "filepath": filepath,
         "format": format,
         "reader_params": reader_params or {},
     })
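`DASK_READER_MAP` itself is not shown in these examples; a plausible shape, assuming it maps file formats to dask reader callables:

    import dask.dataframe as dd

    # hypothetical registry keyed by file format
    DASK_READER_MAP = {
        "csv": dd.read_csv,
        "parquet": dd.read_parquet,
        "json": dd.read_json,
    }

    # dispatch mirroring the constructor's validation above (helper name hypothetical)
    def load_dataframe(filepath: str, format: str, **reader_params):
        return DASK_READER_MAP[format](filepath, **reader_params)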
Example #6
    def build_dataframe(self):
        '''
        Transform raw dataset via dataset pipeline for production ready dataset
        '''
        if self.pipeline is None:
            raise DatasetError('Must set pipeline before building dataframe')

        X, y = self.pipeline.transform(X=None, return_y=True)

        if y is None:
            y = pd.DataFrame()

        self.config['label_columns'] = y.columns.tolist()
        self._external_file = pd.concat([X, y], axis=1)
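The final concat joins features and labels column-wise; a quick illustration with plain pandas objects:

    import pandas as pd

    X = pd.DataFrame({"f1": [1, 2], "f2": [3, 4]})
    y = pd.DataFrame({"label": [0, 1]})

    combined = pd.concat([X, y], axis=1)  # column-wise join, aligned on index
    print(combined.columns.tolist())      # ['f1', 'f2', 'label']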
Example #7
    def __init__(
        self,
        has_external_files: bool = True,
        label_columns: Optional[List[str]] = None,
        other_named_split_sections: Optional[Dict[str, List[str]]] = None,
        pipeline_id: Optional[Union[str, uuid.UUID]] = None,
        **kwargs,
    ):
        """
        :param label_columns: Optional list of column names to register as the "y" split section
        :param other_named_split_sections: Optional map of section names to lists of column names for
            other arbitrary split sections -- must match expected consumer signatures (e.g. sample_weights)
            because they are passed through untouched downstream (e.g. sklearn.fit(**split))
        All other columns in the dataframe will automatically be referenced as "X"
        """
        # If no save patterns are set, specify a default for disk_pickled
        if "save_patterns" not in kwargs:
            kwargs["save_patterns"] = {"dataset": ["disk_pickled"]}
        super().__init__(has_external_files=has_external_files, **kwargs)

        # split sections are an optional set of inputs to register split references
        # for later use. defaults to just `X` and `y` but arbitrary inputs can
        # be passed (eg sample_weights, etc)

        # validate input
        if other_named_split_sections is None:
            other_named_split_sections = {}
        else:
            for k, v in other_named_split_sections.items():
                if not isinstance(v, (list, tuple)):
                    raise DatasetError(
                        f"Split sections must be a map of section reference (eg `y`) to list of columns. {k}: {v} passed instead"
                    )

        self.config["split_section_map"] = {
            # y maps to label columns (by default assume unsupervised so no targets)
            "y": label_columns or [],
            # arbitrary passed others
            **other_named_split_sections
            # everything else automatically becomes "X"
        }

        # initialize null pipeline reference
        self.pipeline_id = pipeline_id
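How the `split_section_map` ultimately partitions a dataframe's columns is implied rather than shown; a minimal sketch of the likely resolution logic (the helper name is hypothetical):

    def resolve_sections(columns, split_section_map):
        # every explicitly named section claims its columns; the rest become "X"
        named = {col for cols in split_section_map.values() for col in cols}
        sections = {name: list(cols) for name, cols in split_section_map.items()}
        sections["X"] = [col for col in columns if col not in named]
        return sections

    split_section_map = {"y": ["label"], "sample_weights": ["weight"]}
    print(resolve_sections(["f1", "f2", "label", "weight"], split_section_map))
    # {'y': ['label'], 'sample_weights': ['weight'], 'X': ['f1', 'f2']}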
Example #8
    def build_dataframe(self):
        '''
        Transform raw dataset via dataset pipeline for production ready dataset
        Overwrite this method to disable raw dataset requirement
        '''
        if self.pipeline is None:
            raise DatasetError('Must set pipeline before building dataframe')

        split_names = self.pipeline.get_split_names()
        splits = [self.pipeline.transform(X=None, split=split_name) for split_name in split_names]
        merged_splits = [self.merge_split(split) for split in splits]

        if splits[0].y is not None and not self.config['label_columns']:  # Propagate old labels to new dataset
            self.config['label_columns'] = splits[0].y.columns.tolist()

        if len(merged_splits) > 1:  # Combine multiple splits
            # Join row wise - drop index in case duplicates exist
            self._external_file = pd.concat(merged_splits, axis=0, ignore_index=True)
        else:
            self._external_file = merged_splits[0]
Example #9
    def build_dataframe(self) -> None:
        """
        Transform raw dataset via dataset pipeline for production ready dataset
        """
        if self.pipeline is None:
            raise DatasetError("Must set pipeline before building dataframe")

        split_names = self.pipeline.get_split_names()
        splits = [
            self.pipeline.transform(X=None, split=split_name)
            for split_name in split_names
        ]
        merged_splits = [self.merge_split(split) for split in splits]

        if len(merged_splits) > 1:  # Combine multiple splits
            # Join row wise - drop index in case duplicates exist
            self.dataframe = self.concatenate_dataframes(
                merged_splits, split_names)
        else:
            self.dataframe = merged_splits[0]
Example #10
    def _get(dataframe: ddDataFrame, columns: List[str],
             split: str) -> ddDataFrame:
        """
        Internal method to extract data subsets from a dataframe

        :param dataframe: the dataframe to subset from
        :param columns: List of columns to slice from the dataframe
        :param split: row identifiers to slice rows (in internal column mapped to `DATAFRAME_SPLIT_COLUMN`)
        """
        if split is not None:  # Filter the dataframe down to the requested split
            # query returns a copy with a weakref to the original
            if DATAFRAME_SPLIT_COLUMN not in dataframe.columns:
                raise DatasetError(
                    f"Cannot retrieve dataset split `{split}` from dataframe without `{DATAFRAME_SPLIT_COLUMN}` column"
                )
            dataframe = dataframe.query("{}=='{}'".format(
                DATAFRAME_SPLIT_COLUMN, split))

        # drop extra columns
        drop_columns = [col for col in dataframe.columns if col not in columns]
        if drop_columns:
            dataframe = dataframe.drop(drop_columns, axis=1)

        return dataframe
Example #11
    def build_dataframe(self):
        '''
        Transform raw dataset via dataset pipeline for production ready dataset
        Overwrite this method to disable raw dataset requirement
        '''
        if self.pipeline is None:
            raise DatasetError('Must set pipeline before building dataframe')

        split_names = self.pipeline.get_split_names()
        splits = [(split_name, self.pipeline.transform(X=None, split=split_name)) for split_name in split_names]

        if splits[0][1].y is not None and not self.config['label_columns']:
            # If there is a y, explicitly label it
            self.config['label_columns'] = ['y']

        # Default the label key if no labels were registered
        y_label = self.config['label_columns'][0] if self.config['label_columns'] else 'y'

        # Assume propagating logic since there is no clear way to join
        self._external_file = {
            split_name: {
                'X': split.X,
                y_label: split.y
            } for split_name, split in splits
        }