def transform(self, X=None, y=None, dataset=None):
        """
        Narrow a dataset through the first configured access payload.

        The payloads are tried in the order designDap, targetDap,
        datasetDap; only the first truthy one is applied.

        Args:
            X (object): not read by this method
            y (object): not read by this method
            dataset (object): dataset exposing designData / targetData

        Returns:
            object: the dataset unchanged when the transformer is fitOnly
            and already has features, or when no payload is configured;
            otherwise the result of access for the selected payload
        """
        if self.fitOnly and getattr(self, 'features', None):
            print('fitOnly transformer')
            return dataset

        # Each branch unpacks the very payload its guard tests, so a
        # configured payload is never looked up under a different name.
        if self.designDap:
            dataset = access(dataset.designData, **self.designDap)
        elif self.targetDap:
            dataset = access(dataset.targetData, **self.targetDap)
        elif self.datasetDap:
            dataset = access(dataset, **self.datasetDap)
        return dataset
def test_access(o):
    """
    Check helpers.access for empty routes, a one-step route,
    method dispatch and callbacks
    """
    access = helpers.access

    # Every spelling of an empty route returns the very same object
    for emptyRoute in ([], [''], ''):
        assert access(o, emptyRoute) is o
    assert access(o, ['a']) == 1

    # Method 'v': default call, then positional and keyword overrides
    assert access(o, method='v') == 'w'
    for override in (dict(methodArgs=(False, )),
                     dict(methodKwargs=dict(on=False))):
        assert access(o, method='v', **override) == 'u'

    # Callback applied to the accessed value, with and without arguments
    assert access(o, ['a'], cb=cb) == 2
    for override in (dict(cbArgs=(True, )), dict(cbKwargs=dict(flip=True))):
        assert access(o, ['a'], cb=cb, **override) == -2
Exemple #3
0
    def fit(self,
            dataset=None,
            target=None,
            primaryKey=None,
            groupDap=None,
            **kwargs):
        """
        Split the dataset into folds and store the positions and
        identifiers belonging to each fold

        Args:
            dataset (donatello.components.dataset): dataset to fit on
            target (str): name of the target field (not read here)
            primaryKey (str): key selecting the primary entry of
                dataset.designData when it is keyed rather than a
                single frame
            groupDap (dict): payload to specify groups through access
            **kwargs: forwarded to self.split

        Returns:
            object: self
        """
        design = dataset.designData
        frame = design[primaryKey] if primaryKey else design

        groups = None
        if groupDap:
            groups = access(frame, **groupDap)

        # Materialise the splitter output so it can be iterated again below
        self.indices = list(
            self.split(frame, dataset.targetData, groups=groups, **kwargs))

        # Folds are identified by group value when groups were derived,
        # otherwise by the frame's index labels
        values = frame.index.to_series() if groups is None else groups

        folds = []
        for trainIndices, testIndices in self.indices:
            trainIds = values.iloc[trainIndices].values
            testIds = values.iloc[testIndices].values
            folds.append((trainIds, testIds))
        self.ids = folds
        return self
Exemple #4
0
def extract_features(wrapped, instance, args, kwargs):
    """
    Record the column names and/or keys of the outgoing dataset
    after a function call if not already attached to instance

    Args:
        wrapped (callable): function being wrapped
        instance (object): object that receives features / featureDtypes
        args (tuple): positional arguments for wrapped
        kwargs (dict): keyword arguments for wrapped

    Returns:
        object: the value returned by wrapped, unmodified
    """
    dataset = find_value(wrapped, args, kwargs, 'dataset')
    X = find_value(wrapped, args, kwargs, 'X')
    initial = dataset.designData if dataset is not None else X
    # Remember the incoming index so array-like output can be relabelled
    index = initial.index if hasattr(initial, 'index') else None

    result = wrapped(*args, **kwargs)
    df = result.designData if isinstance(result, Dataset) else result

    # Features are only recorded while the instance has none yet
    postFit = not getattr(instance, 'features', None)
    if postFit and df is not None:
        if hasattr(df, 'columns'):
            features = df.columns.tolist()
        else:
            names = (instance.get_feature_names()
                     if hasattr(instance, 'get_feature_names') else None)
            features = list(names) if names else instance.fields
        instance.features = features
    else:
        features = getattr(instance, 'features', [])

    if isinstance(df, pd.DataFrame):
        # Align on the column axis. A positional reindex targets the row
        # labels instead, which produces all-NaN rows and upcasts numeric
        # dtypes before they are recorded below.
        df = df.reindex(columns=features)
    else:
        df = pd.DataFrame(df, columns=features, index=index)

    if postFit:
        instance.featureDtypes = access(df, ['dtypes'],
                                        method='to_dict',
                                        slicers=(),
                                        errors='ignore')
    return result
    def apply(self, node, data, method, **kwargs):
        """
        Run a method through the graph, ending with the given node

        Args:
            node (str): name of terminal node
            data (data.Dataset): data to process
            method (str): string name of method to call through nodes
            **kwargs: extra keyword arguments, handed to the terminal
                node's method only when that method has no `dataset`
                parameter

        Returns:
            object: result of calling method on the terminal node
        """
        upstream = tuple(self.predecessors(node))
        if not upstream:
            output = [data]
        else:
            output = []
            for parent in upstream:
                edge = self.edge_exec(parent, node)
                # Recurse so every ancestor is applied before its edge
                incoming = self.apply(parent, data, method)
                output.append(
                    access(edge,
                           method=method,
                           methodKwargs=dict(dataset=incoming)))

        dataset = self.node_exec(node).combine(output)

        # Inspect the target method to decide how the data is passed in
        parameters = funcsigs.signature(
            getattr(self.node_exec(node), method)).parameters
        if 'dataset' in parameters:
            payload = {'dataset': dataset}
        else:
            payload = {'X': dataset.designData}
            if 'y' in parameters:
                payload['y'] = dataset.targetData
            payload.update(kwargs)

        return access(self.node_exec(node),
                      method=method,
                      methodKwargs=payload)
Exemple #6
0
def extract_fields(wrapped, instance, args, kwargs):
    """
    Call the wrapped function, then record the column names and/or
    keys (and dtypes) of the dataset that was passed in
    """
    result = wrapped(*args, **kwargs)
    # Clear any recorded features and flag the instance as fit
    instance.features = None
    instance.isFit = True

    dataset = find_value(wrapped, args, kwargs, 'dataset')
    X = find_value(wrapped, args, kwargs, 'X')
    # Prefer the dataset's design data, fall back on a bare X
    df = X if dataset is None else dataset.designData

    if df is None:
        return result

    instance.fields = access(df, errors='ignore', cb=list)
    # Need dfs to find schema
    instance.fieldDtypes = access(df, ['dtypes'],
                                  method='to_dict',
                                  errors='ignore',
                                  slicers=())
    return result
Exemple #7
0
    def subset(self, subset='train'):
        """
        Build a new `donatello.components.data.Dataset` holding a
        (sub)set of this dataset's data, chosen either by name
        (train, test) or by a payload for
        :py:func:`donatello.utils.helpers.access`

        Args:
            subset (str|dict): attribute to select

        Returns:
            Dataset: with same params as current dataset
        """
        if not isinstance(subset, str):
            # Same access payload is applied to design and target data
            X = access(self.designData, **subset)
            y = access(self.targetData, **subset)
        else:
            # 'train' -> designTrain / targetTrain
            suffix = subset.capitalize()
            X = getattr(self, '{}{}'.format('design', suffix))
            y = getattr(self, '{}{}'.format('target', suffix))
        return self.with_params(X=X, y=y)
    def process(self, dataset, node, method, **kwargs):
        """
        Evaluate method through the nodes upstream of `node` and
        combine their outputs; a node without parents returns the
        dataset untouched
        """
        parents = tuple(self.predecessors(node))
        if not parents:
            return dataset

        # Apply every parent first, then push each result over its edge
        upstreams = [
            self.apply(parent, dataset, method, **kwargs)
            for parent in parents
        ]
        datas = []
        for parent, upstream in zip(parents, upstreams):
            edge = self.edge_exec(parent, node)
            datas.append(
                access(edge,
                       method=method,
                       methodKwargs=dict(dataset=upstream)))

        return self.node_exec(node).combine(datas)
Exemple #9
0
    def grid_search(self,
                    dataset=None,
                    gridSearch=True,
                    paramGrid=None,
                    searchKwargs=None):
        """
        Grid search over hyperparameter space

        Does nothing unless both paramGrid and gridSearch are truthy.
        Otherwise fits a GridSearchCV around this estimator, stores it
        on self.gridSearch and applies its best parameters to self.

        Args:
            dataset (object): dataset exposing designData, targetData
                and groupDap
            gridSearch (bool): whether to run the search
            paramGrid (dict): grid handed to GridSearchCV as param_grid
            searchKwargs (dict): extra keyword arguments for
                GridSearchCV; None is treated as no extra arguments
        """
        if paramGrid and gridSearch:
            print('grid searching')
            # searchKwargs defaults to None, which cannot be unpacked
            # with **, so fall back on an empty mapping
            self.gridSearch = GridSearchCV(estimator=self,
                                           param_grid=paramGrid,
                                           **(searchKwargs or {}))

            groups = access(dataset.designData, **
                            dataset.groupDap) if dataset.groupDap else None

            # gridSearch=False is passed along with the fit call,
            # presumably to stop the inner fits searching again
            # - TODO confirm against the estimator's fit
            self.gridSearch.fit(X=dataset.designData,
                                y=dataset.targetData,
                                groups=groups,
                                gridSearch=False)

            self.set_params(**self.gridSearch.best_params_)