def transform(self, X=None, y=None, dataset=None):
    """
    Apply the configured access payload to the dataset.

    The first truthy payload among ``designDap``, ``targetDap`` and
    ``datasetDap`` (checked in that order) is unpacked as keyword
    arguments into ``access``; the other payloads are ignored.

    Args:
        X: not used by this method
        y: not used by this method
        dataset: object exposing ``designData`` / ``targetData``

    Returns:
        the value returned by ``access`` for the selected payload, or
        ``dataset`` untouched when no payload is set or when the
        transformer is ``fitOnly`` and already has ``features``
    """
    # A fitOnly transformer that already recorded features is a no-op
    if self.fitOnly and getattr(self, 'features', None):
        print('fitOnly transformer')
        return dataset

    # Unpack the same attribute that was tested: the previous code read
    # ``designDatadap`` / ``targetDatadap``, which do not match the
    # ``designDap`` / ``targetDap`` names checked in the conditions.
    if self.designDap:
        dataset = access(dataset.designData, **self.designDap)
    elif self.targetDap:
        dataset = access(dataset.targetData, **self.targetDap)
    elif self.datasetDap:
        dataset = access(dataset, **self.datasetDap)
    return dataset
def test_access(o):
    """
    Exercise ``helpers.access`` on the ``o`` fixture: empty paths,
    attribute lookup, method dispatch and callbacks.
    """
    access = helpers.access

    # Every spelling of an empty path hands back the object itself
    for emptyPath in ([], [''], ''):
        assert access(o, emptyPath) is o

    # Plain attribute lookup
    assert access(o, ['a']) == 1

    # Method dispatch, with and without arguments
    assert access(o, method='v') == 'w'
    assert access(o, method='v', methodArgs=(False, )) == 'u'
    assert access(o, method='v', methodKwargs=dict(on=False)) == 'u'

    # Callback applied to the accessed value
    assert access(o, ['a'], cb=cb) == 2
    assert access(o, ['a'], cb=cb, cbArgs=(True, )) == -2
    assert access(o, ['a'], cb=cb, cbKwargs=dict(flip=True)) == -2
def fit(self, dataset=None, target=None, primaryKey=None, groupDap=None, **kwargs):
    """
    Fit the fold: compute the split indices and store the matching
    identifiers for every train/test pair.

    Args:
        dataset (donatello.components.dataset): dataset to fit on
        target (str): str name of target field to separate
            (not read by this method)
        primaryKey (str): key for primary field (if dataset.data
            is dict not df)
        groupDap (dict): payload to specify groups through access
        **kwargs: forwarded to ``self.split``

    Returns:
        object: self
    """
    design = dataset.designData[primaryKey] if primaryKey else dataset.designData

    groups = None
    if groupDap:
        groups = access(design, **groupDap)

    # Materialize the splits so they can be reused after fitting
    self.indices = list(
        self.split(design, dataset.targetData, groups=groups, **kwargs))

    # Identify rows by group when groups are given, by index otherwise
    labels = design.index.to_series() if groups is None else groups

    pairs = []
    for trainPositions, testPositions in self.indices:
        trainIds = labels.iloc[trainPositions].values
        testIds = labels.iloc[testPositions].values
        pairs.append((trainIds, testIds))
    self.ids = pairs
    return self
def extract_features(wrapped, instance, args, kwargs):
    """
    Record the column names and/or keys of the outgoing dataset
    after a function call if not already attached to instance

    Args:
        wrapped (callable): function being wrapped
        instance (object): object the wrapped function is bound to;
            ``features`` and ``featureDtypes`` are stored on it
        args (tuple): positional arguments for ``wrapped``
        kwargs (dict): keyword arguments for ``wrapped``

    Returns:
        object: whatever ``wrapped`` returned, unchanged
    """
    dataset = find_value(wrapped, args, kwargs, 'dataset')
    X = find_value(wrapped, args, kwargs, 'X')

    # Prefer the dataset's design data, fall back to X (which may be None)
    initial = dataset.designData if dataset is not None else X
    # Keep the incoming index so array outputs can be re-labelled below
    index = initial.index if hasattr(initial, 'index') else None

    result = wrapped(*args, **kwargs)
    df = result.designData if isinstance(result, Dataset) else result

    # Features are only recorded while the instance has none attached
    postFit = not getattr(instance, 'features', None)
    if postFit and df is not None:
        if hasattr(df, 'columns'):
            features = df.columns.tolist()
        elif (hasattr(instance, 'get_feature_names')
              and instance.get_feature_names()):
            features = list(instance.get_feature_names())
        else:
            features = instance.fields
        instance.features = features
    else:
        features = getattr(instance, 'features', [])

    if isinstance(df, pd.DataFrame):
        # Select by column label. A bare ``reindex(features)`` targets
        # the row index, which yields all-NaN rows and upcasts dtypes,
        # corrupting the dtypes recorded below.
        df = df.reindex(columns=features)
    else:
        df = pd.DataFrame(df, columns=features, index=index)

    if postFit:
        instance.featureDtypes = access(df, ['dtypes'], method='to_dict',
                                        slicers=(), errors='ignore')
    return result
def apply(self, node, data, method, **kwargs):
    """
    Apply a method through the graph terminating at a node

    Each parent is processed recursively and its output is passed
    through the connecting edge; the collected outputs are combined
    by the node, which then has ``method`` invoked on the combination.

    Args:
        node (str): name of terminal node
        data (data.Dataset): data to process
        method (str): string name of method to call through nodes
        **kwargs: extra keyword arguments for the terminal node's
            method only (they are not forwarded to parents)

    Returns:
        data.Dataset: transformed data
    """
    parents = tuple(self.predecessors(node))

    if not parents:
        # Root node: consume the raw data directly
        inputs = [data]
    else:
        inputs = []
        for parent in parents:
            edge = self.edge_exec(parent, node)
            upstream = self.apply(parent, data, method)
            inputs.append(
                access(edge, method=method,
                       methodKwargs={'dataset': upstream}))

    dataset = self.node_exec(node).combine(inputs)

    # Shape the payload according to what the node's method accepts
    parameters = funcsigs.signature(
        getattr(self.node_exec(node), method)).parameters
    if 'dataset' in parameters:
        payload = {'dataset': dataset}
    else:
        payload = {'X': dataset.designData}
        if 'y' in parameters:
            payload.update({'y': dataset.targetData})
    payload.update(kwargs)

    return access(self.node_exec(node), method=method, methodKwargs=payload)
def extract_fields(wrapped, instance, args, kwargs):
    """
    Call the wrapped function, then record the column names and/or
    keys of the dataset that was passed into it

    Args:
        wrapped (callable): function being wrapped
        instance (object): object the wrapped function is bound to;
            ``features`` is reset, ``isFit`` is set, and ``fields`` /
            ``fieldDtypes`` are stored when incoming data is found
        args (tuple): positional arguments for ``wrapped``
        kwargs (dict): keyword arguments for ``wrapped``

    Returns:
        object: whatever ``wrapped`` returned, unchanged
    """
    result = wrapped(*args, **kwargs)

    # Reset previously recorded features and flag the instance as fit
    instance.features = None
    instance.isFit = True

    dataset = find_value(wrapped, args, kwargs, 'dataset')
    X = find_value(wrapped, args, kwargs, 'X')

    # Prefer the dataset's design data, fall back to X (which may be None)
    incoming = dataset.designData if dataset is not None else X
    if incoming is None:
        return result

    instance.fields = access(incoming, errors='ignore', cb=list)
    # Need dfs to find schema
    instance.fieldDtypes = access(incoming, ['dtypes'], method='to_dict',
                                  errors='ignore', slicers=())
    return result
def subset(self, subset='train'):
    """
    Build a new dataset from a (sub)set of this dataset's data.

    A string names a stored split: it is capitalized and appended to
    ``design`` / ``target`` to find the attributes to read (for
    example ``'train'`` reads ``designTrain`` and ``targetTrain``).
    Anything else is unpacked as a payload for
    :py:func:`donatello.utils.helpers.access`, applied to both the
    design and the target data.

    Args:
        subset (str|dict): attribute to select

    Returns:
        Dataset: result of ``self.with_params(X=..., y=...)``
    """
    if not isinstance(subset, str):
        design = access(self.designData, **subset)
        target = access(self.targetData, **subset)
        return self.with_params(X=design, y=target)

    suffix = subset.capitalize()
    design = getattr(self, '{}{}'.format('design', suffix))
    target = getattr(self, '{}{}'.format('target', suffix))
    return self.with_params(X=design, y=target)
def process(self, dataset, node, method, **kwargs):
    """
    Evaluate method through nodes that accept dataset inputs

    Args:
        dataset: data to push through the node's parents
        node: name of the node whose inputs are being assembled
        method (str): name of the method to call on parents and edges
        **kwargs: forwarded to ``self.apply`` for every parent

    Returns:
        the parents' outputs combined by the node, or ``dataset``
        unchanged when the node has no parents
    """
    parents = tuple(self.predecessors(node))
    if not parents:
        return dataset

    # Run every parent first, then push each result through its edge
    upstreams = []
    for parent in parents:
        upstreams.append(self.apply(parent, dataset, method, **kwargs))

    datas = []
    for parent, upstream in zip(parents, upstreams):
        edge = self.edge_exec(parent, node)
        datas.append(
            access(edge, method=method, methodKwargs={'dataset': upstream}))

    return self.node_exec(node).combine(datas)
def grid_search(self, dataset=None, gridSearch=True, paramGrid=None, searchKwargs=None):
    """
    Grid search over hyperparameter space

    Does nothing unless both ``paramGrid`` and ``gridSearch`` are
    truthy. Otherwise a ``GridSearchCV`` is fit on the dataset, stored
    on ``self.gridSearch``, and its best parameters are set on self.

    Args:
        dataset: object exposing ``designData``, ``targetData`` and
            ``groupDap``
        gridSearch (bool): whether to run the search
        paramGrid (dict): grid handed to ``GridSearchCV`` as ``param_grid``
        searchKwargs (dict): extra keyword arguments for
            ``GridSearchCV``; ``None`` means no extra arguments

    Returns:
        None
    """
    if not (paramGrid and gridSearch):
        return

    print('grid searching')
    # ``searchKwargs`` defaults to None, which cannot be **-unpacked
    self.gridSearch = GridSearchCV(estimator=self,
                                   param_grid=paramGrid,
                                   **(searchKwargs or {}))

    groups = (access(dataset.designData, **dataset.groupDap)
              if dataset.groupDap else None)

    # gridSearch=False is passed through fit, presumably to keep the
    # estimator from searching again inside the search -- confirm against fit
    self.gridSearch.fit(X=dataset.designData, y=dataset.targetData,
                        groups=groups, gridSearch=False)
    self.set_params(**self.gridSearch.best_params_)