Esempio n. 1
0
class JoinCategoricalAsOneHot(Pipe):
    """One-hot encode ``x_categorical`` and join the result onto ``x``.

    Fitting learns one ``LabelEncoder`` per column of ``x_categorical`` and a
    single ``OneHotEncoder`` over the label-encoded columns. Transforming
    encodes ``x_categorical``, joins the dense one-hot columns onto ``x`` by
    index and removes ``x_categorical`` from the data.
    """
    fit_requires = {'x_categorical': sf_types.PandasDataFrame(schema={})}
    transform_requires = {
        'x': sf_types.PandasDataFrame(schema={}),
        'x_categorical': sf_types.PandasDataFrame(schema={})
    }
    transform_modifies = {
        'x': sf_types.PandasDataFrame(schema={}),
        'x_categorical': sf_ops.Drop(),
    }

    # The keys mirror exactly what ``fit`` stores: 'label' and 'transformer'.
    fitted_parameters = {'label': object, 'transformer': object}

    def fit(self, data: dict, parameters: dict = None):
        """Fit the per-column label encoders and the one-hot encoder.

        :param data: mapping that holds the ``x_categorical`` DataFrame.
        :param parameters: not used by this pipe.
        """
        # Encode a copy so the caller's frame is not overwritten with codes.
        df = data['x_categorical'].copy()
        encoders = {column: LabelEncoder() for column in df.columns}
        self['label'] = encoders
        self['transformer'] = OneHotEncoder()

        for column, encoder in encoders.items():
            df.loc[:, column] = encoder.fit_transform(df.loc[:, column])
        self['transformer'].fit(df.values)

    def transform(self, data: dict):
        """Replace ``x_categorical`` by one-hot columns joined onto ``x``.

        Values that were not seen while fitting are replaced, per column, by
        the most frequent *known* value of the incoming column; when the
        column holds no known value at all, the first fitted class is used.

        :param data: mapping that holds the ``x`` and ``x_categorical``
            DataFrames.
        :return: ``data`` with ``x`` extended and ``x_categorical`` removed.
        """
        # Work on a copy: the caller's DataFrame must not be mutated.
        categorical = data['x_categorical'].copy()

        for column, encoder in self['label'].items():
            values = categorical.loc[:, column]
            known = values.isin(encoder.classes_)
            if not known.all():
                # The fallback has to be a fitted class, otherwise
                # ``LabelEncoder.transform`` raises on it.
                modes = values[known].mode()
                fallback = modes.iloc[0] if len(modes) else encoder.classes_[0]
                values = values.where(known, fallback)
            categorical.loc[:, column] = encoder.transform(values)

        one_hot = self['transformer'].transform(categorical)

        df = pd.DataFrame(one_hot.toarray(), index=categorical.index)
        data['x'] = data['x'].join(df)
        del data['x_categorical']
        return data
Esempio n. 2
0
class SplitNumericCategorical(Pipe):
    """Split ``x`` into its numeric part and a non-numeric ``x_categorical``.

    The set of numeric columns is decided once, while fitting, and reused for
    every later transform.
    """
    fit_requires = transform_requires = {
        'x': sf_types.PandasDataFrame(schema={})
    }
    transform_modifies = {
        'x_categorical': sf_types.PandasDataFrame(schema={}),
        'x': sf_types.PandasDataFrame(schema={})
    }

    fitted_parameters = {'numeric_columns': sf_types.List(str)}

    def fit(self, data: dict, parameters: dict = None):
        """Remember the names of the numeric columns of ``data['x']``.

        :param data: mapping that holds the ``x`` DataFrame.
        :param parameters: not used by this pipe.
        """
        numeric_frame = data['x'].select_dtypes(include=[np.number])
        self['numeric_columns'] = list(numeric_frame.columns)

    def transform(self, data: dict):
        """Move the non-numeric columns of ``x`` into ``x_categorical``.

        :param data: mapping that holds the ``x`` DataFrame.
        :return: ``data`` where ``x`` keeps only the fitted numeric columns
            and ``x_categorical`` holds every other column.
        """
        frame = data['x']
        numeric_columns = self['numeric_columns']
        data['x_categorical'] = frame.drop(numeric_columns, axis=1)
        data['x'] = frame.loc[:, numeric_columns]
        return data
Esempio n. 3
0
class LogLassoModel(Pipe):
    """Cross-validated Lasso fitted on the logarithm of the target.

    Predictions are mapped back with ``np.exp`` and stored as ``y_pred``;
    ``x`` is removed from the data once it has been consumed.
    """
    transform_requires = {'x': sf_types.PandasDataFrame(schema={})}
    fit_requires = {
        'x': sf_types.PandasDataFrame(schema={}),
        'y': sf_types.Array(float)
    }
    transform_modifies = {
        'y_pred': sf_types.Array(np.float64),
        'x': sf_ops.Drop()
    }

    fitted_parameters = {'model': LassoCV}

    def fit(self, data: dict, parameters: dict = None):
        """Fit a ``LassoCV`` on ``x`` against ``log(y)``.

        :param data: mapping that holds the features ``x`` and target ``y``.
        :param parameters: not used by this pipe.
        """
        features, target = data['x'], data['y']
        # NOTE(review): ``normalize`` was deprecated and later removed from
        # scikit-learn's linear models - confirm the pinned version.
        self['model'] = LassoCV(normalize=True)
        self['model'].fit(features, np.log(target))

    def transform(self, data: dict):
        """Store ``exp(prediction)`` as ``y_pred`` and drop ``x``.

        :param data: mapping that holds the features ``x``.
        :return: ``data`` with ``y_pred`` added and ``x`` removed.
        """
        log_prediction = self['model'].predict(data['x'])
        data['y_pred'] = np.exp(log_prediction)
        del data['x']
        return data
Esempio n. 4
0
class Pipe2(Pipe):
    """Pipe that removes column ``a`` from the ``x`` DataFrame."""
    transform_requires = {
        'x': types.PandasDataFrame(schema={'a': np.float64, 'b': np.float64}),
    }

    transform_modifies = {
        'x': ops.ModifyDataFrame({'a': ops.Drop()}),
    }

    def transform(self, data: dict):
        """Replace ``data['x']`` with a frame that lacks column ``a``.

        :param data: mapping that holds the ``x`` DataFrame.
        :return: the same mapping, with ``x`` rebound to the reduced frame.
        """
        frame = data['x']
        data['x'] = frame.drop('a', axis=1)
        return data
Esempio n. 5
0
class FillNaN(Pipe):
    """Impute missing values in ``x`` and ``x_categorical``.

    ``x`` is filled with the per-column means and ``x_categorical`` with the
    per-column most frequent value, both computed while fitting.
    """
    fit_requires = transform_modifies = transform_requires = {
        'x': sf_types.PandasDataFrame(schema={}),
        'x_categorical': sf_types.PandasDataFrame(schema={})
    }

    # NOTE(review): declared as lists, but ``fit`` stores the pandas objects
    # returned by ``mean`` / ``mode`` - confirm which one is intended.
    fitted_parameters = {
        'means': sf_types.List(float),
        'most_frequent': sf_types.List(str)
    }

    def fit(self, data: dict, parameters: dict = None):
        """Compute the column means of ``x`` and modes of ``x_categorical``.

        :param data: mapping that holds ``x`` and ``x_categorical``.
        :param parameters: not used by this pipe.
        """
        numeric, categorical = data['x'], data['x_categorical']
        self['means'] = numeric.mean(axis=0)
        self['most_frequent'] = categorical.mode(axis=0)

    def transform(self, data: dict):
        """Fill the missing entries with the fitted statistics.

        ``x`` is rebound to a filled frame, whereas ``x_categorical`` is
        filled in place, column by column.

        :param data: mapping that holds ``x`` and ``x_categorical``.
        :return: the same mapping with missing values imputed.
        """
        data['x'] = data['x'].fillna(self['means'])

        categorical = data['x_categorical']
        for column in categorical.columns:
            missing = categorical[column].isnull()
            # Entry 0 of the fitted modes for this column is the fill value.
            categorical.loc[missing, column] = self['most_frequent'][column][0]
        return data
Esempio n. 6
0
class BaselineModel(Pipe):
    """Baseline that predicts the mean of the training target for every row."""
    transform_requires = {'x': sf_types.PandasDataFrame({})}
    # ``fit`` reads ``data['y']``, so the target is declared here as well;
    # otherwise a schema check cannot detect that it is missing.
    fit_requires = {
        'x': sf_types.PandasDataFrame({}),
        'y': sf_types.Array(float)
    }

    transform_modifies = {'y_pred_baseline': sf_types.Array(np.float64)}

    fitted_parameters = {'mean': np.float64}

    def fit(self, data: dict, parameters: dict = None):
        """Store the mean of the target ``y``.

        :param data: mapping that holds the target ``y``.
        :param parameters: not used by this pipe.
        """
        self['mean'] = np.mean(data['y'])

    def transform(self, data: dict):
        """Add ``y_pred_baseline``: the fitted mean, once per row of ``x``.

        :param data: mapping that holds the ``x`` DataFrame.
        :return: ``data`` with the constant prediction array added.
        """
        n_rows = data['x'].shape[0]
        data['y_pred_baseline'] = np.full(n_rows, self['mean'])
        return data
Esempio n. 7
0
    def test_combine(self):
        """Chain Pipe1 and Pipe2 and check the data, modifiers and schema."""
        pipeline = Pipeline([Pipe1(), Pipe2()])

        # Data level: 'a * b' is added and 'a' is dropped, leaving two columns.
        frame = pd.DataFrame({'a': [2.0], 'b': [2.0]})
        transformed = pipeline.transform({'x': frame})
        self.assertEqual(len(transformed['x'].columns), 2)

        # The pipeline lists the modifications of each pipe, in order.
        expected_modifies = {
            'x': [Pipe1.transform_modifies['x'], Pipe2.transform_modifies['x']]
        }
        self.assertEqual(pipeline.transform_modifies, expected_modifies)

        # Schema level: the same column changes, without touching any data.
        input_schema = {
            'x': types.PandasDataFrame({'a': np.float64, 'b': np.float64})
        }
        output_schema = pipeline.transform_schema(input_schema)

        expected_frame_schema = types.PandasDataFrame({
            'b': np.float64,
            'a * b': np.float64,
        })
        self.assertEqual(output_schema['x'], expected_frame_schema)
Esempio n. 8
0
class Pipe1(Pipe):
    """Pipe that adds the column ``a * b`` (product of ``a`` and ``b``)."""
    transform_requires = {
        'x': types.PandasDataFrame(schema={
            'a': np.float64,
            'b': np.float64
        }),
    }

    transform_modifies = {
        'x': ops.ModifyDataFrame({'a * b': ops.Set(np.float64)})
    }

    def transform(self, data: dict):
        """Rebind ``data['x']`` to a frame with the extra ``a * b`` column.

        The incoming DataFrame object itself is left unchanged.

        :param data: mapping that holds the ``x`` DataFrame.
        :return: the same mapping, with ``x`` rebound to the extended frame.
        """
        frame = data['x']
        # ``assign`` returns a new frame; the column name is not a valid
        # identifier, hence the keyword expansion.
        data['x'] = frame.assign(**{'a * b': frame['a'] * frame['b']})
        return data
Esempio n. 9
0
    import logging
    import sys

    # Send the records of the 'schemaflow' logger to stdout with a timestamp.
    # NOTE(review): only the handler level is set here. Unless the logger's
    # own level is configured elsewhere, it inherits the root level and INFO
    # records never reach this handler - confirm.
    logger = logging.getLogger('schemaflow')
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    ch.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(ch)

    # This pipeline is very generic: it does not make any assumptions about
    # the data's format. Both checks are given schema objects, not data.
    # NOTE(review): presumably raise_=True turns a mismatch into an
    # exception - confirm against the library.
    predict_pipeline.check_fit(
        {
            'x': sf_types.PandasDataFrame({}),
            'y': sf_types.Array(np.float64)
        },
        raise_=True)
    predict_pipeline.check_transform({'x': sf_types.PandasDataFrame({})},
                                     raise_=True)

    # Show what the pipeline declares for fitting.
    print('expected fit schema: ', predict_pipeline.fit_requires)
    print('fitted parameters: ', predict_pipeline.fitted_parameters)

    # Show the required transform input and the schema derived from it.
    print('expected transform schema: ', predict_pipeline.transform_requires)
    print(
        'expected transformed schema: ',
        predict_pipeline.transform_schema(predict_pipeline.transform_requires))

    # execution of the pipeline