def __init__(self):
    """Assemble the reusable preprocessing pipelines.

    Builds two pipelines per feature family: a plain preprocessor and a
    variant that appends a 6-component PCA for dimensionality reduction.
    """
    # Categorical route: collapse rare levels, then encode.
    grouperEncoder = [e.CategoricalGrouper(), e.CategoricalEncoder()]
    self.catPreprocessor = make_pipeline(*grouperEncoder)
    self.catPCA = make_pipeline(
        e.CategoricalGrouper(),
        e.CategoricalEncoder(),
        PCA(n_components=6, random_state=0),
    )
    # Continuous route: map to a normal distribution, then z-score.
    self.contPreprocessor = make_pipeline(
        QuantileTransformer(output_distribution='normal', random_state=0),
        StandardScaler(),
    )
    self.contPCA = make_pipeline(
        QuantileTransformer(output_distribution='normal', random_state=0),
        StandardScaler(),
        PCA(n_components=6, random_state=0),
    )
# Load the interim research dataset and split features from the target.
LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

# Gaussianize the skewed loss target (sklearn's default power transform).
LOGGER.info('Process target')
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss', index=y.index)

# Read the list of zero-variance categorical columns to exclude.
# FIX: use a context manager — the original open(...) handle was never closed.
LOGGER.info('Load categorical features to drop')
with open(file=p.joinpath('src', 'meta', 'NoVariance.json'), mode='r') as fh:
    noVarFeatures = json.load(fh)

# Group rare levels and encode the surviving categorical columns.
LOGGER.info('Process categorical features')
# Hoist the repeated filter/drop expression so it is evaluated once.
catFrame = X.filter(like='cat').drop(labels=noVarFeatures, axis=1)
catf = pd.DataFrame(
    data=make_pipeline(e.CategoricalGrouper(),
                       e.CategoricalEncoder()).fit_transform(catFrame, y),
    columns=catFrame.columns,
    index=X.index)

# Map continuous features to a standard normal, then z-score them.
# FIX: the original DataFrame call was missing its closing parenthesis.
LOGGER.info('Process continuous features')
contf = pd.DataFrame(
    data=scale(quantile_transform(X=X.filter(like='cont'),
                                  output_distribution='normal',
                                  random_state=0)),
    columns=X.filter(like='cont').columns,
    index=X.index)
# Restrict the analysis to the feature subset flagged as highly correlated.
# FIX: use a context manager — the original open(...) handle was never closed.
LOGGER.info('Load correlated features')
with open(file=p.joinpath('src', 'meta', 'Correlated.json'), mode='r') as fh:
    CORRELATED = json.load(fh)

# Load the interim research dataset, keeping only the correlated columns.
LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.filter(CORRELATED)
y = df['loss'].copy()

# Gaussianize the skewed loss target (sklearn's default power transform).
LOGGER.info('Process target')
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss', index=y.index)

# Group rare levels and encode the categorical columns.
LOGGER.info('Process categorical features')
catf = pd.DataFrame(
    data=make_pipeline(e.CategoricalGrouper(),
                       e.CategoricalEncoder()).fit_transform(
        X.filter(like='cat'), y),
    columns=X.filter(like='cat').columns,
    index=X.index)

# Map continuous features to a standard normal, then z-score them.
LOGGER.info('Process continuous features')
contf = pd.DataFrame(
    data=scale(quantile_transform(X=X.filter(like='cont'),
                                  output_distribution='normal',
                                  random_state=0)),
    columns=X.filter(like='cont').columns,
    index=X.index)

LOGGER.info(r'Figure 1: Correlations above 75%')
# Recombine the processed categorical and continuous features on the index.
X = catf.join(contf)
# Load the interim research dataset and split features from the target.
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

# Gaussianize the skewed loss target (sklearn's default power transform).
LOGGER.info('Process target')
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss', index=y.index)

# Read the list of zero-variance categorical columns to exclude.
# FIX: use a context manager — the original open(...) handle was never closed.
LOGGER.info('Load categorical features to drop')
with open(file=p.joinpath('src', 'meta', 'NoVariance.json'), mode='r') as fh:
    noVarFeatures = json.load(fh)

# Group rare levels and encode the surviving categorical columns.
LOGGER.info('Process categorical features')
# Hoist the repeated filter/drop expression so it is evaluated once.
catFrame = X.filter(like='cat').drop(labels=noVarFeatures, axis=1)
catf = pd.DataFrame(
    data=make_pipeline(e.CategoricalGrouper(),
                       e.CategoricalEncoder()).fit_transform(catFrame, y),
    columns=catFrame.columns,
    index=X.index)

# Map continuous features to a standard normal, then z-score them.
LOGGER.info('Process continuous features')
contf = pd.DataFrame(
    data=scale(quantile_transform(X=X.filter(like='cont'),
                                  output_distribution='normal',
                                  random_state=0)),
    columns=X.filter(like='cont').columns,
    index=X.index)

# Pairwise correlation matrix over all processed features.
LOGGER.info('Find correlations')
corr = catf.join(contf).corr()
def make_pipeline(model: Estimator) -> Pipeline:
    """Wrap *model* behind the standard categorical preprocessing steps.

    Returns a Pipeline that groups rare categorical levels, encodes them,
    and then feeds the result into the supplied estimator.

    NOTE(review): this shadows sklearn's ``make_pipeline`` within this
    module — presumably intentional; confirm no caller needs the sklearn
    helper under the same name.
    """
    steps = [
        ('grouper', e.CategoricalGrouper()),
        ('encoder', e.CategoricalEncoder()),
        ('clf', model),
    ]
    return Pipeline(steps)