# Strip the columns listed in features_to_drop and print a summary of the
# result. utils.drop_features is a project helper; presumably it returns a
# new DataFrame rather than mutating house_train -- confirm against utils.
house_train1 = utils.drop_features(house_train, features_to_drop)
house_train1.info()

# Categorical branch: fill missing values with each column's most frequent
# value, then one-hot encode into a dense array. Categories not seen during
# fit are ignored at transform time instead of raising.
# NOTE: `sparse` was renamed `sparse_output` in scikit-learn 1.2; the
# original spelling is kept here for the release this script targets.
categorical_steps = [
    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
    ('ohe',
     preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')),
]
categorical_pipeline = pipeline.Pipeline(steps=categorical_steps)

# Numerical branch: SimpleImputer with its default strategy (mean), followed
# by standardisation of every column.
numerical_steps = [
    ('imputer', impute.SimpleImputer()),
    ('scaler', preprocessing.StandardScaler()),
]
numerical_pipeline = pipeline.Pipeline(steps=numerical_steps)

# Preprocessing for all features: ask the project helpers which columns are
# non-continuous / continuous, then send each group through its own branch.
# Columns named in neither list are dropped (ColumnTransformer's default
# `remainder`).
cat_features = utils.get_non_continuous_features(house_train1)
num_features = utils.get_continuous_features(house_train1)

column_routes = [
    ('cat', categorical_pipeline, cat_features),
    ('num', numerical_pipeline, num_features),
]
preprocess_pipeline = compose.ColumnTransformer(transformers=column_routes)

# Complete modelling pipeline: preprocessing, feature selection, and the
# learning algorithm, executed in this order:
#   1. preprocess       - the column-wise imputing/encoding/scaling above
#   2. zv_filter        - VarianceThreshold with its default threshold,
#                         i.e. removes zero-variance columns
#   3. feature_selector - keeps the features a Lasso model deems important
#   4. pca              - PCA with default settings
#   5. regressor        - k-nearest-neighbours regression
# The regressor has to be the last step: every earlier step in a Pipeline
# must be a transformer. The step list was previously left unclosed, which
# made the file unparseable.
complete_pipeline = pipeline.Pipeline([
    ('preprocess', preprocess_pipeline),
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('feature_selector',
     feature_selection.SelectFromModel(linear_model.Lasso())),
    ('pca', decomposition.PCA()),
    ('regressor', neighbors.KNeighborsRegressor())
])
# Example #2
# 0
# Load the train and test CSVs from a hard-coded directory, inspect each
# frame, then stack them row-wise into a single frame.
# The bare `.shape` expressions only display a value in an interactive
# session; they do nothing when this runs as a script.
path = 'E://'
train_csv = os.path.join(path, "house_train.csv")
test_csv = os.path.join(path, "house_test.csv")

house_train = pd.read_csv(train_csv)
house_train.shape
house_train.info()

house_test = pd.read_csv(test_csv)
house_test.shape
house_test.info()

# Row-wise concatenation of train followed by test.
house = pd.concat([house_train, house_test], axis=0)
house.shape
house.info()

# Print what the project helpers classify as continuous and as
# non-continuous features of the combined frame.
continuous = utils.get_continuous_features(house)
print(continuous)
non_continuous = utils.get_non_continuous_features(house)
print(non_continuous)

# Exploratory plots for the two year columns, using the training data only.
# For each column: a count plot, a joint plot against SalePrice, and
# SalePrice density curves coloured by the column's value.
# NOTE: FacetGrid's `size` argument was renamed `height` in seaborn 0.9;
# the original spelling is kept for the release this script targets.
for year_col in ('YearBuilt', 'YrSold'):
    sns.countplot(x=year_col, data=house_train)
    sns.jointplot(x="SalePrice", y=year_col, data=house_train)
    grid = sns.FacetGrid(house_train, hue=year_col, size=8)
    grid.map(sns.kdeplot, "SalePrice").add_legend()

# MSSubClass is handed to the project helper for casting to categorical.
# The return value is discarded, so the helper presumably modifies `house`
# in place -- confirm against utils.cast_to_cat.
features_to_cast = ["MSSubClass"]
utils.cast_to_cat(house, features_to_cast)

# Columns to remove before modelling.
features_to_drop = ["Id"]