Esempio n. 1
0
# Load the test split (test.csv) from the dataset directory `path`.
# NOTE(review): `path` is not assigned anywhere above this line in the visible
# file -- confirm it is defined before this statement runs.
house_test = pd.read_csv(os.path.join(path, "test.csv"))
# Bare expression: the shape is only displayed in an interactive session or
# notebook; in a plain script this line has no visible effect.
house_test.shape
# Print a summary of the loaded frame.
house_test.info()

# Stack the train and test frames row-wise (axis=0) into one frame so the
# preprocessing below is applied to both splits together.
# `ignore_index` is not set, so each frame keeps its own row labels and the
# combined index may contain repeated labels.
# NOTE(review): `house_train` is not assigned above this point in the visible
# file -- confirm it is defined before this statement runs.
house = pd.concat((house_train, house_test), axis=0)
# Bare expression: only displayed in an interactive session or notebook.
house.shape
house.info()

# Columns handed to cast_cont_to_cat.
features_to_cast = ['MSSubClass']
# The return value is discarded, so this presumably converts the listed
# columns of `house` in place -- verify against the helper's definition.
cast_cont_to_cat(house, features_to_cast)

# Print what the helpers report as continuous and categorical features of
# `house` after the cast.
print(get_continuous_features(house))
print(get_categorical_features(house))

# Start the drop list with the 'Id' and 'SalePrice' columns.
features_to_drop = ['Id', 'SalePrice']
# Add whatever get_features_to_drop_on_missingdata selects for `house`.
# NOTE(review): 0.25 is presumably a missing-data threshold (fraction of
# missing values) -- confirm against the helper.
features_to_drop.extend(get_features_to_drop_on_missingdata(house, 0.25))
# `house1` holds the result of dropping the collected columns; whether `house`
# itself is modified depends on drop_features (not visible here).
house1 = drop_features(house, features_to_drop)
house1.info()

# Impute the categorical columns of `house1`.
imputable_cat_features = get_categorical_features(house1)
# get_categorical_imputers returns an object exposing `.transform`; it is
# presumably already fitted on `house1` inside the helper -- TODO confirm.
cat_imputer = get_categorical_imputers(house1, imputable_cat_features)
# Overwrite the selected columns with the transformed values.
house1[imputable_cat_features] = cat_imputer.transform(
    house1[imputable_cat_features])

# Impute the continuous columns of `house1`, mirroring the categorical step.
imputable_cont_features = get_continuous_features(house1)
# get_continuous_imputers returns an object exposing `.transform`; it is
# presumably already fitted on `house1` inside the helper -- TODO confirm.
cont_imputer = get_continuous_imputers(house1, imputable_cont_features)
# Overwrite the selected columns with the transformed values.
house1[imputable_cont_features] = cont_imputer.transform(
    house1[imputable_cont_features])
# Print the frame summary again after both imputation steps.
house1.info()

# Pass the imputed frame and its categorical column list to `ohe`
# (presumably one-hot encoding, going by the name -- verify against the
# helper) and keep the result as `house2`.
house2 = ohe(house1, imputable_cat_features)
def rmse(y_orig, y_pred):
    """Return the root mean squared error between targets and predictions.

    Both arguments are forwarded unchanged to ``metrics.mean_squared_error``;
    the square root of its result is returned.
    """
    mean_sq_err = metrics.mean_squared_error(y_orig, y_pred)
    return math.sqrt(mean_sq_err)


# Directory holding the dataset CSV files.
path = 'G://house-prices'
# Load the training split (train.csv) from that directory.
house_train = pd.read_csv(os.path.join(path, "train.csv"))
# Bare expression: the shape is only displayed in an interactive session or
# notebook; in a plain script this line has no visible effect.
house_train.shape
# Print a summary of the loaded frame.
house_train.info()

# Type-cast features: hand the listed columns to utils.cast_to_cat.
features_to_cast = ['MSSubClass']
# The return value is discarded, so this presumably converts the columns of
# `house_train` in place -- verify against utils.cast_to_cat.
utils.cast_to_cat(house_train, features_to_cast)

# Manual feature selection: start the drop list with 'Id' and 'SalePrice'.
features_to_drop = ['Id', 'SalePrice']
# NOTE(review): 0.25 is presumably a missing-data threshold (fraction of
# missing values) -- confirm against utils.get_features_to_drop_on_missingdata.
missing_features_above_th = utils.get_features_to_drop_on_missingdata(
    house_train, 0.25)
features_to_drop.extend(missing_features_above_th)
# `house_train1` holds the result of dropping the collected columns; whether
# `house_train` itself is modified depends on utils.drop_features.
house_train1 = utils.drop_features(house_train, features_to_drop)
house_train1.info()

# Build the pipeline for categorical features: fill missing values with the
# most frequent category, then one-hot encode into a dense array.
#
# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so passing `sparse=False` raises
# TypeError on current releases. Try the new keyword first and fall back to
# the old one so the script runs on both older and newer installations.
try:
    _dense_one_hot = preprocessing.OneHotEncoder(
        sparse_output=False, handle_unknown='ignore')
except TypeError:
    # scikit-learn < 1.2 does not know `sparse_output`.
    _dense_one_hot = preprocessing.OneHotEncoder(
        sparse=False, handle_unknown='ignore')

categorical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
    # handle_unknown='ignore': categories unseen during fit do not raise at
    # transform time.
    ('ohe', _dense_one_hot),
])

# Numerical-feature pipeline: impute missing values with SimpleImputer's
# default settings, then standardise with StandardScaler.
numerical_pipeline = pipeline.Pipeline(
    steps=[
        ('imputer', impute.SimpleImputer()),
        ('scaler', preprocessing.StandardScaler()),
    ]
)