# Read test.csv from the same directory as the training data.
house_test = pd.read_csv(os.path.join(path, "test.csv"))
house_test.shape  # bare expression: only displayed in an interactive session
house_test.info()

# Stack the train and test rows into one frame (row-wise concatenation) so
# that the steps below see both sets at once.
house = pd.concat([house_train, house_test], axis=0)
house.shape
house.info()

# Treat MSSubClass as categorical.
# NOTE(review): the return value is discarded, so cast_cont_to_cat presumably
# modifies `house` in place - confirm against the helper.
features_to_cast = ['MSSubClass']
cast_cont_to_cat(house, features_to_cast)

print(get_continuous_features(house))
print(get_categorical_features(house))

# Manual feature selection: two fixed columns plus whatever the helper
# returns for threshold 0.25.
features_to_drop = [
    'Id',
    'SalePrice',
    *get_features_to_drop_on_missingdata(house, 0.25),
]
house1 = drop_features(house, features_to_drop)
house1.info()

# Impute the categorical columns.
# NOTE(review): the imputer is obtained from the combined train+test frame -
# confirm that this is intended.
imputable_cat_features = get_categorical_features(house1)
cat_imputer = get_categorical_imputers(house1, imputable_cat_features)
imputed_cat_values = cat_imputer.transform(house1[imputable_cat_features])
house1[imputable_cat_features] = imputed_cat_values

# Impute the continuous columns the same way.
imputable_cont_features = get_continuous_features(house1)
cont_imputer = get_continuous_imputers(house1, imputable_cont_features)
imputed_cont_values = cont_imputer.transform(house1[imputable_cont_features])
house1[imputable_cont_features] = imputed_cont_values
house1.info()

# One-hot encode the categorical columns via the project's `ohe` helper.
house2 = ohe(house1, imputable_cat_features)
def rmse(y_orig, y_pred):
    """Return the square root of ``metrics.mean_squared_error(y_orig, y_pred)``."""
    return math.sqrt(metrics.mean_squared_error(y_orig, y_pred))


def _dense_one_hot_encoder():
    """Build a OneHotEncoder with dense output that ignores unknown categories.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old spelling, so the new keyword is tried first and the
    legacy one is used only when the installed version rejects it.
    """
    try:
        return preprocessing.OneHotEncoder(sparse_output=False,
                                           handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn (< 1.2) does not know `sparse_output`.
        return preprocessing.OneHotEncoder(sparse=False,
                                           handle_unknown='ignore')


path = 'G://house-prices'
house_train = pd.read_csv(os.path.join(path, "train.csv"))
house_train.shape  # bare expression: only displayed in an interactive session
house_train.info()

# type cast features
# NOTE(review): the return value is discarded, so utils.cast_to_cat presumably
# modifies `house_train` in place - confirm against the helper.
features_to_cast = ['MSSubClass']
utils.cast_to_cat(house_train, features_to_cast)

# manual feature selection
features_to_drop = ['Id', 'SalePrice']
missing_features_above_th = utils.get_features_to_drop_on_missingdata(
    house_train, 0.25)
features_to_drop.extend(missing_features_above_th)
house_train1 = utils.drop_features(house_train, features_to_drop)
house_train1.info()

# build pipeline for categorical features: fill gaps with the most frequent
# value, then one-hot encode
categorical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
    ('ohe', _dense_one_hot_encoder()),
])

# build pipeline for numerical features: impute with SimpleImputer's default
# strategy, then standardize
numerical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer()),
    ('scaler', preprocessing.StandardScaler()),
])