def encodeData(self):
    """One-hot encode the categorical predictors of the training data.

    Drops rows with a missing SalePrice, dummy-encodes the listed
    categorical columns, re-attaches the target, and stores the encoded
    frame back on ``self.trainingData``.

    Returns:
        DataObject wrapping the (now encoded) training data, the testing
        data, and the combined data.
    """
    # BUG FIX: the original aliased self.trainingData and mutated it in
    # place (dropna/drop with inplace=True removed SalePrice from the
    # stored frame) while the encoded result XtrainFinal was computed and
    # then discarded. Work on a copy and persist the encoded frame.
    X = self.trainingData.copy()

    # Remove rows with a missing target, then split target from predictors.
    X.dropna(axis=0, subset=['SalePrice'], inplace=True)
    y = X.SalePrice
    X.drop(['SalePrice'], axis=1, inplace=True)

    # Categorical columns to one-hot encode.
    object_cols = [
        'MSSubClass', 'MSZoning', 'Alley', 'LandContour', 'LotConfig',
        'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'Foundation', 'Heating', 'Electrical', 'GarageType', 'MiscFeature',
        'MoSold', 'SaleType', 'SaleCondition'
    ]

    # Replace the raw categorical columns with their dummy-encoded versions.
    XtrainDummies = pd.get_dummies(X[object_cols])
    XtrainFinal = pd.concat([X, XtrainDummies], axis='columns')
    XtrainFinal = XtrainFinal.drop(object_cols, axis='columns')

    # Re-attach the target and keep the encoded frame.
    XtrainFinal['SalePrice'] = y
    self.trainingData = XtrainFinal

    return DataObject(self.trainingData, self.testingData, self.combinedData)
def fillMissingData(self):
    """Impute missing values in every dataset of ``self.combinedData``.

    Each dataset is run through the per-feature fill helpers, then the
    remaining NaNs in "absence means no feature" columns are filled with
    the literal 'NA' marker.

    Returns:
        DataObject wrapping the filled training, testing and combined data.
    """
    # NaN in these columns means "feature absent" per data_description.txt.
    labelsToFillWithNA = [
        'Alley', 'Fence', 'MiscFeature', 'PoolQC', 'FireplaceQu'
    ]

    for i, dataset in enumerate(self.combinedData):
        Utils.printDatasetNulls(dataset)

        # Handle missing values
        dataset = fillMSZoningMissingValues(dataset)
        dataset = fillLotFrontageMissingValues(dataset)
        dataset = fillMasonryVeneerMissingValues(dataset)
        dataset = fillExteriorCoveringMissingValues(dataset)
        dataset = fillBasementFeaturesMissingValues(dataset)
        dataset = fillElectricalMissingValues(dataset)
        dataset = fillKitchenQualityMissingValues(dataset)
        dataset = fillGarageFeaturesMissingValues(dataset)
        dataset = fillPoolQualityMissingValues(dataset)
        dataset = fillSaleTypeMissingValues(dataset)

        # Handle NULL values
        dataset = Utils.fillNullLabels(dataset, labelsToFillWithNA, 'NA')
        dataset = Utils.fillNullLabels(
            dataset, ['Functional'],
            'Typ')  # data_description.txt tells us to assume typical 'Typ'

        Utils.printDatasetNulls(dataset)

        # BUG FIX: rebinding the loop variable never updated the stored
        # datasets, so every fill result was silently discarded. Write the
        # filled frame back into the list.
        self.combinedData[i] = dataset

    # Keep the individual references in sync with the filled list
    # (combinedData is [trainingData, testingData] elsewhere in this file).
    self.trainingData, self.testingData = self.combinedData

    return DataObject(self.trainingData, self.testingData, self.combinedData)
def go(self):
    """Prepare the raw frames: drop ids, rename, remove outliers, impute.

    Returns:
        DataObject wrapping the cleaned training, testing and combined data.
    """
    # Row identifiers carry no predictive signal.
    self.trainingData.drop("Id", axis=1, inplace=True)
    self.testingData.drop("Id", axis=1, inplace=True)

    # A column name starting with a digit cannot be used as an attribute.
    self.trainingData.rename(columns={'3SsnPorch': 'TSsnPorch'}, inplace=True)
    self.testingData.rename(columns={'3SsnPorch': 'TSsnPorch'}, inplace=True)

    # Sentinel target so train and test can be concatenated and split later.
    self.testingData['SalePrice'] = 0

    # BUG FIX: these two conditions used to be applied as two independent
    # drops, which removed EVERY house priced under 300000 (and every house
    # over 4000 sq ft). The outliers are rows that are BOTH very large and
    # unusually cheap — the same conjunction the sibling go() uses.
    self.trainingData = self.trainingData.drop(
        self.trainingData[(self.trainingData.GrLivArea > 4000)
                          & (self.trainingData.SalePrice < 300000)].index)

    # Heuristic outlier filters on derived size/room/bath quantities.
    self.trainingData = self.trainingData[
        self.trainingData.GarageArea * self.trainingData.GarageCars < 3700]
    self.trainingData = self.trainingData[
        self.trainingData.GrLivArea * self.trainingData.TotRmsAbvGrd < 45000]
    self.trainingData = self.trainingData[
        (self.trainingData.FullBath + (self.trainingData.HalfBath * 0.5)
         + self.trainingData.BsmtFullBath
         + (self.trainingData.BsmtHalfBath * 0.5)) < 5]

    # Impute missing values in each frame.
    self.trainingData = self.fillMissingData(self.trainingData)
    self.testingData = self.fillMissingData(self.testingData)
    self.combinedData = [self.trainingData, self.testingData]

    return DataObject(self.trainingData, self.testingData, self.combinedData)
def process(self, test_ID):
    """Run the full pipeline: adjust, convert, engineer, select, model.

    Args:
        test_ID: identifier column of the test rows, forwarded to the
            modelling step for the submission file.

    Returns:
        The final DataObject after all pipeline stages.
    """
    dataObject = DataObject(self.trainingData, self.testingData,
                            self.combinedData)

    prelim = PreliminaryDataAdjuster(dataObject)
    dataObject = prelim.go()

    converter = OrdinalToNumericalConverter(dataObject)
    dataObject = converter.go()

    creator = FeatureEngineer(dataObject)
    dataObject, dataObject.combinedData, y_train, cols, colsP = creator.go()

    step7 = SelectFeatures(dataObject)
    dataObject, totalCols, RFEcv, XGBestCols = step7.go(
        dataObject.combinedData, cols, colsP)

    step9 = Modeling(dataObject)
    # BUG FIX: `all_data` was an undefined name here (NameError at runtime).
    # The modelling step consumes the engineered combined frame, exactly as
    # the sibling process() implementation does.
    output_ensembled = step9.go(dataObject.combinedData, totalCols, test_ID,
                                colsP, RFEcv, XGBestCols)
    output_ensembled.to_csv('SalePrice_N_submission.csv', index=False)

    print(dataObject.trainingData)
    return dataObject
def go(self):
    """Map ordinal categorical columns to numeric codes in both frames.

    Returns:
        DataObject wrapping the converted training, testing and combined data.
    """
    converted = [
        self.mapCategoricalToOrdinal(frame)
        for frame in (self.trainingData, self.testingData)
    ]
    self.trainingData, self.testingData = converted
    self.combinedData = converted
    return DataObject(self.trainingData, self.testingData, self.combinedData)
def process(self, test_ID):
    """Execute the modelling pipeline end to end and write the submission CSV.

    Args:
        test_ID: identifier column of the test rows, forwarded to modelling.

    Returns:
        The final DataObject after all pipeline stages.
    """
    data_object = DataObject(self.trainingData, self.testingData,
                             self.combinedData)

    # Each stage consumes and returns the shared DataObject.
    data_object = PreliminaryDataAdjuster(data_object).go()
    data_object = OrdinalToNumericalConverter(data_object).go()

    data_object, combined, y_train, cols, colsP = \
        FeatureEngineering(data_object).go()
    data_object, totalCols, RFEcv, XGBestCols = \
        SelectFeatures(data_object).go(combined, cols, colsP)

    ensembled = Modeling(data_object).go(combined, totalCols, test_ID, colsP,
                                         RFEcv, XGBestCols)
    ensembled.to_csv('SalePrice_N_submission.csv', index=False)

    print(data_object.trainingData)
    return data_object
def go(self):
    """Feature-engineer the concatenated train+test frame.

    Returns:
        A 5-tuple: DataObject of the (unchanged) stored frames, the
        engineered combined frame, the training target, and the two
        column collections produced by featureEngineer.
    """
    n_train_rows = self.trainingData.shape[0]

    # Stack train on top of test so engineered features are consistent.
    stacked = pd.concat(
        (self.trainingData, self.testingData)).reset_index(drop=True)
    stacked, y_train, cols, colsP = self.featureEngineer(
        stacked, n_train_rows)

    data_object = DataObject(self.trainingData, self.testingData,
                             self.combinedData)
    return data_object, stacked, y_train, cols, colsP
def filterData(self):
    """Drop columns with no predictive value from both frames.

    'Id' is a pure row identifier; 'Utilities' is removed as a chosen
    label before the remaining preprocessing steps.

    Returns:
        DataObject wrapping the filtered training, testing and combined data.
    """
    for labels in (['Id'], ['Utilities']):
        self.trainingData = self.trainingData.drop(labels, axis=1)
        self.testingData = self.testingData.drop(labels, axis=1)

    self.combinedData = [self.trainingData, self.testingData]
    return DataObject(self.trainingData, self.testingData, self.combinedData)
def convertData(self):
    """Apply the ordered conversion steps to both frames.

    Steps (in order): categorical→ordinal mapping, year→age conversion,
    remodel/age derivation, uint8 dtype assignment.

    Returns:
        DataObject wrapping the converted training, testing and combined data.
    """
    conversion_steps = (
        self.mapCategoricalToOrdinal,
        self.changeYearsToAge,
        self.addRemodAndConvertAge,
        self.defineUint8Types,
    )
    # Same interleaving as before: each step runs on train, then on test.
    for step in conversion_steps:
        self.trainingData = step(self.trainingData)
        self.testingData = step(self.testingData)

    self.combinedData = [self.trainingData, self.testingData]
    return DataObject(self.trainingData, self.testingData, self.combinedData)
def go(self):
    """Clean the raw frames: drop ids, rename, remove outliers, log target.

    Returns:
        DataObject wrapping the cleaned training, testing and combined data.
    """
    # Row identifiers carry no predictive signal.
    self.trainingData.drop("Id", axis=1, inplace=True)
    self.testingData.drop("Id", axis=1, inplace=True)
    # A column name starting with a digit cannot be used for attribute access.
    self.trainingData.rename(columns={'3SsnPorch': 'TSsnPorch'}, inplace=True)
    self.testingData.rename(columns={'3SsnPorch': 'TSsnPorch'}, inplace=True)
    # Sentinel target so train and test can be concatenated and split later.
    self.testingData['SalePrice'] = 0
    # Outliers: rows that are both very large and unusually cheap.
    self.trainingData = self.trainingData.drop(
        self.trainingData[(self.trainingData.GrLivArea > 4000)
                          & (self.trainingData.SalePrice < 300000)].index)
    # Heuristic filters on derived size/garage/bath quantities.
    self.trainingData = self.trainingData[
        self.trainingData.GrLivArea * self.trainingData.TotRmsAbvGrd < 45000]
    self.trainingData = self.trainingData[
        self.trainingData.GarageArea * self.trainingData.GarageCars < 3700]
    self.trainingData = self.trainingData[(
        self.trainingData.FullBath + (self.trainingData.HalfBath * 0.5) +
        self.trainingData.BsmtFullBath +
        (self.trainingData.BsmtHalfBath * 0.5)) < 5]
    # Two specific sales removed by exact price match.
    # NOTE(review): exact float equality on SalePrice — presumably these
    # target two known outlier rows; confirm the values still match the data.
    self.trainingData = self.trainingData.loc[~(
        self.trainingData.SalePrice == 392500.0)]
    self.trainingData = self.trainingData.loc[~(
        (self.trainingData.SalePrice == 275000.0) &
        (self.trainingData.Neighborhood == 'Crawfor'))]
    # log1p compresses the target's scale before modelling.
    self.trainingData.SalePrice = np.log1p(self.trainingData.SalePrice)
    # Delegate remaining quality checks/fixes to CheckDataQuality.
    CDQ = CheckDataQuality(self.trainingData, self.testingData)
    self.trainingData, self.testingData = CDQ.go()
    self.combinedData = [self.trainingData, self.testingData]
    return DataObject(self.trainingData, self.testingData, self.combinedData)
def go(self):
    """Run year/age conversions, dtype fixes and feature engineering.

    Returns:
        A 5-tuple: DataObject of the stored frames, the combined list,
        the training target, and the two column collections produced by
        featureEngineer.
    """
    self.trainingData = self.changeYearsToAge(self.trainingData)
    self.testingData = self.changeYearsToAge(self.testingData)
    self.trainingData = self.addRemodAndConvertAge(self.trainingData)
    self.testingData = self.addRemodAndConvertAge(self.testingData)
    self.trainingData = self.defineUint8Types(self.trainingData)
    self.testingData = self.defineUint8Types(self.testingData)
    ntrain = self.trainingData.shape[0]
    # NOTE(review): featureEngineer is called once per frame but receives
    # ntrain (the TRAINING row count) both times, and the second call
    # overwrites y_train/cols/colsP computed from the training pass with
    # values derived from the test pass. The sibling implementation
    # concatenates both frames and calls featureEngineer once — confirm
    # which behaviour is intended before relying on the returned y_train.
    self.trainingData, y_train, cols, colsP = self.featureEngineer(
        self.trainingData, ntrain)
    self.testingData, y_train, cols, colsP = self.featureEngineer(
        self.testingData, ntrain)
    self.combinedData = [self.trainingData, self.testingData]
    return DataObject(
        self.trainingData, self.testingData,
        self.combinedData), self.combinedData, y_train, cols, colsP
def correlateData(self):
    """Correlation-based filtering stage (currently a pass-through).

    Correlation filtering was explored but is disabled; the data flows
    through unchanged.

    Returns:
        DataObject wrapping the unmodified training, testing and combined data.
    """
    return DataObject(self.trainingData, self.testingData, self.combinedData)
def go(self, all_data, cols, polynomialColumns):
    """Select the union of "best" feature columns via several selectors.

    Runs p-value backward elimination, RFECV, sequential backward
    selection, two SelectKBest variants and an XGBoost importance-threshold
    sweep, then unions all of their picks with the polynomial columns.

    Args:
        all_data: concatenated train+test frame (test rows have SalePrice 0).
        cols: candidate feature columns (pandas Index).
        polynomialColumns: extra columns always included in the union.

    Returns:
        A 4-tuple: DataObject of the stored frames, the union column list,
        the RFECV-selected columns, and the XGBoost-selected columns.
    """
    # Rows with a positive SalePrice are the training portion of the
    # concatenated frame (test rows carry the 0 sentinel).
    trainingData = all_data.loc[(all_data.SalePrice > 0),
                                cols].reset_index(drop=True, inplace=False)
    y_train = all_data.SalePrice[all_data.SalePrice > 0].reset_index(
        drop=True, inplace=False)
    # Robust scaling limits the influence of outliers on the selectors.
    robustScaler = RobustScaler()
    robustScalerDataFrame = pd.DataFrame(robustScaler.fit_transform(
        trainingData[cols]), columns=cols)
    # 1) Backward elimination on p-values.
    pValueColumns = cols.values
    pValueColumns = self.backwardElimination(robustScalerDataFrame, y_train,
                                             pValueColumns)
    # 2) Recursive feature elimination with CV around a Lasso estimator.
    lasso = Lasso(alpha=0.0005, tol=0.002)
    recursiveFeatureEliminator = RFECV(estimator=lasso,
                                       n_jobs=-1,
                                       step=1,
                                       scoring='neg_mean_squared_error',
                                       cv=5)
    recursiveFeatureEliminator.fit(robustScalerDataFrame, y_train)
    recursivelySelectedFeatures = recursiveFeatureEliminator.get_support()
    recursiveFeatureSelectedColumns = cols[recursivelySelectedFeatures]
    # 3) Sequential (backward) feature selection scored by R^2.
    r2Score = r2_score
    lasso = Lasso(alpha=0.0005, tol=0.002)
    sequentialFeatureSelection = SequentialFeatureSelection(
        lasso, k_features=1, scoring=r2Score)
    sequentialFeatureSelection.fit(robustScalerDataFrame, y_train)
    sequentialFeatureSelectionScoreLength = len(
        sequentialFeatureSelection.scores_)
    # Boolean mask of the subset sizes that achieved the maximum score...
    sequentialFeatureSelectionScoreCriteria = (
        sequentialFeatureSelection.scores_ == max(
            sequentialFeatureSelection.scores_))
    arrangedSequentialFeatures = np.arange(
        0, sequentialFeatureSelectionScoreLength
    )[sequentialFeatureSelectionScoreCriteria]
    # ...and of those, keep the last (largest index) subset.
    maxSequentialFeatureScore = max(arrangedSequentialFeatures)
    sequentialFeatureSelectionSubsets = list(
        sequentialFeatureSelection.subsets_[maxSequentialFeatureScore])
    sequentialBackwardSelection = list(
        robustScalerDataFrame.columns[sequentialFeatureSelectionSubsets])
    # 4) Univariate selection: F-test, then mutual information.
    # NOTE(review): kBestValue and randomStateValue are names defined
    # elsewhere in this file — confirm they are in scope at runtime.
    kBestSelection = SelectKBest(score_func=f_regression, k=kBestValue)
    kBestSelection.fit(robustScalerDataFrame, y_train)
    select_features_kbest = kBestSelection.get_support()
    kbestWithFRegressionScoringFunction = cols[select_features_kbest]
    kBestSelection = SelectKBest(score_func=mutual_info_regression,
                                 k=kBestValue)
    kBestSelection.fit(robustScalerDataFrame, y_train)
    select_features_kbest = kBestSelection.get_support()
    kbestWithMutualInfoRegressionScoringFunction = cols[
        select_features_kbest]
    # 5) XGBoost importance-threshold sweep: keep the threshold whose
    # selected columns give the lowest hold-out MSE.
    X_train, X_test, y, y_test = train_test_split(
        robustScalerDataFrame,
        y_train,
        test_size=0.30,
        random_state=randomStateValue)
    model = XGBRegressor(base_score=0.5,
                         random_state=randomStateValue,
                         n_jobs=4,
                         silent=True)
    model.fit(X_train, y)
    bestValue = 1e36  # sentinel: any real MSE beats this
    bestColumns = 31
    my_model = model
    threshold = 0
    for modelThreshold in np.sort(np.unique(model.feature_importances_)):
        selectionsFromModel = SelectFromModel(model,
                                              threshold=modelThreshold,
                                              prefit=True)
        X_trainSelectedFromModel = selectionsFromModel.transform(X_train)
        modelForSelection = XGBRegressor(base_score=0.5,
                                         random_state=randomStateValue,
                                         n_jobs=4,
                                         silent=True)
        modelForSelection.fit(X_trainSelectedFromModel, y)
        X_testSelectedFromModel = selectionsFromModel.transform(X_test)
        y_pred = modelForSelection.predict(X_testSelectedFromModel)
        roundedPredictions = [
            round(predictedValue) for predictedValue in y_pred
        ]
        meanSquaredErrorValue = mean_squared_error(y_test,
                                                   roundedPredictions)
        # '>=' resolves MSE ties in favour of the later (larger) threshold.
        if (bestValue >= meanSquaredErrorValue):
            bestValue = meanSquaredErrorValue
            bestColumns = X_trainSelectedFromModel.shape[1]
            my_model = modelForSelection
            threshold = modelThreshold
    # Rank the full model's importances and keep the top bestColumns.
    listOfFeatureImportance = [
        (score, feature)
        for score, feature in zip(model.feature_importances_, cols)
    ]
    XGBestValues = pd.DataFrame(sorted(
        sorted(listOfFeatureImportance, reverse=True)[:bestColumns]),
                                columns=['Score', 'Feature'])
    XGBestColumns = XGBestValues.iloc[:, 1].tolist()
    # Union of every selector's picks plus the polynomial columns.
    unionSetOfBestColumns = set(pValueColumns)
    unionSetOfBestColumns = unionSetOfBestColumns.union(
        set(recursiveFeatureSelectedColumns))
    unionSetOfBestColumns = unionSetOfBestColumns.union(
        set(kbestWithFRegressionScoringFunction))
    unionSetOfBestColumns = unionSetOfBestColumns.union(
        set(kbestWithMutualInfoRegressionScoringFunction))
    unionSetOfBestColumns = unionSetOfBestColumns.union(set(XGBestColumns))
    unionSetOfBestColumns = unionSetOfBestColumns.union(
        set(sequentialBackwardSelection))
    unionSetOfBestColumns = unionSetOfBestColumns.union(
        set(polynomialColumns))
    unionSetOfBestColumnsList = list(unionSetOfBestColumns)
    return DataObject(
        self.trainingData, self.testingData, self.combinedData
    ), unionSetOfBestColumnsList, recursiveFeatureSelectedColumns, XGBestColumns
def go(self, all_data, cols, colsP):
    """Select features via several selectors, printing diagnostics.

    Runs OLS p-value backward elimination, RFECV, sequential backward
    selection, two SelectKBest variants and an XGBoost importance-threshold
    sweep, then unions all of their picks with colsP.

    Args:
        all_data: concatenated train+test frame (test rows have SalePrice 0).
        cols: candidate feature columns (pandas Index).
        colsP: extra columns always included in the final union.

    Returns:
        A 4-tuple: DataObject of the stored frames, the union column list,
        the RFECV-selected columns, and the XGBoost-selected columns.
    """
    # Train rows have a real SalePrice; test rows carry the 0 sentinel.
    train = all_data.loc[(all_data.SalePrice > 0),
                         cols].reset_index(drop=True, inplace=False)
    y_train = all_data.SalePrice[all_data.SalePrice > 0].reset_index(
        drop=True, inplace=False)
    test = all_data.loc[(all_data.SalePrice == 0),
                        cols].reset_index(drop=True, inplace=False)
    # Robust scaling before the statistical selectors.
    scale = RobustScaler()
    df = pd.DataFrame(scale.fit_transform(train[cols]), columns=cols)
    # Select features based on OLS p-values (backward elimination).
    ln_model = sm.OLS(y_train, df)
    result = ln_model.fit()
    print(result.summary2())
    pv_cols = cols.values
    SL = 0.051  # significance level for backward elimination
    pv_cols, LR = self.backwardElimination(df, y_train, SL, pv_cols)
    pred = LR.predict(df[pv_cols])
    # NOTE(review): y_pred thresholds a regression prediction at 0.5 and is
    # never used afterwards — looks like leftover classification code.
    y_pred = pred.apply(lambda x: 1 if x > 0.5 else 0)
    print('Fvalue: {:.6f}'.format(LR.fvalue))
    print('MSE total on the train data: {:.4f}'.format(LR.mse_total))
    # RFECV around a Lasso estimator.
    ls = Lasso(alpha=0.0005, max_iter=161, selection='cyclic', tol=0.002,
               random_state=101)
    rfecv = RFECV(estimator=ls, n_jobs=-1, step=1,
                  scoring='neg_mean_squared_error', cv=5)
    rfecv.fit(df, y_train)
    select_features_rfecv = rfecv.get_support()
    RFEcv = cols[select_features_rfecv]
    print('{:d} Features Select by RFEcv:\n{:}'.format(
        rfecv.n_features_, RFEcv.values))
    # Sequential backward selection scored by R^2.
    score = r2_score
    ls = Lasso(alpha=0.0005, max_iter=161, selection='cyclic', tol=0.002,
               random_state=101)
    sbs = SequentialFeatureSelection(ls, k_features=1, scoring=score)
    sbs.fit(df, y_train)
    print('Best Score: {:2.2%}\n'.format(max(sbs.scores_)))
    print('Best score with:{0:2d}.\n'.\
        format(len(list(df.columns[sbs.subsets_[np.argmax(sbs.scores_)]]))))
    # Of all subsets reaching the maximum score, keep the last one.
    SBS = list(df.columns[list(sbs.subsets_[max(
        np.arange(0, len(sbs.scores_))[(sbs.scores_ == max(sbs.scores_))])])])
    print('\nBest score with {0:2d} features:\n{1:}'.format(len(SBS), SBS))
    # Univariate K-best: F-test, then mutual information (k=80 each).
    skb = SelectKBest(score_func=f_regression, k=80)
    skb.fit(df, y_train)
    select_features_kbest = skb.get_support()
    kbest_FR = cols[select_features_kbest]
    scores = skb.scores_[select_features_kbest]
    skb = SelectKBest(score_func=mutual_info_regression, k=80)
    skb.fit(df, y_train)
    select_features_kbest = skb.get_support()
    kbest_MIR = cols[select_features_kbest]
    scores = skb.scores_[select_features_kbest]
    X_train, X_test, y, y_test = train_test_split(df, y_train,
                                                  test_size=0.30,
                                                  random_state=101)
    # Fit model on all training data. (importance_type='gain')
    model = XGBRegressor(base_score=0.5, colsample_bylevel=1,
                         colsample_bytree=1, gamma=0, max_delta_step=0,
                         random_state=101, min_child_weight=1, missing=None,
                         n_jobs=4, scale_pos_weight=1, seed=None,
                         silent=True, subsample=1)
    model.fit(X_train, y)
    # Sweep each unique importance as a threshold; keep the one whose
    # selected columns give the lowest hold-out MSE.
    thresholds = np.sort(np.unique(model.feature_importances_))
    best = 1e36  # sentinel: any real MSE beats this
    colsbest = 31
    my_model = model
    threshold = 0
    for thresh in thresholds:
        # select features using threshold
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)
        # train model on the reduced column set
        selection_model = XGBRegressor(base_score=0.5, colsample_bylevel=1,
                                       colsample_bytree=1, gamma=0,
                                       max_delta_step=0, random_state=101,
                                       min_child_weight=1, missing=None,
                                       n_jobs=4, scale_pos_weight=1,
                                       seed=None, silent=True, subsample=1)
        selection_model.fit(select_X_train, y)
        # eval model on the held-out split
        select_X_test = selection.transform(X_test)
        y_pred = selection_model.predict(select_X_test)
        predictions = [round(value) for value in y_pred]
        r2 = r2_score(y_test, predictions)
        mse = mean_squared_error(y_test, predictions)
        print(
            "Thresh={:1.3f}, n={:d}, R2: {:2.2%} with MSE: {:.4f}".format(
                thresh, select_X_train.shape[1], r2, mse))
        # '>=' resolves MSE ties in favour of the later (larger) threshold.
        if (best >= mse):
            best = mse
            colsbest = select_X_train.shape[1]
            my_model = selection_model
            threshold = thresh
    # Rank the full model's importances and keep the top colsbest.
    feature_importances = [
        (score, feature)
        for score, feature in zip(model.feature_importances_, cols)
    ]
    XGBest = pd.DataFrame(sorted(
        sorted(feature_importances, reverse=True)[:colsbest]),
                          columns=['Score', 'Feature'])
    XGBestCols = XGBest.iloc[:, 1].tolist()
    # Union of every selector's picks.
    bcols = set(pv_cols).union(set(RFEcv)).union(set(kbest_FR)).union(
        set(kbest_MIR)).union(set(XGBestCols)).union(set(SBS))
    # Features every selector agreed on — printed for diagnostics only.
    intersection = set(SBS).intersection(set(kbest_MIR)).intersection(
        set(RFEcv)).intersection(set(pv_cols)).intersection(
            set(kbest_FR)).intersection(set(XGBestCols))
    print(intersection, '\n')
    print('_' * 75, '\nUnion All Features Selected:')
    print('Total number of features selected:', len(bcols))
    print('\n{0:2d} features removed if use the union of selections: {1:}'.
          format(len(cols.difference(bcols)), cols.difference(bcols)))
    totalCols = list(bcols.union(set(colsP)))
    return DataObject(self.trainingData, self.testingData,
                      self.combinedData), totalCols, RFEcv, XGBestCols
def fill(self, data):
    """Impute every missing value in the combined housing frame.

    Applies per-feature fill rules (mode/median/constant, plus subgroup
    statistics for detached garages), then re-splits the frame into the
    stored training and testing portions by the SalePrice sentinel.

    Args:
        data: concatenated train+test frame (test rows have SalePrice 0).

    Returns:
        DataObject wrapping the re-split training, testing and combined data.
    """
    # Near-constant column; dropped before imputation.
    data.drop('Utilities', axis=1, inplace=True)

    # Simple constant imputations.
    data.Electrical = data.Electrical.fillna('SBrkr')
    data.GarageType = data.GarageType.fillna('NA')

    # Masonry veneer: reconcile type and area with each other, then fill.
    data.loc[(data.MasVnrType == 'None') & (data.MasVnrArea > 0),
             ['MasVnrType']] = 'BrkFace'
    data.loc[(data.MasVnrType.isnull()) & (data.MasVnrArea > 0),
             ['MasVnrType']] = 'BrkFace'
    data.loc[(data.MasVnrType != 'None') & (data.MasVnrArea == 0), ['MasVnrArea']] = \
        data.loc[(data.MasVnrType != 'None') & (data.MasVnrArea > 0),
                 ['MasVnrArea']].median()[0]
    data.MasVnrArea = data.MasVnrArea.fillna(0)
    data.MasVnrType = data.MasVnrType.fillna('None')

    # Garage features: detached garages get that subgroup's mode/median;
    # anything still missing means "no garage" and gets 'NA'/0.
    mode1 = data[data.GarageType == 'Detchd'].GarageFinish.mode()[0]
    mode2 = data[data.GarageType == 'Detchd'].GarageQual.mode()[0]
    mode3 = data[data.GarageType == 'Detchd'].GarageCond.mode()[0]
    median1 = data[data.GarageType == 'Detchd'].GarageArea.median()
    median2 = data[data.GarageType == 'Detchd'].GarageCars.median()
    median3 = data[data.GarageType == 'Detchd'].GarageYrBlt.median()
    data.loc[data.GarageType == 'Detchd', 'GarageFinish'] = data.loc[
        data.GarageType == 'Detchd', 'GarageFinish'].fillna(mode1)
    data.GarageFinish = data.GarageFinish.fillna('NA')
    data.loc[data.GarageType == 'Detchd', 'GarageQual'] = data.loc[
        data.GarageType == 'Detchd', 'GarageQual'].fillna(mode2)
    data.GarageQual = data.GarageQual.fillna('NA')
    data.loc[data.GarageType == 'Detchd', 'GarageCond'] = data.loc[
        data.GarageType == 'Detchd', 'GarageCond'].fillna(mode3)
    data.GarageCond = data.GarageCond.fillna('NA')
    data.loc[data.GarageType == 'Detchd', 'GarageArea'] = data.loc[
        data.GarageType == 'Detchd', 'GarageArea'].fillna(median1)
    data.GarageArea = data.GarageArea.fillna(0)
    data.loc[data.GarageType == 'Detchd', 'GarageCars'] = data.loc[
        data.GarageType == 'Detchd', 'GarageCars'].fillna(median2)
    data.GarageCars = data.GarageCars.fillna(0)
    data.loc[data.GarageType == 'Detchd', 'GarageYrBlt'] = data.loc[
        data.GarageType == 'Detchd', 'GarageYrBlt'].fillna(median3)
    data.GarageYrBlt = data.GarageYrBlt.fillna(0)

    # Basement: rows with real basement area but missing quality fields
    # get typical values; remaining NaNs mean "no basement".
    data.loc[(~data.TotalBsmtSF.isnull()) & (data.BsmtExposure.isnull()) & (
        data.TotalBsmtSF > 0), 'BsmtExposure'] = 'Av'
    data.loc[(~data.TotalBsmtSF.isnull()) & (data.BsmtQual.isnull()) & (
        data.TotalBsmtSF > 0), 'BsmtQual'] = 'TA'
    data.loc[(~data.TotalBsmtSF.isnull()) & (data.BsmtCond.isnull()) & (
        data.TotalBsmtSF > 0), 'BsmtCond'] = 'TA'
    data.loc[(data.BsmtFinSF2 > 0) & (data.BsmtFinType2.isnull()),
             'BsmtFinType2'] = 'Unf'
    data.loc[(data.BsmtFinSF2 == 0) & (data.BsmtFinType2 != 'Unf') & (
        ~data.BsmtFinType2.isnull()), 'BsmtFinSF2'] = 354.0
    data.loc[(data.BsmtFinSF2 == 0) & (data.BsmtFinType2 != 'Unf') & (
        ~data.BsmtFinType2.isnull()), 'BsmtUnfSF'] = 0.0
    nulls_cols = {'BsmtExposure': 'NA', 'BsmtFinType2': 'NA',
                  'BsmtQual': 'NA', 'BsmtCond': 'NA', 'BsmtFinType1': 'NA',
                  'BsmtFinSF1': 0, 'BsmtFinSF2': 0, 'BsmtUnfSF': 0,
                  'TotalBsmtSF': 0, 'BsmtFullBath': 0, 'BsmtHalfBath': 0}
    data = data.fillna(value=nulls_cols)

    # BUG FIX: this assignment was written as `data.loc.LotFrontage = ...`,
    # which sets an attribute on the .loc indexer object instead of the
    # LotFrontage column, so the neighborhood-mean imputation never reached
    # the frame. Assign the column directly.
    NegMean = data.groupby('Neighborhood').LotFrontage.mean()
    data.LotFrontage = data[['Neighborhood', 'LotFrontage']].apply(
        lambda x: NegMean[x.Neighborhood]
        if np.isnan(x.LotFrontage) else x.LotFrontage,
        axis=1)

    # Pool quality inferred from OverallQual (halved and mapped to grades).
    PoolQC = {0: 'NA', 1: 'Po', 2: 'Fa', 3: 'TA', 4: 'Gd', 5: 'Ex'}
    data.loc[(data.PoolArea > 0) & (data.PoolQC.isnull()), ['PoolQC']] = \
        ((data.loc[(data.PoolArea > 0) & (data.PoolQC.isnull()),
                   ['OverallQual']] / 2).round()).\
        apply(lambda x: x.map(PoolQC))
    data.PoolQC = data.PoolQC.fillna('NA')

    # data_description.txt tells us to assume typical 'Typ'.
    data.Functional = data.Functional.fillna('Typ')
    data.loc[(data.Fireplaces == 0) & (data.FireplaceQu.isnull()),
             ['FireplaceQu']] = 'NA'
    data.loc[(data.KitchenAbvGr > 0) & (data.KitchenQual.isnull()),
             ['KitchenQual']] = data.KitchenQual.mode()[0]
    data.Alley = data.Alley.fillna('NA')
    data.Fence = data.Fence.fillna('NA')
    data.MiscFeature = data.MiscFeature.fillna('NA')

    # Obvious data-entry typo: a garage cannot be built in 2207.
    data.loc[data.GarageYrBlt == 2207.0, 'GarageYrBlt'] = 2007.0

    data = DT().fit_transform(data)

    # Re-split: positive SalePrice rows are train, zero-sentinel rows are test.
    self.trainingData = data.loc[(data.SalePrice > 0)].reset_index(
        drop=True, inplace=False)
    self.testingData = data.loc[(data.SalePrice == 0)].reset_index(
        drop=True, inplace=False)
    data = [self.trainingData, self.testingData]
    return DataObject(self.trainingData, self.testingData, data)