class FeatureSelection:
    """Rank features with an extra-trees model and keep the best ones.

    The train and test frames are stacked (train rows first) into one frame.
    Only the leading ``train.shape[0]`` rows of the scaled data are used to
    fit the ranking model.
    """

    def __init__(self, train, test, id_column, y_column_name):
        """
        :param train: training frame; its row count bounds the fitting slice
        :param test: test frame, appended below the training rows
        :param id_column: name of the identifier column
        :param y_column_name: name of the target column
        """
        self.train = train
        self.test = test
        self.id_column = id_column
        self.y_column_name = y_column_name
        self.number_of_train = train.shape[0]
        self.data = pd.concat([train, test], ignore_index=True)
        self.feature_engineering = FeatureEngineering(
            train, test, id_column, y_column_name)
        # Kept aside so the id and target columns can be re-attached to the
        # selected features later.
        self.id_and_output = self.data[[id_column, y_column_name]]

    def preprocess_my_data(self):
        """
        Replace ``self.data`` with the output of the feature scaler.

        :return: the scaled data
        """
        scaled = self.feature_engineering.scale_features()
        self.data = scaled
        return scaled

    def perform_feature_selection(self, num_of_features_to_select):
        """
        Keep the ``num_of_features_to_select`` most important features.

        :param num_of_features_to_select: how many top-ranked features to keep
        :return: id and target columns followed by the selected features
        """
        scaled = self.preprocess_my_data()
        train_rows = scaled[:self.number_of_train]
        target = train_rows[self.y_column_name]
        predictors = train_rows.drop(
            [self.id_column, self.y_column_name], axis=1)

        ranker = ExtraTreesClassifier().fit(predictors, target)
        importances = pd.Series(
            ranker.feature_importances_, index=predictors.columns)
        top_columns = importances.nlargest(
            num_of_features_to_select).index.tolist()

        self.data = pd.concat(
            [self.id_and_output, self.data[top_columns]], axis=1)
        return self.data
class PreprocessedData:
    """
    Single entry point that chains the feature engineering and feature
    selection classes. It is built from the cleaned data and the
    y_column_name (ratings).
    """

    def __init__(self, data, y_column_name):
        self.data = data
        self.y_column_name = y_column_name
        # Both helpers are built from the same cleaned data and target name.
        self.feature_engineering = FeatureEngineering(data, y_column_name)
        self.feature_selection = FeatureSelection(data, y_column_name)

    def preprocess_my_data(self, num_of_features_to_select):
        """
        Run the engineering steps in order, then the feature selection.

        ``self.data`` is refreshed after every step, so it always holds the
        result of the last step that completed.

        :param num_of_features_to_select: number of best features to keep
        :return: the data returned by the feature selection step
        """
        engineer = self.feature_engineering
        self.data = engineer.input_rare_categorical()
        self.data = engineer.encode_categorical_features()
        self.data = engineer.scale_features()

        selector = self.feature_selection
        self.data = selector.perform_feature_selection(
            num_of_features_to_select)
        return self.data
class ProcessedData:
    """Scale the stacked train/test data, then keep the n best features."""

    def __init__(self, train, test, id_column, y_column_name):
        """
        :param train: training frame
        :param test: test frame, appended below the training rows
        :param id_column: name of the identifier column
        :param y_column_name: name of the target column
        """
        self.train = train
        self.test = test
        self.y_column_name = y_column_name
        self.id_column = id_column
        self.data = pd.concat([train, test], ignore_index=True)
        self.feature_engineering = FeatureEngineering(
            train, test, id_column, y_column_name)
        # NOTE(review): this file also defines a FeatureSelection whose
        # constructor takes (data, y_column_name) -- confirm the
        # four-argument variant is the one in scope here.
        self.feature_selection = FeatureSelection(
            train, test, id_column, y_column_name)

    def preprocess_my_data(self, num_of_features_to_select):
        """
        Scale the features and then run the feature selection.

        :param num_of_features_to_select: number of best features to keep
        :return: the data returned by the feature selection step
        """
        self.data = self.feature_engineering.scale_features()
        selector = self.feature_selection
        self.data = selector.perform_feature_selection(
            num_of_features_to_select)
        return self.data
class FeatureSelection:
    """
    This class performs feature selection. It takes the cleaned data and
    the y_column_name (ratings).
    """

    def __init__(self, data, y_column_name, number_of_train=300000):
        """
        :param data: cleaned full data
        :param y_column_name: name of the target column
        :param number_of_train: number of leading rows of the preprocessed
            data used to fit the selection model; the default is the value
            that used to be hard-coded
        """
        self.data = data
        self.y_column_name = y_column_name
        self.number_of_train = number_of_train
        # Target column kept as a one-column frame so it can be re-attached
        # to the selected features.
        self.y = self.data[[self.y_column_name]]
        self.feature_engineering = FeatureEngineering(data, y_column_name)

    def preprocess_my_data(self):
        """
        This method preprocesses the data by calling, in order, the
        rare-categorical, categorical-encoding and feature-scaling steps of
        the feature engineering object.

        :return: preprocessed full data
        """
        self.data = self.feature_engineering.input_rare_categorical()
        self.data = self.feature_engineering.encode_categorical_features()
        self.data = self.feature_engineering.scale_features()
        return self.data

    def perform_feature_selection(self, num_of_features_to_select):
        """
        This method performs the feature selection technique: an extra-trees
        classifier is fitted on the first ``number_of_train`` rows and the
        features with the largest importances are kept.

        :param num_of_features_to_select: number of best features to select
        :return: target column followed by the n selected features
        """
        preprocessed = self.preprocess_my_data()
        # Presumably the training rows come first in the preprocessed data;
        # verify against the caller.
        self.train = preprocessed[:self.number_of_train]

        ytrain = LabelEncoder().fit_transform(self.train[self.y_column_name])
        xtrain = self.train.drop([self.y_column_name], axis=1)

        feature_sel_model = ExtraTreesClassifier().fit(xtrain, ytrain)
        feat_importances = pd.Series(
            feature_sel_model.feature_importances_, index=xtrain.columns)
        selected_features_list = feat_importances.nlargest(
            num_of_features_to_select).index.tolist()

        self.data = pd.concat(
            [self.y, self.data[selected_features_list]], axis=1)
        return self.data