class DFLeaveOneOutEncoder(BaseEstimator, TransformerMixin):
    """DataFrame-friendly wrapper around LeaveOneOutEncoder.

    Encodes the selected columns and concatenates the result with the
    untouched remainder of the frame, preserving column names.
    """

    def __init__(self, columns=None, **kwargs):
        # Columns to encode; None means "all columns seen at fit time".
        self.columns = columns
        self.model = LeaveOneOutEncoder(**kwargs)
        # Resolved at fit time; None signals "not fitted yet".
        self.transform_cols = None

    def fit(self, X, y):
        """Fit the underlying encoder on the selected columns of X."""
        if self.columns is None:
            self.columns = X.columns
        # Preserve X's column order while restricting to the requested set.
        self.transform_cols = [col for col in X.columns if col in self.columns]
        self.model.fit(X[self.transform_cols], y)
        return self

    def transform(self, X):
        """Return X with the fitted columns replaced by their encodings."""
        return self.__apply(X)

    def __apply(self, X, y=None):
        # Shared implementation for transform() and fit_transform().
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )
        if y is None:
            encoded = self.model.transform(X[self.transform_cols])
        else:
            # Leave-one-out needs the target to produce training-time encodings.
            encoded = self.model.fit_transform(X[self.transform_cols], y)
        remainder = X.drop(columns=self.transform_cols)
        return pd.concat([remainder, encoded], axis=1)

    def fit_transform(self, X, y):
        # NOTE: Result of fit_transform() is different from fit() + transform()
        # because leave-one-out excludes each row's own target during training.
        return self.fit(X, y).__apply(X, y)
class MineFeatureManager(FeatureManager):
    """Feature manager for the used-car price dataset.

    Declares the numeric / categorical feature lists, leave-one-out
    encodes the high-cardinality columns ('name', 'model', 'regionCode'),
    and applies dataset-specific cleaning in ``_feature_engien``.
    """

    def __init__(self, num_config=None, categorical_config=None):
        self.num_features = [
            'power', 'kilometer', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8',
            'v_9', 'v_11', 'v_12', 'v_13', 'v_14', 'carAge', 'v_10_1',
            'v_10_2', 'v_10_3', 'nameEncode', 'modelEncode',
            'regionCodeEncode', 'gearbox', 'notRepairedDamage', 'seller',
            'offerType'
        ]
        self.categorical_features = ['model', 'brand', 'bodyType', 'fuelType', 'createMon']
        # High-cardinality columns replaced by leave-one-out target encodings.
        self.encoded_cates = ['name', 'model', 'regionCode']
        self.cate_encoder = LeaveOneOutEncoder(cols=self.encoded_cates)
        # 'model' values kept verbatim; populated in get_model_features().
        self.general_model = None
        super().__init__(self.num_features, self.categorical_features,
                         num_config, categorical_config)

    def _feature_engien(self, features):
        """Dataset-specific cleaning: sentinel removal, outlier masking,
        derived columns. Returns a new frame; thresholds come from EDA."""
        zero_na = {0: np.nan}
        # 0 acts as a missing-value sentinel in these columns.
        features = features.replace({'power': zero_na, 'v_5': zero_na, 'v_6': zero_na})
        features['carAge'] = (features['creatDate'] - features['regDate']).apply(lambda x: x.days)
        features['createMon'] = features['creatDate'].dt.month
        features['notRepairedDamage'] = features['notRepairedDamage'].replace('-', np.nan).astype(float)
        # Mask implausible values as NaN, then log-compress power.
        features.loc[features['power'] > 600, 'power'] = np.nan
        features['power'] = np.log(features['power'])
        features.loc[features['v_7'] > 0.5, 'v_7'] = np.nan
        features.loc[features['v_11'] > 10, 'v_11'] = np.nan
        features.loc[features['v_13'] > 7.5, 'v_13'] = np.nan
        features.loc[features['v_14'] > 7.5, 'v_14'] = np.nan
        # Split the multi-modal v_10 into three band features.
        low_band = features['v_10'] <= 0
        mid_band = (features['v_10'] >= 0) & (features['v_10'] < 6)
        high_band = features['v_10'] > 8
        features.loc[low_band, 'v_10_1'] = features.loc[low_band, 'v_10']
        features.loc[mid_band, 'v_10_2'] = features.loc[mid_band, 'v_10']
        features.loc[high_band, 'v_10_3'] = features.loc[high_band, 'v_10']
        # NOTE(review): general_model holds models with < 2000 occurrences, so
        # this nulls the *frequent* models -- confirm the direction is intended.
        features.loc[~features['model'].isin(self.general_model), 'model'] = np.nan
        return features

    def get_model_features(self, features):
        """Fit the encoders on training data and return model-ready features."""
        features = features.copy()
        # BUG FIX: was computed from a module-level `df`; use the frame that
        # was actually passed in (and compute value_counts only once).
        model_counts = features['model'].value_counts()
        self.general_model = model_counts[model_counts < 2000].index
        encoded_cate = self.cate_encoder.fit_transform(
            features[self.encoded_cates], features['logPrice'])
        for cate in self.encoded_cates:
            features[cate + 'Encode'] = encoded_cate[cate]
        features = self._feature_engien(features)
        return super().get_model_features(features)

    def transform_feature(self, features):
        """Apply the already-fitted encoders to new data (e.g. the test set)."""
        features = features.copy()
        encoded_cate = self.cate_encoder.transform(features[self.encoded_cates])
        for cate in self.encoded_cates:
            features[cate + 'Encode'] = encoded_cate[cate]
        features = self._feature_engien(features)
        return super().get_model_features(features)
class LeaveOneOutEncoder():
    """Maps each categorical value to one column using LeaveOneOut encoding.

    Parameters:
        cols: [str] list of column names to encode.
    """
    name = 'leave_one_out'

    def __init__(self, cols=None):
        self.encoder = LeaveOneOut(cols=cols)

    def fit(self, X, features, y):
        """Fits encoder to data table.

        returns self
        """
        self.encoder.fit(X, y)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and updates features accordingly.

        returns encoded matrix (dataframe)
        """
        X_new = self.encoder.transform(X)
        X_new.columns = self._rename_columns(self.features)
        return X_new

    def fit_transform(self, X, features, y=None):
        """First fits, then transforms matrix.

        returns encoded matrix (dataframe)
        """
        # BUG FIX: previously called self.encoder.fit(X, y) and then
        # self.encoder.fit_transform(X, y), fitting the underlying encoder
        # twice. fit_transform alone performs the fit; encode_features_list
        # runs afterwards so self.encoder.cols is populated either way.
        X_new = self.encoder.fit_transform(X, y)
        self.features = self.encode_features_list(X, features)
        X_new.columns = self._rename_columns(self.features)
        return X_new

    def get_mapping(self, category):
        """Gets the mapping for the LeaveOneOut encoder. Only takes strings
        of the column name, not the index number.

        returns mapping (dict)
        """
        return self.encoder.mapping[category]

    def encode_features_list(self, X, features):
        # Wrap each encoded column's feature in a LeaveOneOutEnc primitive so
        # downstream featuretools code can recompute the encoding.
        feature_list = []
        for f in features:
            if f.get_name() in self.encoder.cols:
                f = ft.Feature([f], primitive=LeaveOneOutEnc(self, f.get_name()))
            feature_list.append(f)
        return feature_list

    def _rename_columns(self, features):
        # Flatten the (possibly multi-output) feature names in order.
        feature_names = []
        for feature in features:
            for fname in feature.get_feature_names():
                feature_names.append(fname)
        return feature_names

    def get_features(self):
        return self.features

    def get_name(self):
        return self.name
# Replace missing lab values: numeric columns with their mean,
# CREATININE with its mode. Assign back instead of calling
# fillna(..., inplace=True) on a column selection -- chained inplace
# mutation is deprecated and unreliable under pandas copy-on-write.
df['HB'] = df['HB'].fillna(df['HB'].mean())
df['CREATININE'] = df['CREATININE'].fillna(df['CREATININE'].mode()[0])
df['UREA'] = df['UREA'].fillna(df['UREA'].mean())

# Drop identifier / code columns that carry no predictive signal.
df_drop = df.drop(['SL.', 'PAST MEDICAL HISTORY CODE'], axis=1)
cat_col = df_drop.select_dtypes(exclude=np.number).columns

# Leave-one-out encode every non-numeric column against the cost target.
le = LeaveOneOutEncoder()
df_drop[cat_col] = le.fit_transform(X=df_drop[cat_col],
                                    y=df_drop['TOTAL COST TO HOSPITAL '])

# Model inputs (note the trailing space in the target column name --
# it is part of the actual column label).
X = df_drop.drop('TOTAL COST TO HOSPITAL ', axis=1)
y = df_drop['TOTAL COST TO HOSPITAL ']

# Page layout. st.beta_columns was removed in Streamlit 1.0;
# st.columns is the stable replacement with the same signature.
col1 = st.sidebar
col2, col3 = st.columns((1, 1))
empty = pd.DataFrame(columns=X.columns)

# Manual input widgets in the sidebar.
age_val = col1.slider('Age of the Patient', 0, 120, 30)
gen_val = col1.selectbox(' Select Gender of Patient', ('Male', 'Female'))
mar_val = col1.radio(' Select Marital Status', ('Married', 'Unmarried'))
# One-hot encode the low-cardinality nominal columns.
X = X.drop(["nom_0", "nom_1", "nom_2", "nom_3", "nom_4"], axis=1) \
    .join(pd.get_dummies(X[["nom_0", "nom_1", "nom_2", "nom_3", "nom_4"]]))

# Leave-one-out encode the high-cardinality nominals against the target.
# (A FeatureHasher approach was tried and removed.)
from category_encoders import LeaveOneOutEncoder

loo_encoder = LeaveOneOutEncoder(cols=["nom_5", "nom_6", "nom_7", "nom_8", "nom_9"])
loo_X = loo_encoder.fit_transform(X[["nom_5", "nom_6", "nom_7", "nom_8", "nom_9"]], y)
X = X.drop(["nom_5", "nom_6", "nom_7", "nom_8", "nom_9"], axis=1).join(loo_X)

# Ordinal columns with a known order: map categories to integer ranks.
# Assign back rather than chained replace(..., inplace=True), which is
# deprecated and unreliable under pandas copy-on-write.
X['ord_1'] = X['ord_1'].replace(
    to_replace=['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'],
    value=[0, 1, 2, 3, 4])
X['ord_2'] = X['ord_2'].replace(
    to_replace=['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'],
    value=[0, 1, 2, 3, 4, 5])

# BUG FIX: LabelEncoder expects a 1-D array; passing the single-column
# DataFrame X[[i]] is invalid. Use the Series X[i] instead.
from sklearn.preprocessing import LabelEncoder

for i in ["ord_3", "ord_4"]:
    le = LabelEncoder()
    X[i] = le.fit_transform(X[i])
def DataCleaner(values_df, labels_df, test_df):
    """Clean the water-point train/test frames and target-encode categoricals.

    Parameters:
        values_df: training feature frame (must contain 'id').
        labels_df: training labels frame (must contain 'id' and 'status_group').
        test_df:   test feature frame with the same raw columns as values_df.

    Returns:
        (df, test_df): cleaned training frame (with integer 'status_group')
        and cleaned test frame, both with categoricals leave-one-out encoded.
    """
    # Columns dropped as redundant / noisy in both frames.
    to_drop = [
        'funder', 'num_private', 'subvillage', 'region_code', 'recorded_by',
        'source_type', 'waterpoint_type', 'scheme_name', 'payment_type',
        'quantity_group'
    ]

    def _prepare(frame):
        # Shared cleaning: mode-fill NaNs, drop noisy columns, expand the
        # recording date into year/month features.
        for col in frame.columns[frame.isna().sum() > 0]:
            frame[col].fillna(value=frame[col].mode()[0], inplace=True)
        frame.drop(columns=to_drop, inplace=True)
        frame['date_recorded'] = pd.to_datetime(frame['date_recorded'])
        frame['year_recorded'] = frame['date_recorded'].dt.year
        frame['month_recorded'] = frame['date_recorded'].dt.month
        frame.drop(columns='date_recorded', inplace=True)
        return frame

    # Training set.
    df = pd.merge(values_df, labels_df, on='id')
    df = _prepare(df)
    df['status_group'] = df['status_group'].map({
        'functional': 2,
        'functional needs repair': 1,
        'non functional': 0
    })

    # Test set. BUG FIX: the passed-in test_df was previously ignored and
    # re-read from 'test_set_values.csv'; use the parameter (copied so the
    # caller's frame is not mutated in place).
    test_df = _prepare(test_df.copy())

    # Leave-one-out encode every remaining categorical against the target:
    # fit on the training frame, apply the fitted mapping to the test frame.
    target = 'status_group'
    encoder = LeaveOneOutEncoder()
    te_everything = [
        'wpt_name', 'basin', 'region', 'district_code', 'lga', 'ward',
        'scheme_management', 'installer', 'source', 'extraction_type',
        'extraction_type_group', 'extraction_type_class', 'management',
        'payment', 'water_quality', 'management_group', 'quality_group',
        'quantity', 'source_class', 'waterpoint_type_group'
    ]
    for c in te_everything:
        df[str(c) + '_encoded'] = encoder.fit_transform(
            df[c].values, df[target])                       # TRAINING SET
        test_df[str(c) + '_encoded'] = encoder.transform(
            test_df[c].values)                              # TEST SET
        df.drop(columns=c, inplace=True)
        test_df.drop(columns=c, inplace=True)

    return df, test_df