def CleanMissingValue(df_Comics):
    """Clean missing values in the comics DataFrame.

    The frame is modified in place (columns and rows are dropped with
    ``inplace=True``) and the same object is returned.
    """
    # Columns removed outright; the original author's note describes them as
    # "containing <25% missing values".
    dropped_columns = ["EYE", "GSM", "HAIR"]
    df_Comics.drop(dropped_columns, axis=1, inplace=True)

    # Rows lacking either of these values are discarded (author's note: <4%
    # of the values are missing).
    required_columns = ["ALIVE", "YEAR OF FIRST APPEARANCE"]
    df_Comics.dropna(subset=required_columns, inplace=True)

    # Missing appearance counts take the column mean.
    appearances = df_Comics["NUMBER OF APPEARANCES"]
    df_Comics["NUMBER OF APPEARANCES"] = appearances.fillna(appearances.mean())

    # Remaining categorical gaps are filled with the most frequent value; one
    # imputer instance is refit for each column in turn.
    imputer = CategoricalImputer()
    for column in ("GENDER", "IDENTITY TYPE", "TEAM"):
        df_Comics[column] = imputer.fit_transform(df_Comics[column])

    return df_Comics
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing ``Age_cats``, ``Sex`` and ``Embarked`` values with the mode.

    ``Sex`` and ``Embarked`` are additionally replaced by their integer
    category codes; ``Age_cats`` is returned as a categorical column.
    """

    def fit(self, X, y=None):
        """Create the underlying CategoricalImputer; nothing is learned here."""
        self.imputer = CategoricalImputer()
        return self

    def _impute_as_category(self, X, column):
        """Return ``X[column]`` mode-imputed, as a categorical Series.

        The imputer hands back a plain array, so the Series is rebuilt on
        ``X.index``. Without that it would carry a fresh 0..n-1 index and the
        assignments in ``transform`` would misalign (yielding NaN / -1 codes)
        for any frame whose index is not the default range, e.g. after a
        train/test split.
        """
        filled = self.imputer.fit_transform(X[column].copy())
        return pd.Series(filled, index=X.index).astype('category')

    def transform(self, X):
        """Impute and encode the three columns of ``X`` in place and return it."""
        # The imputer is refit per column, in the same order as before.
        age_cats_imputed = self._impute_as_category(X, 'Age_cats')
        sex_imputed = self._impute_as_category(X, 'Sex')
        embarked_imputed = self._impute_as_category(X, 'Embarked')

        X.Sex = sex_imputed.cat.codes
        X.Embarked = embarked_imputed.cat.codes
        X.Age_cats = age_cats_imputed
        return X
def clean_impute(df):
    """Return the values of ``df`` after numeric and categorical imputation.

    Numeric gaps are filled with the column mean; the resulting value array
    is then passed through a default CategoricalImputer, whose output is
    returned. The input frame itself is not modified.
    """
    # numeric_only=True restricts the means to numeric columns; without it
    # current pandas raises as soon as the frame contains a text column.
    numeric_filled = df.fillna(df.mean(numeric_only=True))

    imputer = CategoricalImputer()
    return imputer.fit_transform(numeric_filled.values)
def preproccesing(data_sampled, pca_components = 5):
    """Clean the sampled loan data and return the encoded feature matrix.

    Steps performed:
        1. Drop the ID-style columns (branch, supplier, manufacturer, pincode,
           state and employee code).
        2. Fill missing ``Employment.Type`` values with a CategoricalImputer.
        3. Convert the two date columns and the two "Xyrs Ymon" span columns
           to numbers through the ``Utils`` helpers.
        4. Label-encode and then one-hot encode the two categorical columns.

    Scaling and PCA are currently disabled, so ``pca_components`` is accepted
    for backward compatibility but has no effect.

    Returns:
        The dense array produced by the one-hot encoder for the first 34
        remaining columns.
    """
    import functools  # used to bind the second argument of Utils.date_diff
    from sklearn_pandas import CategoricalImputer

    # Drop the ID-style columns.
    data_sampled = data_sampled.drop(
        ['branch_id', 'supplier_id', 'manufacturer_id', 'Current_pincode_ID',
         'State_ID', 'Employee_code_ID'],
        axis=1)

    # Fill the missing employment types.
    imputer = CategoricalImputer()
    data_sampled['Employment.Type'] = imputer.fit_transform(
        data_sampled['Employment.Type'])

    # Categorical to continuous: each date column is parsed and then turned
    # into its difference from today's date. "Today" is taken once so both
    # columns share the same reference date.
    today = datetime.datetime.today().date()
    diff_from_today = functools.partial(Utils.date_diff, date2=today)
    for column in ('Date.of.Birth', 'DisbursalDate'):
        data_sampled[column] = list(map(Utils.toDate, data_sampled[column]))
        data_sampled[column] = list(map(diff_from_today, data_sampled[column]))

    # Span strings become a month count, e.g. 1yrs 10mon = 1*12 + 10 = 22.
    for column in ('AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH'):
        data_sampled[column] = list(map(Utils.total_span, data_sampled[column]))

    x = data_sampled.iloc[:, 0:34].values

    # Label-encode the categorical columns. Positions 5 and 14 presumably hold
    # Employment.Type and PERFORM_CNS.SCORE.DESCRIPTION after the drop above,
    # assuming the input keeps the dataset's original column order - TODO confirm.
    categorical_positions = [5, 14]
    encoder = LabelEncoder()
    for position in categorical_positions:
        x[:, position] = encoder.fit_transform(x[:, position])

    # One-hot encode the label-encoded columns.
    # NOTE(review): the `categorical_features` argument is not accepted by
    # recent scikit-learn releases - confirm the pinned version, or migrate
    # this step to a ColumnTransformer.
    hot_encoder = OneHotEncoder(categorical_features=categorical_positions)
    x = hot_encoder.fit_transform(x).toarray()

    # Scaling (StandardScaler) and PCA(n_components=pca_components) used to
    # follow here and are intentionally left disabled.
    return (x)
def encodeCategoricalValues(self,data):
    """
    Method Name: encodeCategoricalValues
    Description: This method imputes the missing values and encodes the
                 categorical values in the training set. Object-dtype columns
                 with missing values are filled by a CategoricalImputer,
                 numeric columns with missing values by a 3-neighbour
                 KNNImputer. 'laundry_options' and 'parking_options' are
                 label encoded and then expanded into dummy columns
                 (first level dropped).
                 The imputation and label encoding are written back into
                 the passed DataFrame.
    Output: A new DataFrame which has the categorical values encoded.
    On Failure: Raise Exception

    Written By: Ajinkya Abhang
    Version: 1.0
    Revisions: None
    """
    # Impute the categorical (object dtype) columns that contain NaN.
    features_nan = [feature for feature in data.columns
                    if data[feature].dtypes == 'O' and data[feature].isnull().any()]
    imputer = CategoricalImputer()
    for cat_feature in features_nan:
        data[cat_feature] = imputer.fit_transform(data[cat_feature])

    # Impute the non-categorical columns that contain NaN. The check is
    # "any missing value": the earlier "> 1" test left columns with exactly
    # one missing value untouched.
    numerical_with_nan = [feature for feature in data.columns
                          if data[feature].dtypes != 'O' and data[feature].isnull().any()]
    if numerical_with_nan:
        knn_imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
        data[numerical_with_nan] = knn_imputer.fit_transform(data[numerical_with_nan])

    # Label encode, then create the dummy columns in a single pass.
    dummy_features = ['laundry_options', 'parking_options']
    labelencoder = LabelEncoder()
    for feature in dummy_features:
        data[feature] = labelencoder.fit_transform(data[feature])

    data_df = pd.get_dummies(data, columns=dummy_features, drop_first=True)
    return data_df
def imputacion_variable_delegacion(X_train, X_test):
    """Impute the 'delegacion_inicio' variable with its mode.

    The imputer is fitted on the training set only and then applied to the
    test set. Both frames are updated in place and returned.
    """
    delegacionInicio_imputer = CategoricalImputer(strategy='most_frequent')

    # Training set: fit and fill.
    train_column = X_train.delegacion_inicio.values.reshape(X_train.shape[0], 1)
    X_train['delegacion_inicio'] = delegacionInicio_imputer.fit_transform(train_column)

    # Test set: reuse the value learned from the training data.
    test_column = X_test.delegacion_inicio.values.reshape(X_test.shape[0], 1)
    X_test['delegacion_inicio'] = delegacionInicio_imputer.transform(test_column)

    return X_train, X_test
def test_default_fill_value_for_constant_strategy(input_type):
    """The 'constant' strategy falls back to '?' when no fill value is given."""
    raw = ['a', np.nan, 'b', 'b']
    X = pd.Series(raw) if input_type == 'pd' else np.asarray(raw, dtype=object)

    imputer = CategoricalImputer(strategy='constant')
    Xt = imputer.fit_transform(X)

    assert imputer.fill_ == '?'
    expected = ['a', imputer.fill_, 'b', 'b']
    assert (Xt == expected).all()
def test_missing_values_param(input_type):
    """A custom ``missing_values`` marker is replaced by the most frequent value."""
    raw = ['x', 'y', 'a_missing', 'y']
    X = pd.Series(raw) if input_type == 'pd' else np.asarray(raw, dtype=object)

    imp = CategoricalImputer(missing_values='a_missing')
    Xt = imp.fit_transform(X)

    expected = np.array(['x', 'y', 'y', 'y'])
    assert (Xt == expected).all()
def fit_transform(self, df: 'dataframe') -> 'dataframe':
    """Fill in missing categorical values using the most frequent value.

    A single CategoricalImputer is refit on every column of ``df``; the
    column-wise results are returned as a new DataFrame.
    """
    imputer = CategoricalImputer()

    def _fill_column(column):
        # Each column is fitted and transformed independently.
        return imputer.fit_transform(column)

    return df.apply(_fill_column, axis=0)
def test_copy_param(input_type):
    """With ``copy=False`` the imputer also fills the input object itself."""
    raw = ['a', np.nan, 'b', 'a']
    X = pd.Series(raw) if input_type == 'pd' else np.asarray(raw, dtype=object)

    imp = CategoricalImputer(copy=False)
    Xt = imp.fit_transform(X)

    Xe = np.array(['a', 'a', 'b', 'a'])
    # Both the returned values and the original input must hold the fill.
    assert (Xt == Xe).all()
    assert (X == Xe).all()
def impute_categorical(data):
    """Return ``data`` with '?' treated as missing and text columns imputed.

    Every object-dtype column is filled by a CategoricalImputer. A column the
    imputer cannot handle is carried over unchanged (best effort). The
    non-object columns come first in the result, followed by the object
    columns in their original order.
    """
    from sklearn_pandas import CategoricalImputer

    data = data.replace('?', np.nan)
    cat_cols = data.select_dtypes(include=object)
    cat_col_names = cat_cols.columns.values
    print('Categorical Columns')
    pprint(cat_col_names)

    ci = CategoricalImputer()
    pieces = [data.drop(columns=cat_col_names)]
    for col in cat_col_names:
        try:
            col_data = ci.fit_transform(cat_cols[col].values)
        except Exception:
            # Best effort: keep the raw column when imputation fails.
            pieces.append(cat_cols[col])
        else:
            # Re-attach the original index and column name; a bare
            # DataFrame(col_data) would be labelled 0 and re-indexed 0..n-1,
            # which misaligns rows and loses the column identity.
            pieces.append(
                pd.Series(col_data, index=data.index, name=col, dtype=object))

    return pd.concat(pieces, axis=1)
# ============================================================================='''
# Count the missing values per column.
data_missing = dataset.isnull().sum()
print(data_missing)

# Numpy array (every column except the last) used for imputing missing values
X = dataset.iloc[:, :-1].values

# =============================================================================
## Missing Categorical Values
# =============================================================================
from sklearn_pandas import CategoricalImputer
# Column 8 is written back as 'Outlet_Size' below.
data = np.array(X[:,8], dtype=object)
imputer = CategoricalImputer()
X[:,8] = imputer.fit_transform(data)
dataset['Outlet_Size'] = X[:,8]

# =============================================================================
# # Imputer for numeric values
# =============================================================================
# NOTE(review): `sklearn.preprocessing.Imputer` is the legacy imputer class;
# confirm it exists in the installed scikit-learn version.
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
# Column 1 is written back as 'Item_Weight' below.
X[:, 1:2] = imputer.fit_transform(X[:, 1:2])
dataset['Item_Weight'] = X[:,1:2]

# Check Values in Item Visibilty
dataset.Item_Visibility.value_counts()

# Replace 0 with NaN. Assigning the result back (instead of calling
# replace(..., inplace=True) on the column selection) guarantees that the
# DataFrame itself is updated.
dataset['Item_Visibility'] = dataset['Item_Visibility'].replace(0.000000, np.nan)
plt.show()

## bivariate
sns.boxplot(x='Interest_Rate', y='Total_Accounts', data=Data)
plt.show()

## Impute the data
# Train and test rows are stacked so both receive the same treatment.
Data = pd.concat([TrainData, TestData], axis=0)

from sklearn_pandas import CategoricalImputer
imputer = CategoricalImputer()

# imputing the missing values from the column
Data['Home_Owner'] = imputer.fit_transform(Data['Home_Owner'])
Data['Length_Employed'] = imputer.fit_transform(Data['Length_Employed'])

# The fills below are assigned back to the frame; calling
# fillna(..., inplace=True) on a column selection is not guaranteed to
# update the DataFrame.
Data['Months_Since_Deliquency'] = Data['Months_Since_Deliquency'].fillna(0)
Data['Annual_Income'] = Data['Annual_Income'].fillna(Data['Annual_Income'].mean())

# Debt amount = debt-to-income ratio times one twelfth of the annual income.
Data['Debt_amount'] = Data['Debt_To_Income'] * (Data['Annual_Income'] / 12)
Data['Debt_amount'] = Data['Debt_amount'].fillna(Data['Debt_amount'].mean())

# Label-encode a copy of the text columns.
cat_df = Data.select_dtypes(include=['object']).copy()
cat_df.columns

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for column in cat_df.columns:
    cat_df[column] = label_encoder.fit_transform(cat_df[column])
esd_df = pd.read_csv('/Users/encoreai/Desktop/new1.csv', encoding='iso-8859-1', sep=',', engine='python')
list(esd_df.columns)
esd_df.shape

# Finding out the null / NaN values in the columns:
# for _ in esd_df.columns:
#     print("The number of null values in:{} == {}".format(_, esd_df[_].isnull().sum()))

# Fill the missing document types. The imputer returns the filled values
# (it works on a copy by default), so the result has to be captured -
# discarding it left 'Doc_type' unimputed.
esd_array = esd_df['Doc_type'].values
imputer = CategoricalImputer()
esd_array = imputer.fit_transform(esd_array)

# Assigned back rather than filled "inplace" on a column selection, which is
# not guaranteed to update the DataFrame.
esd_df["Error_detail"] = esd_df["Error_detail"].fillna("No detail")
#print(esd_df)

# Re-attach the imputed column; dropping it first moves it to the end.
esd_df = esd_df.drop(["Doc_type"], axis=1)
esd_df['Doc_type'] = esd_array
esd = esd_df.copy()

# Binary-encode the categorical columns one after another.
encoder_tc = ce.BinaryEncoder(cols=['Ticket_Category'])
df_tc = encoder_tc.fit_transform(esd)
encoder_et = ce.BinaryEncoder(cols=['Error_type'])
df_et = encoder_et.fit_transform(df_tc)
encoder_ed = ce.BinaryEncoder(cols=['Error_detail'])
df_ed = encoder_ed.fit_transform(df_et)
class Preprocessor:
    """Clean and transform a pandas DataFrame before training.

    Progress and failures are reported through
    ``logger_object.log(file_object, message)``.
    """

    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object

    def replaceInvalidValuesWithNull(self, data):
        """Replace the '?' placeholder with NaN in every column that has one.

        Columns are reassigned on the passed DataFrame, which is returned.
        """
        for column in data.columns:
            count = data[column][data[column] == '?'].count()
            if count != 0:
                # np.nan is the portable spelling; the np.NaN alias no
                # longer exists in NumPy 2.
                data[column] = data[column].replace('?', np.nan)
        return data

    def is_null_present(self, data):
        """
        Method Name: is_null_present
        Description: This method checks whether there are null values present in the pandas Dataframe or not.
        Output: Returns True if null values are present in the DataFrame, False if they are not present and
                returns the list of columns for which null values are present.
                When nulls are present, the per-column counts are also written to
                'preprocessing_data/null_values.csv'.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(
            self.file_object,
            'Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            # count of null values per column
            self.null_counts = data.isna().sum()
            # Pair names and counts positionally; integer indexing into the
            # label-indexed Series (null_counts[i]) is ambiguous and fails on
            # recent pandas.
            self.cols_with_missing_values = [
                column
                for column, missing in zip(self.cols, self.null_counts)
                if missing > 0
            ]
            self.null_present = bool(self.cols_with_missing_values)
            if self.null_present:
                # store the null column information to file
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(
                    self.null_counts)
                self.dataframe_with_null.to_csv(
                    'preprocessing_data/null_values.csv')
            self.logger_object.log(
                self.file_object,
                'Finding missing values is a success.Data written to the null values file. Exited the is_null_present method of the Preprocessor class'
            )
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in is_null_present method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Finding missing values failed. Exited the is_null_present method of the Preprocessor class'
            )
            # Same exception type as before, now chained to the real cause.
            raise Exception() from e

    def impute_missing_values(self, data, cols_with_missing_values):
        """Fill the listed columns of ``data`` using a CategoricalImputer.

        The columns are overwritten on the passed DataFrame, which is
        returned. Any failure is logged and re-raised as ``Exception``.
        """
        self.logger_object.log(
            self.file_object,
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self.logger_object.log(
                self.file_object,
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in impute_missing_values method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
            )
            raise Exception() from e

    def separate_label_feature(self, data, label_column_name):
        """Split ``data`` into features and label.

        Returns ``(X, Y)``: ``data`` without ``label_column_name`` and the
        label column itself. Any failure is logged and re-raised as
        ``Exception``.
        """
        self.logger_object.log(
            self.file_object,
            'Entered the separate_label_feature method of the Preprocessor class'
        )
        try:
            self.X = data.drop(labels=label_column_name, axis=1)
            self.Y = data[label_column_name]
            self.logger_object.log(
                self.file_object,
                'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class'
            )
            return self.X, self.Y
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in separate_label_feature method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class'
            )
            raise Exception() from e
def impute_categorical(df, col_name):
    """Fill the missing values of ``df[col_name]`` with a CategoricalImputer.

    The column is overwritten on ``df`` itself, which is also returned.
    """
    filled_column = CategoricalImputer().fit_transform(df[col_name])
    df[col_name] = filled_column
    return df
def _impute_data(df: pd.DataFrame, categorical_all: bool = False, categorical_subset: list = None) -> pd.DataFrame: """Imputes missing numerical or categorical values if the percentage of rows containing NaN's is > 5%. Else, returns a dataframe without those rows. Usage: ------- dataframe_no_nan = impute_data(dataframe_with_nan) """ # try to infer object types, as this will make calculating numeric columns much easier df = df.infer_objects() # If there are very few missing values (<= 5%), then just drop those rows and return the DataFrame, as # it should be enough for the provided plots if df.isna().sum().sum() / df.shape[0] <= 0.05: return df.dropna().get_dummies() catimpute = CategoricalImputer() if categorical_all is True and categorical_subset is not None: warnings.warn( "categorical_all and subset both specified ... using subset and continuing" ) categorical_all = False # Try and make dummies for all categorical columns if categorical_all: likely_categorical_cols = [] for col in df.columns: if is_likely_categorical(df[col]): df[col] = pd.get_dummies(data=df[col]) df[col] = catimpute.fit_transform(df[col]) likely_categorical_cols.append(col) if len(likely_categorical_cols) > 0: # Grammatically correct if len(likely_categorical_cols) > 1: warnings.warn( "Columns {} are likely categorical, creating dummies. Run with categorical=False (to disable all) or categorical_subset=[column names] to disable warning" .format(likely_categorical_cols)) else: warnings.warn( "Column \"{}\" is likely categorical, creating dummies. 
Run with categorical=False (to disable all) or categorical_subset=[column names] to disable warning" .format(likely_categorical_cols[0])) # Or only make dummies for specified columns if categorical_subset is not None: for col in categorical_subset: # NaN's should be ignored here df[col] = pd.get_dummies(data=df[col]) df[col] = catimpute.fit_transform(df[col]) df.infer_objects() for col in df.columns: if df[col].isna().sum() > 0: if _is_numeric(df[col]): # fill using mean TODO: allow this to be specified df[col].fillna(df[col].mean(), inplace=True) else: warnings.warn( "Column \"{}\" cannot be made numeric, dropping and continuing. If this is incorrect, specify it as categorical or transform to a numeric dtype" .format(col)) df.drop(col, axis=1, inplace=True) return df
def predict():
    """Train the ticket-resolution model and return a JSON prediction.

    The function reads the ticket CSV, imputes and binary-encodes the
    categorical columns, fits a random forest, tunes it with a 5-fold grid
    search, stores the tuned model in 'model.pkl' and returns
    ``jsonify({'prediction': [...]})`` for the first row of the held-out
    test split.

    After encoding, the feature columns are named Ticket_Category_<i>,
    Error_type_<i>, Error_detail_<i> and Doc_type_<i>.
    """
    print("__________________________")
    # Only the modules this function uses are imported. The former
    # `sklearn.externals.joblib` import was unused.
    import pickle
    import warnings

    import category_encoders as ce
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
    from sklearn_pandas import CategoricalImputer

    warnings.filterwarnings("ignore")

    esd_df = pd.read_csv('/Users/encoreai/Desktop/new1.csv', encoding='iso-8859-1', sep=',', engine='python')

    # Fill the missing document types. The imputer returns the filled values
    # (it works on a copy by default), so the result must be captured -
    # discarding it left 'Doc_type' unimputed.
    esd_array = CategoricalImputer().fit_transform(esd_df['Doc_type'].values)
    esd_df["Error_detail"] = esd_df["Error_detail"].fillna("No detail")

    # Re-attach the imputed column; dropping it first moves it to the end.
    esd_df = esd_df.drop(["Doc_type"], axis=1)
    esd_df['Doc_type'] = esd_array

    # Binary-encode the categorical columns one after another.
    df_dt = esd_df.copy()
    for column in ('Ticket_Category', 'Error_type', 'Error_detail', 'Doc_type'):
        df_dt = ce.BinaryEncoder(cols=[column]).fit_transform(df_dt)

    # Next step is creating training and testing datasets:
    x = df_dt.drop(['Resolution'], axis='columns')
    y = df_dt['Resolution']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

    rf1 = RandomForestClassifier(criterion='entropy', n_estimators=100, max_features=3,
                                 oob_score=True, bootstrap=True, n_jobs=-1, random_state=1)
    # Model fit
    rf1.fit(x_train, y_train)

    # Feature importances. The labels come from the training columns
    # themselves instead of a hard-coded list of positions, so they always
    # line up with `importances`.
    features = x_train.columns
    importances = rf1.feature_importances_
    indices = np.argsort(importances)
    plt.figure(1)
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), features[indices])
    plt.xlabel('Relative Importance')

    params = {
        'criterion': ['gini', 'entropy'],
        'n_estimators': [50],
        'max_features': [2, 3, 4, 5, 6, 7, 8],
    }
    rf_gridcv = GridSearchCV(estimator=rf1, cv=5, param_grid=params, scoring='accuracy')
    rf_gridcv.fit(x_train, y_train)

    # Persist the tuned model and read it back; the context managers make
    # sure both file handles are closed. The file loaded here is the one
    # written just above.
    with open('model.pkl', 'wb') as model_file:
        pickle.dump(rf_gridcv, model_file)
    with open('model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    prediction = model.predict(x_test.head(1))
    print(prediction)
    print("***********************************************")

    # tolist() converts numpy scalars to plain Python values for JSON.
    return jsonify({'prediction': prediction.tolist()})
# Quick look at the data.
df.describe().transpose()
df.info()
df.describe(include='O')

# Count missing values
df.isna().sum()

# Drop unwanted column
df = df.drop(['Loan_ID'], axis=1)

# Impute the categorical variables.
from sklearn_pandas import CategoricalImputer
imputer = CategoricalImputer()
for column in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
    # value_counts(dropna=False) reports the NaN count separately.
    df[column].value_counts(dropna=False)
    df[column] = imputer.fit_transform(df[column])
df.isna().sum()

# Only numeric data is left to impute - starting with LoanAmount.
df['LoanAmount'].isna().sum()
df['LoanAmount'].describe()
from sklearn.impute import SimpleImputer
# Mean imputation: missing entries of dum_cars_miss are replaced by the
# column mean; the array result is wrapped back into a DataFrame that
# reuses the original column names.
imp = SimpleImputer(strategy='mean')
carsImputed = imp.fit_transform(dum_cars_miss)
df_carsImputed = pd.DataFrame(carsImputed, columns= dum_cars_miss.columns)
# Compare the shapes of the input, the imputed array and the new DataFrame.
dum_cars_miss.shape
carsImputed.shape
df_carsImputed.shape
# Categorical Imputing
# Default settings on a small example array that ends with a missing value.
from sklearn_pandas import CategoricalImputer
data = np.array(['a', 'b', 'b', np.nan], dtype=object)
imputer = CategoricalImputer()
imputer.fit_transform(data)
# Same data, this time with the 'constant' strategy and the fill value "Baby".
from sklearn_pandas import CategoricalImputer
data = np.array(['a', 'b', 'b', np.nan], dtype=object)
imputer = CategoricalImputer(strategy='constant',fill_value="Baby")
imputer.fit_transform(data)
import numpy as np
# Load the milk dataset, using its first column as the index.
milk = pd.read_csv("F:/Python Material/Python Course/Datasets/milk.csv",index_col=0)
milk.head()
# Mean and standard deviation of the milk data.
np.mean(milk), np.std(milk)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
class Preprocessor:
    """
    This class shall be used to clean and transform the data before training.

    Progress and failures are reported through
    ``logger_object.log(file_object, message)``.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object

    def remove_columns(self, data, columns):
        """
        Method Name: remove_columns
        Description: This method removes the given columns from a pandas dataframe.
        Output: A pandas DataFrame after removing the specified columns.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(
            self.file_object,
            'Entered the remove_columns method of the Preprocessor class')
        self.data = data
        self.columns = columns
        try:
            # drop the labels specified in the columns
            self.useful_data = self.data.drop(labels=self.columns, axis=1)
            self.logger_object.log(
                self.file_object,
                'Column removal Successful.Exited the remove_columns method of the Preprocessor class'
            )
            return self.useful_data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in remove_columns method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Column removal Unsuccessful. Exited the remove_columns method of the Preprocessor class'
            )
            # Same exception type as before, now chained to the real cause.
            raise Exception() from e

    def separate_label_feature(self, data, label_column_name):
        """
        Method Name: separate_label_feature
        Description: This method separates the features and a Label Columns.
        Output: Returns two separate Dataframes, one containing features and the other containing Labels.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(
            self.file_object,
            'Entered the separate_label_feature method of the Preprocessor class'
        )
        try:
            # drop the columns specified and separate the feature columns
            self.X = data.drop(labels=label_column_name, axis=1)
            # Filter the Label columns
            self.Y = data[label_column_name]
            self.logger_object.log(
                self.file_object,
                'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class'
            )
            return self.X, self.Y
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in separate_label_feature method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class'
            )
            raise Exception() from e

    def dropUnnecessaryColumns(self, data, columnNameList):
        """
        Method Name: dropUnnecessaryColumns
        Description: This method drops the unwanted columns as discussed in EDA section.
        Output: A new DataFrame without the listed columns.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        data = data.drop(columnNameList, axis=1)
        return data

    def replaceInvalidValuesWithNull(self, data):
        """
        Method Name: replaceInvalidValuesWithNull
        Description: This method replaces invalid values i.e. '?' with null, as discussed in EDA.
        Output: The passed DataFrame, with the affected columns reassigned.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        for column in data.columns:
            count = data[column][data[column] == '?'].count()
            if count != 0:
                data[column] = data[column].replace('?', np.nan)
        return data

    def is_null_present(self, data):
        """
        Method Name: is_null_present
        Description: This method checks whether there are null values present in the pandas Dataframe or not.
        Output: Returns True if null values are present in the DataFrame, False if they are not present and
                returns the list of columns for which null values are present.
                When nulls are present, the per-column counts are also written to
                'preprocessing_data/null_values.csv'.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(
            self.file_object,
            'Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            # check for the count of null values per column
            self.null_counts = data.isna().sum()
            # Pair names and counts positionally; integer indexing into the
            # label-indexed Series (null_counts[i]) is ambiguous and fails on
            # recent pandas.
            self.cols_with_missing_values = [
                column
                for column, missing in zip(self.cols, self.null_counts)
                if missing > 0
            ]
            self.null_present = bool(self.cols_with_missing_values)
            if self.null_present:
                # storing the null column information to file
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(
                    self.null_counts)
                self.dataframe_with_null.to_csv(
                    'preprocessing_data/null_values.csv')
            self.logger_object.log(
                self.file_object,
                'Finding missing values is a success.Data written to the null values file. Exited the is_null_present method of the Preprocessor class'
            )
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in is_null_present method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Finding missing values failed. Exited the is_null_present method of the Preprocessor class'
            )
            raise Exception() from e

    def encodeCategoricalValues(self, data):
        """
        Method Name: encodeCategoricalValues
        Description: This method encodes all the categorical values in the training set.
                     The "class" column is mapped to 1 ('p') / 2 ('e') on the passed
                     DataFrame; every other column is expanded into dummy columns.
        Output: A Dataframe which has all the categorical values encoded.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        data["class"] = data["class"].map({'p': 1, 'e': 2})

        feature_columns = list(data.drop(['class'], axis=1).columns)
        if not feature_columns:
            return data
        # One call encodes every feature column; the resulting column order
        # matches encoding them one at a time.
        return pd.get_dummies(data, columns=feature_columns)

    def encodeCategoricalValuesPrediction(self, data):
        """
        Method Name: encodeCategoricalValuesPrediction
        Description: This method encodes all the categorical values in the prediction set
                     by expanding every column into dummy columns.
        Output: A Dataframe which has all the categorical values encoded.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        all_columns = list(data.columns)
        if not all_columns:
            return data
        return pd.get_dummies(data, columns=all_columns)

    # NOTE: handleImbalanceDataset(X, Y), which oversampled the data with
    # RandomOverSampler().fit_sample(X, Y), is currently disabled.

    def impute_missing_values(self, data, cols_with_missing_values):
        """
        Method Name: impute_missing_values
        Description: This method replaces all the missing values in the given columns of the
                     Dataframe using a CategoricalImputer. The columns are overwritten on
                     the passed DataFrame.
        Output: A Dataframe which has all the missing values imputed.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(
            self.file_object,
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self.logger_object.log(
                self.file_object,
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in impute_missing_values method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
            )
            raise Exception() from e

    def get_columns_with_zero_std_deviation(self, data):
        """
        Method Name: get_columns_with_zero_std_deviation
        Description: This method finds out the columns which have a standard deviation of zero,
                     based on the 'std' row of ``data.describe()``.
        Output: List of the columns with standard deviation of zero
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(
            self.file_object,
            'Entered the get_columns_with_zero_std_deviation method of the Preprocessor class'
        )
        self.columns = data.columns
        self.data_n = data.describe()
        self.col_to_drop = []
        try:
            for x in self.columns:
                # check if standard deviation is zero
                if self.data_n[x]['std'] == 0:
                    self.col_to_drop.append(x)
            self.logger_object.log(
                self.file_object,
                'Column search for Standard Deviation of Zero Successful. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class'
            )
            return self.col_to_drop
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in get_columns_with_zero_std_deviation method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Column search for Standard Deviation of Zero Failed. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class'
            )
            raise Exception() from e
class Preprocessor:
    """
    This class shall be used to clean and transform the data before training.

    Every public method logs its entry, its success or failure, and on
    failure raises a generic ``Exception`` chained to the original error.
    """

    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object

    def _log(self, message):
        """Write one message to the log file through the logger object."""
        self.logger_object.log(self.file_object, message)

    def _log_failure(self, method_name, failure_note, error):
        """Write the two standard failure lines for ``method_name``."""
        self._log('Exception occured in ' + method_name +
                  ' method of the Preprocessor class. Exception message: ' +
                  str(error))
        self._log(failure_note + ' Exited the ' + method_name +
                  ' method of the Preprocessor class')

    def remove_unwanted_spaces(self, data):
        """
        Method Name: remove_unwanted_spaces
        Description: Strips leading/trailing whitespace from every
                     object-dtype column of a pandas DataFrame.
        Output: A new pandas DataFrame after removing the spaces.
        On Failure: Raise Exception
        """
        self._log(
            'Entered the remove_unwanted_spaces method of the Preprocessor class'
        )
        self.data = data
        try:
            # Only object-dtype columns expose the .str accessor used here.
            self.df_without_spaces = self.data.apply(
                lambda x: x.str.strip() if x.dtype == "object" else x)
            self._log(
                'Unwanted spaces removal Successful.Exited the remove_unwanted_spaces method of the Preprocessor class'
            )
            return self.df_without_spaces
        except Exception as e:
            self._log_failure('remove_unwanted_spaces',
                              'unwanted space removal Unsuccessful.', e)
            raise Exception(str(e)) from e

    def remove_columns(self, data, columns):
        """
        Method Name: remove_columns
        Description: This method removes the given columns from a pandas dataframe.
        Output: A pandas DataFrame after removing the specified columns.
        On Failure: Raise Exception
        """
        self._log('Entered the remove_columns method of the Preprocessor class')
        self.data = data
        self.columns = columns
        try:
            # drop() returns a new frame; the caller's frame is left intact.
            self.useful_data = self.data.drop(labels=self.columns, axis=1)
            self._log(
                'Column removal Successful.Exited the remove_columns method of the Preprocessor class'
            )
            return self.useful_data
        except Exception as e:
            self._log_failure('remove_columns', 'Column removal Unsuccessful.',
                              e)
            raise Exception(str(e)) from e

    def separate_label_feature(self, data, label_column_name):
        """
        Method Name: separate_label_feature
        Description: This method separates the features and the label column.
        Output: Returns (X, Y): the frame without the label column, and the
                label column itself.
        On Failure: Raise Exception
        """
        self._log(
            'Entered the separate_label_feature method of the Preprocessor class'
        )
        try:
            self.X = data.drop(labels=label_column_name, axis=1)
            self.Y = data[label_column_name]
            self._log(
                'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class'
            )
            return self.X, self.Y
        except Exception as e:
            self._log_failure('separate_label_feature',
                              'Label Separation Unsuccessful.', e)
            raise Exception(str(e)) from e

    def is_null_present(self, data):
        """
        Method Name: is_null_present
        Description: This method checks whether there are null values present
                     in the pandas Dataframe or not. When nulls exist, the
                     per-column counts are written to
                     'preprocessing_data/null_values.csv'.
        Output: Returns (null_present, cols_with_missing_values): a bool and
                the list of columns that contain at least one null.
        On Failure: Raise Exception
        """
        import os

        self._log('Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            self.null_counts = data.isna().sum()
            # Iterate label/count pairs: positional `null_counts[i]` on a
            # label-indexed Series is deprecated and fails on newer pandas.
            self.cols_with_missing_values = [
                col for col, count in self.null_counts.items() if count > 0
            ]
            self.null_present = bool(self.cols_with_missing_values)
            if self.null_present:
                self.dataframe_with_null = pd.DataFrame({
                    'columns': list(data.columns),
                    'missing values count': self.null_counts.values,
                })
                report_path = 'preprocessing_data/null_values.csv'
                # Create the report directory so a fresh checkout does not fail.
                os.makedirs(os.path.dirname(report_path), exist_ok=True)
                self.dataframe_with_null.to_csv(report_path)
            self._log(
                'Finding missing values is a success.Data written to the null values file. Exited the is_null_present method of the Preprocessor class'
            )
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self._log_failure('is_null_present',
                              'Finding missing values failed.', e)
            raise Exception(str(e)) from e

    def impute_missing_values(self, data, cols_with_missing_values):
        """
        Method Name: impute_missing_values
        Description: Replaces the missing values of the listed columns using
                     CategoricalImputer (refit per column). The columns are
                     overwritten on the DataFrame that was passed in.
        Output: The same DataFrame with the listed columns imputed.
        On Failure: Raise Exception
        """
        self._log(
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self._log(
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.data
        except Exception as e:
            self._log_failure('impute_missing_values',
                              'Imputing missing values failed.', e)
            raise Exception(str(e)) from e

    def scale_numerical_columns(self, data):
        """
        Method Name: scale_numerical_columns
        Description: Scales the int64 columns using the Standard scaler.
                     Columns of any other dtype are not part of the result.
        Output: A dataframe holding only the scaled int64 columns, indexed
                like the input.
        On Failure: Raise Exception
        """
        self._log(
            'Entered the scale_numerical_columns method of the Preprocessor class'
        )
        self.data = data
        try:
            self.num_df = self.data.select_dtypes(include=['int64']).copy()
            self.scaler = StandardScaler()
            self.scaled_data = self.scaler.fit_transform(self.num_df)
            # Keep the input index so the result aligns with frames derived
            # from the same data (e.g. the encoded categorical columns).
            self.scaled_num_df = pd.DataFrame(data=self.scaled_data,
                                              columns=self.num_df.columns,
                                              index=self.num_df.index)
            self._log(
                'scaling for numerical values successful. Exited the scale_numerical_columns method of the Preprocessor class'
            )
            return self.scaled_num_df
        except Exception as e:
            self._log_failure('scale_numerical_columns',
                              'scaling for numerical columns Failed.', e)
            raise Exception(str(e)) from e

    def encode_categorical_columns(self, data):
        """
        Method Name: encode_categorical_columns
        Description: Dummy-encodes the object-dtype columns, dropping the
                     first level of each column.
        Output: only the columns with categorical values converted to numerical values
        On Failure: Raise Exception
        """
        self._log(
            'Entered the encode_categorical_columns method of the Preprocessor class'
        )
        try:
            self.cat_df = data.select_dtypes(include=['object']).copy()
            cat_columns = list(self.cat_df.columns)
            if cat_columns:
                # One call encodes every column; the resulting column order
                # matches encoding them one after another.
                self.cat_df = pd.get_dummies(self.cat_df,
                                             columns=cat_columns,
                                             prefix=cat_columns,
                                             drop_first=True)
            self._log(
                'encoding for categorical values successful. Exited the encode_categorical_columns method of the Preprocessor class'
            )
            return self.cat_df
        except Exception as e:
            self._log_failure('encode_categorical_columns',
                              'encoding for categorical columns Failed.', e)
            raise Exception(str(e)) from e

    def handle_imbalanced_dataset(self, x, y):
        """
        Method Name: handle_imbalanced_dataset
        Description: Balances the dataset by random over-sampling.
        Output: new balanced feature and target columns
        On Failure: Raise Exception
        """
        self._log(
            'Entered the handle_imbalanced_dataset method of the Preprocessor class'
        )
        try:
            self.rdsmple = RandomOverSampler()
            # `fit_sample` was removed from imbalanced-learn in favour of
            # `fit_resample`; fall back only for releases that predate it.
            resample = getattr(self.rdsmple, 'fit_resample', None)
            if resample is None:
                resample = self.rdsmple.fit_sample
            self.x_sampled, self.y_sampled = resample(x, y)
            self._log(
                'dataset balancing successful. Exited the handle_imbalanced_dataset method of the Preprocessor class'
            )
            return self.x_sampled, self.y_sampled
        except Exception as e:
            self._log_failure('handle_imbalanced_dataset',
                              'dataset balancing Failed.', e)
            raise Exception(str(e)) from e
"""Deal With Missing Data

The missingno library provides a neat way to showcase which variables have missing data. This is done below using a bar chart.
I will then proceed to use Pandas fillna method to fill the two columns that have missing data (Item_Weight, Outlet_Size)
"""
msno.bar(train_data)
msno.bar(test_data)

# Fill Item_Weight with each set's own mean. The result is assigned back to
# the column: calling fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas copy-on-write.
train_data['Item_Weight'] = train_data['Item_Weight'].fillna(
    train_data['Item_Weight'].mean())
test_data['Item_Weight'] = test_data['Item_Weight'].fillna(
    test_data['Item_Weight'].mean())

# Outlet_Size is categorical, so it is imputed with CategoricalImputer.
# The imputer is refit on each set separately, as before.
outlet_size_tr = train_data['Outlet_Size']
outlet_size_ts = test_data['Outlet_Size']
imputer1 = CategoricalImputer()
outlet_size_tr = imputer1.fit_transform(outlet_size_tr)
outlet_size_ts = imputer1.fit_transform(outlet_size_ts)

# Replace the original column with the imputed one at column position 8.
train_data = train_data.drop(['Outlet_Size'], axis=1)
train_data.insert(8, 'Outlet_Size', outlet_size_tr)
test_data = test_data.drop(['Outlet_Size'], axis=1)
test_data.insert(8, 'Outlet_Size', outlet_size_ts)

# Let's see if there are any columns we can drop
# Correlation is only defined for numeric columns; selecting them explicitly
# avoids the error DataFrame.corr() raises on string columns in pandas >= 2.
cor = train_data.select_dtypes(include='number').corr()
cor["Item_Outlet_Sales"].sort_values(ascending=False)
# The year that an outlet was established has a very low correlation figure
def fill_empty(frame):
    """Return ``frame`` with every column passed through a CategoricalImputer.

    A single imputer instance is shared and refit on each column in turn
    (``apply`` with ``axis=0`` hands the function one column at a time).
    """
    column_imputer = CategoricalImputer()

    def impute_column(column):
        return column_imputer.fit_transform(column)

    return frame.apply(impute_column, axis=0)
# Visual and tabular overview of missing values.
sns.heatmap(df_vis.isnull(), cbar=False)
df_vis.isnull().sum()
df_vis.isna().sum()

# HandsetPrice uses the string 'Unknown' for missing prices. Swap it for a
# -1 placeholder so the column can be cast to int, then turn the placeholder
# into a real missing value.
df_vis['HandsetPrice'] = df_vis['HandsetPrice'].replace('Unknown', -1)
df_vis['HandsetPrice']
df_vis['HandsetPrice'] = df_vis['HandsetPrice'].astype('int')
# np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
df_vis['HandsetPrice'] = df_vis['HandsetPrice'].replace(-1, np.nan)
df_vis['HandsetPrice']
df_vis.Churn.value_counts()

# Missing values imputation
temp = df_vis
# Numeric columns: fill with the column mean. numeric_only=True keeps
# mean() from failing on the string columns.
temp = temp.fillna(temp.mean(numeric_only=True))
imputer = CategoricalImputer(missing_values='NaN', strategy='most_frequent')
# NOTE(review): the imputed ServiceArea is not assigned back, so this call
# has no effect on `temp`; the next line fills every remaining gap anyway.
imputer.fit_transform(temp['ServiceArea'])
# Remaining columns: fill with each column's most frequent value.
temp = temp.apply(lambda x: x.fillna(x.value_counts().index[0]))
temp.isna().sum()

# Statistical analysis: chi-square test of independence between every
# non-numeric column and Churn.
nr, nc = temp.shape
for j in range(nc):
    # is_numeric_dtype replaces the `dtype != np.number` comparison, whose
    # result depended on the installed NumPy version.
    if not pd.api.types.is_numeric_dtype(temp.iloc[:, j]):
        xx = temp.iloc[:, j]
        yy = temp['Churn']
        ct = pd.crosstab(xx, yy)
        ch = chi2_contingency(ct)
        print('Chisquare result for', temp.columns[j], 'is ', ch)
    else:
        print('Not a category')
def doprediction():
    """Prepare donor/recipient data from the request and return match scores.

    Reads a JSON body from ``request.data`` with keys ``meldrange``,
    ``donor`` and ``allrecip`` (each table is a list of rows whose first row
    is the header). The tables are round-tripped through CSV files under
    ``datafile/``, imputed, optionally filtered to a 20-point MELD window
    below ``meldrange`` (skipped when ``meldrange`` is 200), min-max scaled
    and written back to CSV. The ``prediction`` module is then asked for its
    scores, which are returned as a JSON string with keys ``match`` and
    ``predict``.
    """
    donor_csv = 'datafile/donorfile.csv'
    recip_csv = 'datafile/recipfile.csv'
    recip_id_csv = 'datafile/recipidfile.csv'
    score_col = 'FINAL_MELD_PELD_LAB_SCORE'

    def split_by_dtype(frame):
        # (object-dtype columns, all other columns)
        return (frame.select_dtypes(include=['object']),
                frame.select_dtypes(exclude=['object']))

    def impute_categorical(frame):
        filled = CategoricalImputer().fit_transform(np.array(frame))
        return pd.DataFrame(filled, columns=frame.columns)

    def impute_numeric(frame):
        # NOTE(review): sklearn.preprocessing.Imputer was removed in
        # scikit-learn 0.22; sklearn.impute.SimpleImputer is its replacement.
        mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        mean_imputer.fit(frame)
        return pd.DataFrame(mean_imputer.transform(frame),
                            columns=frame.columns)

    def min_max_scale(frame):
        names = frame.columns
        scaled = preprocessing.MinMaxScaler().fit_transform(frame)
        return pd.DataFrame(scaled, columns=names)

    def within_meld_window(frame):
        scores = frame[score_col]
        return frame.loc[(scores < meldrange) & (scores >= meldrange - 20)]

    payload = json.loads(request.data)
    meldrange = float(payload["meldrange"])
    donor_rows = payload["donor"]
    recip_rows = payload["allrecip"]

    # First row of each table is the header, the rest are data rows.
    raw_donor = pd.DataFrame(data=donor_rows[1:], columns=donor_rows[0])
    raw_recip = pd.DataFrame(data=recip_rows[1:], columns=recip_rows[0])

    # Round-trip through CSV; the frames are re-read with read_csv below.
    silentremove(donor_csv)
    silentremove(recip_csv)
    raw_donor.to_csv(donor_csv, encoding='utf-8')
    raw_recip.to_csv(recip_csv, encoding='utf-8')

    # start to impute --------------------------------------
    donor_df = pd.read_csv(donor_csv, index_col=0)
    recipient_df = pd.read_csv(recip_csv, index_col=0)
    id_df = pd.DataFrame(recipient_df[['recipient_id', score_col]])

    X_cf_r, X_ncf_r = split_by_dtype(recipient_df)
    X_cf_d, X_ncf_d = split_by_dtype(donor_df)

    X_cf_r = impute_categorical(X_cf_r)
    X_cf_d = impute_categorical(X_cf_d)
    X_ncf_r = impute_numeric(X_ncf_r)
    X_ncf_d = impute_numeric(X_ncf_d)

    recipient_df = pd.merge(X_ncf_r, X_cf_r, left_index=True, right_index=True)

    if meldrange != 200:
        id_df = within_meld_window(id_df)
        recipient_df = within_meld_window(recipient_df)

    # Split again so both halves reflect the (possibly filtered) recipients.
    X_cf_r, X_ncf_r = split_by_dtype(recipient_df)

    X_ncf_d = min_max_scale(X_ncf_d)
    X_ncf_r = min_max_scale(X_ncf_r)
    # Scaling resets the index; restore it so the merge lines rows up again.
    X_ncf_r.index = X_cf_r.index

    recipient_df = pd.merge(X_ncf_r, X_cf_r, left_index=True, right_index=True)
    print("recipdf", recipient_df)
    donor_df = pd.merge(X_ncf_d, X_cf_d, left_index=True, right_index=True)

    for stale in (donor_csv, recip_csv, recip_id_csv):
        silentremove(stale)
    donor_df.to_csv(donor_csv, encoding='utf-8')
    print("meldrange", meldrange)

    id_df = pd.DataFrame(id_df['recipient_id'], columns=['recipient_id'])
    recipient_df.to_csv(recip_csv, encoding='utf-8')
    id_df.to_csv(recip_id_csv, encoding='utf-8')

    # Imported here, after the CSV files have been written.
    import prediction
    match_score = prediction.matching()
    predict_score = prediction.predictscore()
    return json.dumps({'match': match_score, 'predict': predict_score})
class Preprocessor:
    """
    This class shall be used to clean and transform the data before training.

    Every public method logs its entry, its success or failure, and on
    failure raises a generic ``Exception`` chained to the original error.
    """

    # Columns standardised by scale_numerical_columns.
    _NUMERIC_COLUMNS = [
        'months_as_customer', 'policy_deductable', 'umbrella_limit',
        'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
        'number_of_vehicles_involved', 'bodily_injuries', 'witnesses',
        'injury_claim', 'property_claim', 'vehicle_claim'
    ]

    # Fixed value maps applied by encode_categorical_columns, in order.
    _VALUE_MAPS = (
        ('policy_csl', {'100/300': 1, '250/500': 2.5, '500/1000': 5}),
        ('insured_education_level', {
            'JD': 1, 'High School': 2, 'College': 3, 'Masters': 4,
            'Associate': 5, 'MD': 6, 'PhD': 7
        }),
        ('incident_severity', {
            'Trivial Damage': 1, 'Minor Damage': 2, 'Major Damage': 3,
            'Total Loss': 4
        }),
        ('insured_sex', {'FEMALE': 0, 'MALE': 1}),
        ('property_damage', {'NO': 0, 'YES': 1}),
        ('police_report_available', {'NO': 0, 'YES': 1}),
    )

    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object

    def _log(self, message):
        """Write one message to the log file through the logger object."""
        self.logger_object.log(self.file_object, message)

    def _log_failure(self, method_name, failure_note, error):
        """Write the two standard failure lines for ``method_name``."""
        self._log('Exception occured in ' + method_name +
                  ' method of the Preprocessor class. Exception message: ' +
                  str(error))
        self._log(failure_note + ' Exited the ' + method_name +
                  ' method of the Preprocessor class')

    def remove_unwanted_spaces(self, data):
        """
        Method Name: remove_unwanted_spaces
        Description: Strips leading/trailing whitespace from every
                     object-dtype column of a pandas DataFrame.
        Output: A new pandas DataFrame after removing the spaces.
        On Failure: Raise Exception
        """
        self._log(
            'Entered the remove_unwanted_spaces method of the Preprocessor class'
        )
        self.data = data
        try:
            # Only object-dtype columns expose the .str accessor used here.
            self.df_without_spaces = self.data.apply(
                lambda x: x.str.strip() if x.dtype == "object" else x)
            self._log(
                'Unwanted spaces removal Successful.Exited the remove_unwanted_spaces method of the Preprocessor class'
            )
            return self.df_without_spaces
        except Exception as e:
            self._log_failure('remove_unwanted_spaces',
                              'unwanted space removal Unsuccessful.', e)
            raise Exception(str(e)) from e

    def remove_columns(self, data, columns):
        """
        Method Name: remove_columns
        Description: This method removes the given columns from a pandas dataframe.
        Output: A pandas DataFrame after removing the specified columns.
        On Failure: Raise Exception
        """
        self._log('Entered the remove_columns method of the Preprocessor class')
        self.data = data
        self.columns = columns
        try:
            # drop() returns a new frame; the caller's frame is left intact.
            self.useful_data = self.data.drop(labels=self.columns, axis=1)
            self._log(
                'Column removal Successful.Exited the remove_columns method of the Preprocessor class'
            )
            return self.useful_data
        except Exception as e:
            self._log_failure('remove_columns', 'Column removal Unsuccessful.',
                              e)
            raise Exception(str(e)) from e

    def separate_label_feature(self, data, label_column_name):
        """
        Method Name: separate_label_feature
        Description: This method separates the features and the label column.
        Output: Returns (X, Y): the frame without the label column, and the
                label column itself.
        On Failure: Raise Exception
        """
        self._log(
            'Entered the separate_label_feature method of the Preprocessor class'
        )
        try:
            self.X = data.drop(labels=label_column_name, axis=1)
            self.Y = data[label_column_name]
            self._log(
                'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class'
            )
            return self.X, self.Y
        except Exception as e:
            self._log_failure('separate_label_feature',
                              'Label Separation Unsuccessful.', e)
            raise Exception(str(e)) from e

    def is_null_present(self, data):
        """
        Method Name: is_null_present
        Description: This method checks whether there are null values present
                     in the pandas Dataframe or not. When nulls exist, the
                     per-column counts are written to
                     'preprocessing_data/null_values.csv'.
        Output: Returns (null_present, cols_with_missing_values): a bool and
                the list of columns that contain at least one null.
        On Failure: Raise Exception
        """
        import os

        self._log('Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            self.null_counts = data.isna().sum()
            # Iterate label/count pairs: positional `null_counts[i]` on a
            # label-indexed Series is deprecated and fails on newer pandas.
            self.cols_with_missing_values = [
                col for col, count in self.null_counts.items() if count > 0
            ]
            self.null_present = bool(self.cols_with_missing_values)
            if self.null_present:
                self.dataframe_with_null = pd.DataFrame({
                    'columns': list(data.columns),
                    'missing values count': self.null_counts.values,
                })
                report_path = 'preprocessing_data/null_values.csv'
                # Create the report directory so a fresh checkout does not fail.
                os.makedirs(os.path.dirname(report_path), exist_ok=True)
                self.dataframe_with_null.to_csv(report_path)
            self._log(
                'Finding missing values is a success.Data written to the null values file. Exited the is_null_present method of the Preprocessor class'
            )
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self._log_failure('is_null_present',
                              'Finding missing values failed.', e)
            raise Exception(str(e)) from e

    def impute_missing_values(self, data, cols_with_missing_values):
        """
        Method Name: impute_missing_values
        Description: Replaces the missing values of the listed columns using
                     CategoricalImputer (refit per column). The columns are
                     overwritten on the DataFrame that was passed in.
        Output: The same DataFrame with the listed columns imputed.
        On Failure: Raise Exception
        """
        self._log(
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self._log(
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.data
        except Exception as e:
            self._log_failure('impute_missing_values',
                              'Imputing missing values failed.', e)
            raise Exception(str(e)) from e

    def scale_numerical_columns(self, data):
        """
        Method Name: scale_numerical_columns
        Description: Scales a fixed list of numerical columns using the
                     Standard scaler. The input frame is not modified.
        Output: A new dataframe with the scaled columns first, followed by
                the remaining columns unchanged.
        On Failure: Raise Exception
        """
        self._log(
            'Entered the scale_numerical_columns method of the Preprocessor class'
        )
        self.data = data
        try:
            # Selected inside the try so a missing column is logged too.
            self.num_df = self.data[list(self._NUMERIC_COLUMNS)]
            self.scaler = StandardScaler()
            self.scaled_data = self.scaler.fit_transform(self.num_df)
            self.scaled_num_df = pd.DataFrame(data=self.scaled_data,
                                              columns=self.num_df.columns,
                                              index=self.data.index)
            # Non-inplace drop: the caller's frame keeps its columns.
            unscaled = self.data.drop(columns=self.scaled_num_df.columns)
            self.data = pd.concat([self.scaled_num_df, unscaled], axis=1)
            self._log(
                'scaling for numerical values successful. Exited the scale_numerical_columns method of the Preprocessor class'
            )
            return self.data
        except Exception as e:
            self._log_failure('scale_numerical_columns',
                              'scaling for numerical columns Failed.', e)
            raise Exception(str(e)) from e

    def encode_categorical_columns(self, data):
        """
        Method Name: encode_categorical_columns
        Description: Encodes the object-dtype columns. Six columns (plus
                     'fraud_reported' when it is present, i.e. for training
                     data) are mapped to fixed numbers; every other object
                     column is dummy-encoded with its first level dropped.
                     The input frame is not modified.
        Output: A new dataframe with the encoded columns first, followed by
                the non-object columns unchanged.
        On Failure: Raise Exception
        """
        self._log(
            'Entered the encode_categorical_columns method of the Preprocessor class'
        )
        self.data = data
        try:
            self.cat_df = self.data.select_dtypes(include=['object']).copy()
            for column, value_map in self._VALUE_MAPS:
                self.cat_df[column] = self.cat_df[column].map(value_map)
            self.cols_to_drop = [column for column, _ in self._VALUE_MAPS]

            # The label column only exists in training data; test for it
            # explicitly instead of catching every exception.
            if 'fraud_reported' in self.cat_df.columns:
                self.cat_df['fraud_reported'] = self.cat_df[
                    'fraud_reported'].map({'N': 0, 'Y': 1})
                self.cols_to_drop.append('fraud_reported')

            # Using the dummy encoding to encode the remaining categorical
            # columns to numerical ones.
            dummy_columns = list(
                self.cat_df.drop(columns=self.cols_to_drop).columns)
            if dummy_columns:
                self.cat_df = pd.get_dummies(self.cat_df,
                                             columns=dummy_columns,
                                             prefix=dummy_columns,
                                             drop_first=True)

            # Non-inplace drop: the caller's frame keeps its columns.
            non_categorical = self.data.drop(
                columns=self.data.select_dtypes(include=['object']).columns)
            self.data = pd.concat([self.cat_df, non_categorical], axis=1)
            self._log(
                'encoding for categorical values successful. Exited the encode_categorical_columns method of the Preprocessor class'
            )
            return self.data
        except Exception as e:
            self._log_failure('encode_categorical_columns',
                              'encoding for categorical columns Failed.', e)
            raise Exception(str(e)) from e

    def handle_imbalanced_dataset(self, x, y):
        """
        Method Name: handle_imbalanced_dataset
        Description: Balances the dataset by random over-sampling.
        Output: new balanced feature and target columns
        On Failure: Raise Exception
        """
        self._log(
            'Entered the handle_imbalanced_dataset method of the Preprocessor class'
        )
        try:
            self.rdsmple = RandomOverSampler()
            # `fit_sample` was removed from imbalanced-learn in favour of
            # `fit_resample`; fall back only for releases that predate it.
            resample = getattr(self.rdsmple, 'fit_resample', None)
            if resample is None:
                resample = self.rdsmple.fit_sample
            self.x_sampled, self.y_sampled = resample(x, y)
            self._log(
                'dataset balancing successful. Exited the handle_imbalanced_dataset method of the Preprocessor class'
            )
            return self.x_sampled, self.y_sampled
        except Exception as e:
            self._log_failure('handle_imbalanced_dataset',
                              'dataset balancing Failed.', e)
            raise Exception(str(e)) from e
def transform(self, X):
    """Impute every column named in ``config.CAT_FEATURES``.

    Each column gets its own freshly fitted CategoricalImputer. The columns
    are overwritten on ``X`` itself, and ``X`` is returned.
    """
    for feature in config.CAT_FEATURES:
        X[feature] = CategoricalImputer().fit_transform(X[feature])
    return X
def encodeCategoricalValuesPrediction(self,data):
    """
    Method Name: encodeCategoricalValuesPrediction
    Description: This method imputes missing values and encodes the
                 'laundry_options' and 'parking_options' columns of the
                 prediction set as fixed indicator columns
                 (laundry_options_1..4 and parking_options_1..6).
                 'laundry in bldg' and 'attached garage' are the baseline
                 levels (all indicators 0). A value that is not one of the
                 known levels leaves its indicators as NaN.
                 Imputation overwrites columns of the frame passed in.
    Output: A new Dataframe without the two source columns and with the
            indicator columns appended.
    On Failure: Raise Exception

    Written By: Ajinkya Abhang
    Version: 1.0
    Revisions: None
    """
    # Impute the categorical (object-dtype) columns that contain nulls.
    features_nan = [feature for feature in data.columns
                    if data[feature].isnull().sum() > 0 and data[feature].dtypes == 'O']
    if features_nan:
        imputer = CategoricalImputer()
        for cat_feature in features_nan:
            data[cat_feature] = imputer.fit_transform(data[cat_feature])

    # Impute the non-categorical columns that contain nulls. The threshold is
    # "> 0" (it used to be "> 1", which left a column holding exactly one
    # missing value un-imputed).
    numerical_with_nan = [feature for feature in data.columns
                          if data[feature].isnull().sum() > 0 and data[feature].dtypes != 'O']
    if numerical_with_nan:
        imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
        data[numerical_with_nan] = imputer.fit_transform(data[numerical_with_nan])

    # Level -> number of the indicator column set to 1 (0 = baseline level).
    laundry_codes = {
        'laundry in bldg': 0,
        'laundry on site': 1,
        'no laundry on site': 2,
        'w/d hookups': 3,
        'w/d in unit': 4,
    }
    parking_codes = {
        'attached garage': 0,
        'carport': 1,
        'detached garage': 2,
        'no parking': 3,
        'off-street parking': 4,
        'street parking': 5,
        'valet parking': 6,
    }

    dat = data.drop(['laundry_options', 'parking_options'], axis=1)
    # Whole-column assignment replaces the per-row `dat[col][i] = value`
    # chained assignment, which is slow and does not write through under
    # pandas copy-on-write.
    for source, codes, width in (('laundry_options', laundry_codes, 4),
                                 ('parking_options', parking_codes, 6)):
        code = data[source].map(codes)
        known = code.notna()
        for k in range(1, width + 1):
            # float 1.0/0.0 for known levels, NaN for unknown ones
            dat['{}_{}'.format(source, k)] = (code == k).astype(float).where(known)
    return dat
# a seperate feature and we will discard the old one. data['MonthOfPurchase'] = pd.DatetimeIndex(data['PurchDate']).month # Dropping: a) Attributes not providing actual information. # b) Attributes with high missing values. # c) Some of the highly correlated attributes. data.drop(["RefId", "PurchDate", "VehYear", "Model", \ "SubModel", "WheelType", \ "PRIMEUNIT", "AUCGUART"], axis = 1, inplace = True) # Imputing categorical columns with most frequent values.. categorical_feature_mask = data.dtypes == object categorical_cols = data.columns[categorical_feature_mask].tolist() catImputer = CategoricalImputer(strategy='most_frequent') for col in categorical_cols: data[col] = catImputer.fit_transform(data[col]) # Imputing numerical columns with median values. Before imputation, # we have to change the datatype from Float64 to int. numerical_cols = data.columns.drop(categorical_cols).tolist() data[numerical_cols] = data[numerical_cols].fillna(-1) data[numerical_cols] = data[numerical_cols].astype(np.int64) data[numerical_cols] = data[numerical_cols].replace(-1, np.nan) imputer = SimpleImputer(missing_values=np.nan, strategy='median') data[numerical_cols] = imputer.fit_transform(data[numerical_cols]) # One-hot encoding categorical data to dummy attributes.. data = pd.get_dummies(data[categorical_cols]) # Standardizing our data, so as to follow normal distribution with # zero mean and unit variance. This primarily helps when applying