Example #1
def CleanMissingValue(df_Comics):
    """ Clean missing values """
    # Drop columns "EYE", "GSM" and "HAIR" (each has more than 25% missing values)
    df_Comics.drop(["EYE", "GSM", "HAIR"], axis=1, inplace=True)
    # Drop rows with missing "ALIVE" or "YEAR OF FIRST APPEARANCE" (fewer than 4% missing values)
    df_Comics.dropna(subset=["ALIVE", "YEAR OF FIRST APPEARANCE"], inplace=True)
    # Fill "NUMBER OF APPEARANCES" missing values with mean
    appearancesMean = df_Comics["NUMBER OF APPEARANCES"].mean()
    df_Comics["NUMBER OF APPEARANCES"] = df_Comics["NUMBER OF APPEARANCES"].fillna(appearancesMean)
    # Fill "GENDER", "IDENTITY TYPE" and "TEAM" missing values with most frequent value.
    imputer = CategoricalImputer()
    df_Comics["GENDER"] = imputer.fit_transform(df_Comics["GENDER"])
    df_Comics["IDENTITY TYPE"] = imputer.fit_transform(df_Comics["IDENTITY TYPE"])
    df_Comics["TEAM"] = imputer.fit_transform(df_Comics["TEAM"])
    return df_Comics
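CategoricalImputer in these examples comes from the third-party sklearn_pandas package. For reference, a minimal sketch of the same most-frequent imputation using scikit-learn's own SimpleImputer (illustrative data, not part of the original example):

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.DataFrame({"GENDER": ["M", "F", np.nan, "F"]})
imp = SimpleImputer(strategy="most_frequent")
# SimpleImputer expects 2-D input, hence the [["GENDER"]] / ravel() pair
df["GENDER"] = imp.fit_transform(df[["GENDER"]]).ravel()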
Example #2
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.imputer = CategoricalImputer()
        return self

    def transform(self, X):
        age_cats_imputed = pd.Series(
            self.imputer.fit_transform(X.Age_cats.copy())).astype('category')
        sex_imputed = pd.Series(self.imputer.fit_transform(
            X.Sex.copy())).astype('category')
        embarked_imputed = pd.Series(
            self.imputer.fit_transform(X.Embarked.copy())).astype('category')
        X.Sex = sex_imputed.cat.codes
        X.Embarked = embarked_imputed.cat.codes
        X.Age_cats = age_cats_imputed
        return X
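A usage sketch for the transformer above, on illustrative Titanic-style data (the column values are assumptions):

import numpy as np
import pandas as pd

titanic = pd.DataFrame({
    'Age_cats': ['child', np.nan, 'adult', 'adult'],
    'Sex': ['male', 'female', np.nan, 'female'],
    'Embarked': [np.nan, 'S', 'C', 'S'],
})
# NaNs are filled with each column's most frequent value; Sex and
# Embarked additionally come back as integer category codes.
titanic = MostFrequentImputer().fit_transform(titanic)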
Example #3
def clean_impute(df):
    # Consumes a DataFrame and returns a stripped, imputed DataFrame:
    # numeric NaNs are filled with the column mean, then the remaining
    # (categorical) NaNs with each column's most frequent value.
    data = df.fillna(df.mean(numeric_only=True))
    # CategoricalImputer works on one column at a time, so impute per column
    imputer = CategoricalImputer()
    for col in data.columns[data.isna().any()]:
        data[col] = imputer.fit_transform(data[col])
    return data
Example #4
def preprocessing(data_sampled, pca_components=5):
    """
    Perform preprocessing and transformation of the data and return the transformed data:
    1. Remove NaN values, if any
    2. Convert categorical variables to continuous ones
    3. Label encode, then one-hot encode
    4. Perform PCA for dimensionality reduction
    """
    features = {  'UniqueID':0, 'disbursed_amount':1,'asset_cost':2,'ltv':3,'branch_id':4,'supplier_id':5,
                  'manufacturer_id':6,'Current_pincode_ID':7,'Date.of.Birth':8,'Employment.Type':9,'DisbursalDate':10,'State_ID':11,
                  'Employee_code_ID':12,'MobileNo_Avl_Flag':13, 'Aadhar_flag':14,'PAN_flag':15,'VoterID_flag':16,
                  'Driving_flag':17, 'Passport_flag':18,'PERFORM_CNS.SCORE':19, 'PERFORM_CNS.SCORE.DESCRIPTION':20,
                  'PRI.NO.OF.ACCTS':21,'PRI.ACTIVE.ACCTS':22,'PRI.OVERDUE.ACCTS':23,'PRI.CURRENT.BALANCE':24,
                  'PRI.SANCTIONED.AMOUNT':25,'PRI.DISBURSED.AMOUNT':26,'SEC.NO.OF.ACCTS':27,'SEC.ACTIVE.ACCTS':28,
                  'SEC.OVERDUE.ACCTS':29, 'SEC.CURRENT.BALANCE':30, 'SEC.SANCTIONED.AMOUNT':31,'SEC.DISBURSED.AMOUNT':32,
                  'PRIMARY.INSTAL.AMT':33, 'SEC.INSTAL.AMT':34, 'NEW.ACCTS.IN.LAST.SIX.MONTHS':35, 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS':36, 
                  'AVERAGE.ACCT.AGE':37,'CREDIT.HISTORY.LENGTH':38, 'NO.OF_INQUIRIES':39,'loan_default':40 }

    # Drop identifier columns (the remaining NaN values are imputed below)
    data_sampled = data_sampled.drop(['branch_id','supplier_id', 'manufacturer_id','Current_pincode_ID', 'State_ID', 'Employee_code_ID'], axis = 1)

    from sklearn_pandas import CategoricalImputer
    imputer = CategoricalImputer()
    data_sampled['Employment.Type'] = imputer.fit_transform(data_sampled['Employment.Type'])
    
    # Categorical to continuous

    import functools    # Used for mapping a function with more than one argument 
    data_sampled['Date.of.Birth'] = list(map(Utils.toDate, data_sampled['Date.of.Birth']))
    data_sampled['Date.of.Birth'] = list(map(functools.partial(Utils.date_diff, date2 = datetime.datetime.today().date()), data_sampled['Date.of.Birth']))
    
    data_sampled['DisbursalDate'] = list(map(Utils.toDate, data_sampled['DisbursalDate']))
    data_sampled['DisbursalDate'] = list(map(functools.partial(Utils.date_diff, date2 = datetime.datetime.today().date()), data_sampled['DisbursalDate']))
    
    data_sampled['AVERAGE.ACCT.AGE'] = list(map(Utils.total_span, data_sampled['AVERAGE.ACCT.AGE'] ))   #1yrs 10mon = 1*12 +10 = 22
    data_sampled['CREDIT.HISTORY.LENGTH'] = list(map(Utils.total_span, data_sampled['CREDIT.HISTORY.LENGTH'] ))   #1yrs 10mon = 1*12 +10 = 22

    x = data_sampled.iloc[:,0:34].values
    
    # Label encoding categorical values
    encoder = LabelEncoder()
    x[:,5] = encoder.fit_transform(x[:,5])
    x[:,14] = encoder.fit_transform(x[:,14])

    # One-hot encode the label-encoded columns
    # (categorical_features requires scikit-learn < 0.22; see the
    # ColumnTransformer sketch after this example for the modern form)
    hot_encoder = OneHotEncoder(categorical_features=[5,14])
    x = hot_encoder.fit_transform(x).toarray()

    #normalizing data (scaling the data)
#    scale = StandardScaler()
#    x = scale.fit_transform(x)

    # feature selection using random forest
    # feature selection using PCA
#    from sklearn.decomposition import PCA
#    pca = PCA(n_components=pca_components)
#    x = pca.fit_transform(x)

    return (x)
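The categorical_features argument used above was removed in scikit-learn 0.22. A minimal sketch of the modern equivalent with ColumnTransformer, operating on the same array x and the same column indices:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    [("onehot", OneHotEncoder(), [5, 14])],  # encode columns 5 and 14
    remainder="passthrough",                 # keep all other columns as-is
)
x = ct.fit_transform(x)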
Example #5
    def encodeCategoricalValues(self, data):
        """
        Method Name: encodeCategoricalValues
        Description: This method encodes all the categorical values in the training set.
        Output: A DataFrame which has all the categorical values encoded.
        On Failure: Raise Exception

        Written By: Ajinkya Abhang
        Version: 1.0
        Revisions: None
        """

        # We can impute the categorical values like below:
        features_nan = [feature for feature in data.columns if data[feature].isnull().sum() > 0 and data[feature].dtypes == 'O']

        imputer = CategoricalImputer()

        if len(features_nan) != 0:
            for cat_feature in features_nan:
                data[cat_feature] = imputer.fit_transform(data[cat_feature])

        # We can impute the non-categorical values like below:
        numerical_with_nan = [feature for feature in data.columns if
                            data[feature].isnull().sum() > 0 and data[feature].dtypes != 'O']

        if len(numerical_with_nan) != 0:
            imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
            data[numerical_with_nan] = imputer.fit_transform(data[numerical_with_nan])


        # We can use a label encoder for encoding
        labelencoder = LabelEncoder()
        dummy_features = ['laundry_options', 'parking_options']

        for feature in dummy_features:
            data[feature] = labelencoder.fit_transform(data[feature])

        # One-hot encode the same columns (a single call handles both)
        data_df = pd.get_dummies(data, columns=dummy_features, drop_first=True)

        return data_df
Example #6
def imputacion_variable_delegacion(X_train, X_test):
    " This function imputes the 'delegacion_inicio' variable with the mode "

    # For the training set
    X = X_train.delegacion_inicio.values.reshape(X_train.shape[0], 1)
    delegacionInicio_imputer = CategoricalImputer(strategy='most_frequent')
    X_train['delegacion_inicio'] = delegacionInicio_imputer.fit_transform(X)

    # For the test set
    X = X_test.delegacion_inicio.values.reshape(X_test.shape[0], 1)
    X_test['delegacion_inicio'] = delegacionInicio_imputer.transform(X)

    return X_train, X_test
Example #7
def test_default_fill_value_for_constant_strategy(input_type):
    data = ['a', np.nan, 'b', 'b']

    if input_type == 'pd':
        X = pd.Series(data)
    else:
        X = np.asarray(data, dtype=object)

    imputer = CategoricalImputer(strategy='constant')
    Xt = imputer.fit_transform(X)

    assert imputer.fill_ == '?'
    assert (Xt == ['a', imputer.fill_, 'b', 'b']).all()
Example #8
def test_missing_values_param(input_type):

    data = ['x', 'y', 'a_missing', 'y']

    if input_type == 'pd':
        X = pd.Series(data)
    else:
        X = np.asarray(data, dtype=object)

    imp = CategoricalImputer(missing_values='a_missing')
    Xt = imp.fit_transform(X)

    assert (Xt == np.array(['x', 'y', 'y', 'y'])).all()
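Taken together, these tests document the main CategoricalImputer options: the default most-frequent strategy, the 'constant' strategy with its default fill value of '?', and a custom missing-value marker. A combined sketch:

import numpy as np
from sklearn_pandas import CategoricalImputer

data = np.asarray(['a', np.nan, 'b', 'b'], dtype=object)

# Default: replace missing values with the most frequent value
CategoricalImputer().fit_transform(data)                     # ['a' 'b' 'b' 'b']

# Constant strategy: replace with fill_value, which defaults to '?'
CategoricalImputer(strategy='constant').fit_transform(data)  # ['a' '?' 'b' 'b']

# Treat a custom marker, rather than NaN, as missing
data2 = np.asarray(['x', 'y', 'a_missing', 'y'], dtype=object)
CategoricalImputer(missing_values='a_missing').fit_transform(data2)  # ['x' 'y' 'y' 'y']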
Example #9
    def fit_transform(self, df: 'dataframe') -> 'dataframe':
        """
        Fill in missing categorical values using most frequent value
        """

        # instantiate CategoricalImputer
        imputer = CategoricalImputer()

        # impute each column in turn with its most frequent value
        df_filled = df.apply(lambda x: imputer.fit_transform(x), axis=0)

        # return filled dataframe
        return df_filled
Example #10
def test_copy_param(input_type):

    data = ['a', np.nan, 'b', 'a']

    if input_type == 'pd':
        X = pd.Series(data)
    else:
        X = np.asarray(data, dtype=object)

    imp = CategoricalImputer(copy=False)
    Xt = imp.fit_transform(X)

    Xe = np.array(['a', 'a', 'b', 'a'])
    assert (Xt == Xe).all()
    assert (X == Xe).all()
Example #11
def impute_categorical(data):
    data = data.replace('?', np.nan)
    cat_cols = data.select_dtypes(include=object)
    #print('Cat Cols Data')
    #pprint(cat_cols)
    cat_col_names = cat_cols.columns.values
    print('Categorical Columns')
    pprint(cat_col_names)
    partial_data = data.drop(columns=cat_col_names)

    from sklearn_pandas import CategoricalImputer
    ci = CategoricalImputer()
    for col in cat_col_names:
        try:
            col_data = ci.fit_transform(cat_cols[col].values)
            # pd.DataFrame(col_data) gets a fresh RangeIndex, so this concat
            # assumes `data` also arrived with a default RangeIndex
            partial_data = pd.concat(
                [partial_data,
                 pd.DataFrame(col_data, dtype=object)], axis=1)
            #pprint(partial_data)
        except Exception:
            partial_data = pd.concat([partial_data, cat_cols[col]], axis=1)
    return partial_data
Example #12
# =============================================================================

data_missing = dataset.isnull().sum()
print(data_missing)

# Numpy array for imputing missing values
X = dataset.iloc[:, :-1].values

# =============================================================================
## Missing Categorical Values
# =============================================================================
from sklearn_pandas import CategoricalImputer

data = np.array(X[:,8], dtype=object)
imputer = CategoricalImputer()
X[:,8] = imputer.fit_transform(data)
dataset['Outlet_Size'] = X[:,8]

# =============================================================================
# # Imputer for numeric values
# =============================================================================

from sklearn.preprocessing import Imputer  # removed in scikit-learn 0.22; see the SimpleImputer sketch below
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
X[:, 1:2] = imputer.fit_transform(X[:, 1:2])
dataset['Item_Weight'] = X[:,1:2] 
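Since sklearn.preprocessing.Imputer was removed in scikit-learn 0.22, here is a minimal sketch of the same mean imputation with its replacement, SimpleImputer (operating on the same X as above):

import numpy as np
from sklearn.impute import SimpleImputer

# SimpleImputer always works column-wise, so no axis argument is needed
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:2] = imputer.fit_transform(X[:, 1:2])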

# Check values in Item Visibility
dataset.Item_Visibility.value_counts()
# Replace 0 with NaN
dataset['Item_Visibility'].replace(0.000000, np.nan, inplace=True)
plt.show()

##bivariate
sns.boxplot(x='Interest_Rate', y='Total_Accounts', data=Data)
plt.show()

##Impute the data

Data = pd.concat([TrainData, TestData], axis=0)

from sklearn_pandas import CategoricalImputer
imputer = CategoricalImputer()

# imputing the missing values from the column

Data['Home_Owner'] = imputer.fit_transform(Data['Home_Owner'])
Data['Length_Employed'] = imputer.fit_transform(Data['Length_Employed'])
Data['Months_Since_Deliquency'].fillna(0, inplace=True)
Data['Annual_Income'].fillna(Data['Annual_Income'].mean(), inplace=True)

Data['Debt_amount'] = Data['Debt_To_Income'] * (Data['Annual_Income'] / 12)
Data['Debt_amount'].fillna(Data['Debt_amount'].mean(), inplace=True)

cat_df = Data.select_dtypes(include=['object']).copy()
cat_df.columns

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

for column in cat_df.columns:
    cat_df[column] = label_encoder.fit_transform(cat_df[column])
Example #14
esd_df = pd.read_csv('/Users/encoreai/Desktop/new1.csv',
                     encoding='iso-8859-1',
                     sep=',',
                     engine='python')

list(esd_df.columns)

esd_df.shape

#Finding out the null / Nan values in the columns:
# for _ in esd_df.columns:
#     print("The number of null values in:{} == {}".format(_, esd_df[_].isnull().sum()))

esd_array = esd_df['Doc_type'].values
imputer = CategoricalImputer()
imputer.fit_transform(esd_array)

esd_df["Error_detail"].fillna("No detail", inplace=True)
#print(esd_df)
esd_df = esd_df.drop(["Doc_type"], axis=1)
esd_df['Doc_type'] = esd_array
esd = esd_df.copy()

encoder_tc = ce.BinaryEncoder(cols=['Ticket_Category'])
df_tc = encoder_tc.fit_transform(esd)

encoder_et = ce.BinaryEncoder(cols=['Error_type'])
df_et = encoder_et.fit_transform(df_tc)

encoder_ed = ce.BinaryEncoder(cols=['Error_detail'])
df_ed = encoder_ed.fit_transform(df_et)
Example #15
class Preprocessor:
    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object

    def replaceInvalidValuesWithNull(self, data):
        for column in data.columns:
            count = data[column][data[column] == '?'].count()
            if count != 0:
                data[column] = data[column].replace('?', np.NaN)
        return data

    def is_null_present(self, data):
        """
                                Method Name: is_null_present
                                Description: This method checks whether there are null values present in the pandas Dataframe or not.
                                Output: Returns True if null values are present in the DataFrame, False if they are not present and
                                        returns the list of columns for which null values are present.
                                On Failure: Raise Exception

                                Written By: iNeuron Intelligence
                                Version: 1.0
                                Revisions: None

                        """
        self.logger_object.log(
            self.file_object,
            'Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            self.null_counts = data.isna().sum(
            )  # check for the count of null values per column
            for i in range(len(self.null_counts)):
                if self.null_counts[i] > 0:
                    self.null_present = True
                    self.cols_with_missing_values.append(self.cols[i])
            if (self.null_present
                ):  # write the logs to see which columns have null values
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(
                    data.isna().sum())
                self.dataframe_with_null.to_csv(
                    'preprocessing_data/null_values.csv'
                )  # storing the null column information to file
            self.logger_object.log(
                self.file_object,
                'Finding missing values is a success. Data written to the null values file. Exited the is_null_present method of the Preprocessor class'
            )
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in is_null_present method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Finding missing values failed. Exited the is_null_present method of the Preprocessor class'
            )
            raise Exception()

    def impute_missing_values(self, data, cols_with_missing_values):
        self.logger_object.log(
            self.file_object,
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self.logger_object.log(
                self.file_object,
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.data

        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in impute_missing_values method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
            )
            raise Exception()

    def separate_label_feature(self, data, label_column_name):
        self.logger_object.log(
            self.file_object,
            'Entered the separate_label_feature method of the Preprocessor class'
        )
        try:
            self.X = data.drop(labels=label_column_name, axis=1)
            self.Y = data[label_column_name]
            self.logger_object.log(
                self.file_object,
                'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class'
            )
            return self.X, self.Y
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in separate_label_feature method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class'
            )
            raise Exception()
Example #16
def impute_categorical(df, col_name):
    imputer = CategoricalImputer()
    df[col_name] = imputer.fit_transform(df[col_name])
    return df
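A quick usage sketch with illustrative data:

import numpy as np
import pandas as pd

df = pd.DataFrame({'color': ['red', np.nan, 'red', 'blue']})
df = impute_categorical(df, 'color')   # the NaN becomes 'red', the mode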
Example #17
def _impute_data(df: pd.DataFrame,
                 categorical_all: bool = False,
                 categorical_subset: list = None) -> pd.DataFrame:
    """Imputes missing numerical or categorical values if the percentage of rows containing NaN's is > 5%.
    Else, returns a dataframe without those rows.
        Usage:
        -------
        dataframe_no_nan = _impute_data(dataframe_with_nan)
    """

    # try to infer object types, as this will make calculating numeric columns much easier
    df = df.infer_objects()

    # If there are very few missing values (<= 5%), then just drop those rows and return the DataFrame, as
    # it should be enough for the provided plots
    if df.isna().sum().sum() / df.shape[0] <= 0.05:
        return pd.get_dummies(df.dropna())

    catimpute = CategoricalImputer()

    if categorical_all is True and categorical_subset is not None:
        warnings.warn(
            "categorical_all and subset both specified ... using subset and continuing"
        )
        categorical_all = False

    # Try and make dummies for all categorical columns
    if categorical_all:
        likely_categorical_cols = []
        for col in df.columns:
            if is_likely_categorical(df[col]):
                # note: pd.get_dummies returns one column per category, so this
                # single-column assignment only works when one dummy column results
                df[col] = pd.get_dummies(data=df[col])
                df[col] = catimpute.fit_transform(df[col])

                likely_categorical_cols.append(col)
        if len(likely_categorical_cols) > 0:
            # Grammatically correct
            if len(likely_categorical_cols) > 1:
                warnings.warn(
                    "Columns {} are likely categorical, creating dummies. Run with categorical=False (to disable all) or categorical_subset=[column names] to disable warning"
                    .format(likely_categorical_cols))
            else:
                warnings.warn(
                    "Column \"{}\" is likely categorical, creating dummies. Run with categorical=False (to disable all) or categorical_subset=[column names] to disable warning"
                    .format(likely_categorical_cols[0]))

    # Or only make dummies for specified columns
    if categorical_subset is not None:
        for col in categorical_subset:
            # NaN's should be ignored here
            df[col] = pd.get_dummies(data=df[col])
            df[col] = catimpute.fit_transform(df[col])

    df = df.infer_objects()  # infer_objects returns a new frame; keep the result

    for col in df.columns:
        if df[col].isna().sum() > 0:
            if _is_numeric(df[col]):
                # fill using mean TODO: allow this to be specified
                df[col].fillna(df[col].mean(), inplace=True)
            else:
                warnings.warn(
                    "Column \"{}\" cannot be made numeric, dropping and continuing. If this is incorrect, specify it as categorical or transform to a numeric dtype"
                    .format(col))
                df.drop(col, axis=1, inplace=True)

    return df
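The helpers is_likely_categorical and _is_numeric are not shown in this example; hypothetical minimal versions consistent with how they are called:

import pandas as pd

def is_likely_categorical(series: pd.Series, max_unique: int = 20) -> bool:
    # Hypothetical helper: treat low-cardinality columns as categorical.
    return series.nunique(dropna=True) <= max_unique

def _is_numeric(series: pd.Series) -> bool:
    # Hypothetical helper: numeric dtype check via pandas.
    return pd.api.types.is_numeric_dtype(series)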
Example #18
def predict():
  print("__________________________") 

 

  import pandas as pd
  import numpy as np
  import seaborn as  sns
  import matplotlib.pyplot as plt
  from sklearn_pandas import CategoricalImputer
  import os as os
  import category_encoders as ce
  from sklearn.metrics  import confusion_matrix
  from sklearn.model_selection import train_test_split
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.model_selection import GridSearchCV
  from sklearn.metrics import accuracy_score
  from sklearn.metrics import matthews_corrcoef
  from sklearn.externals import joblib  # deprecated; on newer scikit-learn use "import joblib"
  from sklearn.base import BaseEstimator, TransformerMixin
  from sklearn.pipeline import make_pipeline

  import warnings
  warnings.filterwarnings("ignore")


  esd_df = pd.read_csv('/Users/encoreai/Desktop/new1.csv', encoding='iso-8859-1', sep=',', engine='python')

  list(esd_df.columns)

  esd_df.shape

  #Finding out the null / Nan values in the columns:
  # for _ in esd_df.columns:
  #     print("The number of null values in:{} == {}".format(_, esd_df[_].isnull().sum()))

  esd_array = esd_df['Doc_type'].values
  imputer = CategoricalImputer()
  imputer.fit_transform(esd_array)

  esd_df["Error_detail"].fillna("No detail", inplace = True) 
  #print(esd_df)
  esd_df=esd_df.drop(["Doc_type"],axis=1)
  esd_df['Doc_type'] = esd_array
  esd = esd_df.copy()
          
  encoder_tc = ce.BinaryEncoder(cols=['Ticket_Category'])
  df_tc = encoder_tc.fit_transform(esd)
          
  encoder_et = ce.BinaryEncoder(cols=['Error_type'])
  df_et = encoder_et.fit_transform(df_tc)
          
          
  encoder_ed = ce.BinaryEncoder(cols=['Error_detail'])
  df_ed = encoder_ed.fit_transform(df_et)
          
  encoder_dt = ce.BinaryEncoder(cols=['Doc_type'])
  df_dt = encoder_dt.fit_transform(df_ed)

  #Next step is creating training and testing datasets:

  x=df_dt.drop(['Resolution'],axis='columns')
  x.shape

  y=df_dt['Resolution']
  y.shape

  from sklearn.model_selection import train_test_split

  x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1)

  # print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)
  rf1=RandomForestClassifier(criterion='entropy',n_estimators=100,max_features=3,oob_score=True,bootstrap=True,n_jobs=-1,random_state=1)

  #Model fit
  rf1.fit(x_train,y_train)

  row = x_test.head(1)
  # print(row)

  rf1_pred=rf1.predict(x_test)

  # print(rf1_pred)
  from sklearn.model_selection import GridSearchCV
  from sklearn.metrics import accuracy_score
  from sklearn.metrics import matthews_corrcoef

  # Finding Accuracy Score
  # print('Accuracy Score:',accuracy_score(y_test,rf1_pred))

  # Matthews Correlation Coefficient
  mcc = matthews_corrcoef(y_test,rf1_pred)
  # print('Matthews_corrcoef for Model is:',mcc)

  #Feature importances
  features=df_dt.columns[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,16,17,18,19,20,21,22]]
  importances = rf1.feature_importances_
  indices = np.argsort(importances)

  plt.figure(1)
  plt.title('Feature Importances')
  plt.barh(range(len(indices)), importances[indices], color='b', align='center')
  plt.yticks(range(len(indices)), features[indices])
  plt.xlabel('Relative Importance')

  oob_error=1-rf1.oob_score_
  # print(oob_error)    #0.150

  params={
              'criterion':['gini','entropy'],
              'n_estimators':[50],
              'max_features':[2,3,4,5,6,7,8],
          }

  rf_gridcv=GridSearchCV(estimator=rf1,cv=5,param_grid=params,scoring='accuracy')
  rf_grid=rf_gridcv.fit(x_train,y_train)

  # print(rf_gridcv.best_params_)

  y_predrf=rf_gridcv.predict(x_test)

  # print(y_predrf)
  one_row1 = x_test.head(2)
  y_pred_one=rf_gridcv.predict(one_row1)
  # print(y_pred_one)

  x = x_test.head(1)
  # print(x)

  import pickle

  pickle.dump(rf_gridcv, open('model.pkl','wb'))

  model = pickle.load(open('model.pkl','rb'))
  print(model.predict(x))
  print("***********************************************")

  # 'Ticket_Category_0' = 0
  # 'Ticket_Category_1' = 1
  # 'Ticket_Category_2' = 1
  # 'Ticket_Category_3' = 0
  # 'Error_type_0' = 0
  # 'Error_type_1' = 1
  # 'Error_type_2' = 0
  # 'Error_type_3' = 0
  # 'Error_type_4' = 1
  # 'Error_type_5' = 0
  # 'Error_type_6' = 0
  # 'Error_detail_0' = 0
  # 'Error_detail_1' = 0
  # 'Error_detail_2' = 0
  # 'Error_detail_3' = 0
  # 'Error_detail_4' = 0
  # 'Error_detail_5' = 1
  # 'Doc_type_0' = 0
  # 'Doc_type_1' = 0
  # 'Doc_type_2' = 1
  # 'Doc_type_3' = 1
  # 'Doc_type_4' = 1
  #
  # query = [0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,1]

  # prediction = model.predict(query)
  # prediction = jsonify({'prediction': list(prediction)})
  # print(prediction)
  prediction = model.predict(x)
  return jsonify({'prediction': list(prediction)})
Example #19
df.describe().transpose() 
df.info()
df.describe(include='O')

#Count missing values
df.isna().sum()

#Drop unwanted column
df=df.drop(['Loan_ID'],axis=1)

# Impute categorical variables: start with Gender
df['Gender'].value_counts(dropna=False)  # shows the NA count for Gender separately

from sklearn_pandas import CategoricalImputer
imputer=CategoricalImputer()
df['Gender']=imputer.fit_transform(df['Gender'])

df['Married'].value_counts(dropna=False)
df['Married']=imputer.fit_transform(df['Married'])
df['Dependents'].value_counts(dropna=False)
df['Dependents']=imputer.fit_transform(df['Dependents'])
df['Self_Employed'].value_counts(dropna=False)
df['Self_Employed']=imputer.fit_transform(df['Self_Employed'])
df['Credit_History'].value_counts(dropna=False)
df['Credit_History']=imputer.fit_transform(df['Credit_History'])
df.isna().sum()

# Impute numeric data only
# Impute LoanAmount
df['LoanAmount'].isna().sum()
df['LoanAmount'].describe()
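The snippet breaks off before the numeric imputation it sets up; a plausible continuation (an assumption, not part of the original):

# Plausible continuation (assumption): fill LoanAmount with its median
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())
df.isna().sum()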
Example #20
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')
carsImputed = imp.fit_transform(dum_cars_miss)

df_carsImputed = pd.DataFrame(carsImputed,
                              columns= dum_cars_miss.columns)

dum_cars_miss.shape
carsImputed.shape
df_carsImputed.shape

# Categorical Imputing
from sklearn_pandas import CategoricalImputer
data = np.array(['a', 'b', 'b', np.nan], dtype=object)
imputer = CategoricalImputer()
imputer.fit_transform(data)

from sklearn_pandas import CategoricalImputer
data = np.array(['a', 'b', 'b', np.nan], dtype=object)
imputer = CategoricalImputer(strategy='constant',fill_value="Baby")
imputer.fit_transform(data)


import numpy as np
milk = pd.read_csv("F:/Python Material/Python Course/Datasets/milk.csv",index_col=0)
milk.head()
np.mean(milk), np.std(milk)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
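The example ends right after the scaler is created; a minimal completion sketch (assuming the intent was to standardize the milk data):

# Completion sketch (assumption): standardize and inspect the result
milk_scaled = pd.DataFrame(scaler.fit_transform(milk),
                           columns=milk.columns, index=milk.index)
np.mean(milk_scaled), np.std(milk_scaled)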
Example #21
class Preprocessor:
    """
        This class shall  be used to clean and transform the data before training.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None

        """
    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object

    def remove_columns(self, data, columns):
        """
                Method Name: remove_columns
                Description: This method removes the given columns from a pandas dataframe.
                Output: A pandas DataFrame after removing the specified columns.
                On Failure: Raise Exception

                Written By: iNeuron Intelligence
                Version: 1.0
                Revisions: None

        """
        self.logger_object.log(
            self.file_object,
            'Entered the remove_columns method of the Preprocessor class')
        self.data = data
        self.columns = columns
        try:
            self.useful_data = self.data.drop(
                labels=self.columns,
                axis=1)  # drop the labels specified in the columns
            self.logger_object.log(
                self.file_object,
                'Column removal Successful. Exited the remove_columns method of the Preprocessor class'
            )
            return self.useful_data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in remove_columns method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Column removal Unsuccessful. Exited the remove_columns method of the Preprocessor class'
            )
            raise Exception()

    def separate_label_feature(self, data, label_column_name):
        """
                        Method Name: separate_label_feature
                        Description: This method separates the features and the label columns.
                        Output: Returns two separate DataFrames, one containing the features and the other containing the labels.
                        On Failure: Raise Exception

                        Written By: iNeuron Intelligence
                        Version: 1.0
                        Revisions: None

                """
        self.logger_object.log(
            self.file_object,
            'Entered the separate_label_feature method of the Preprocessor class'
        )
        try:
            self.X = data.drop(
                labels=label_column_name, axis=1
            )  # drop the columns specified and separate the feature columns
            self.Y = data[label_column_name]  # Filter the Label columns
            self.logger_object.log(
                self.file_object,
                'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class'
            )
            return self.X, self.Y
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in separate_label_feature method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class'
            )
            raise Exception()

    def dropUnnecessaryColumns(self, data, columnNameList):
        """
                        Method Name: dropUnnecessaryColumns
                        Description: This method drops the unwanted columns as discussed in the EDA section.

                        Written By: iNeuron Intelligence
                        Version: 1.0
                        Revisions: None

                                """
        data = data.drop(columnNameList, axis=1)
        return data

    def replaceInvalidValuesWithNull(self, data):
        """
                               Method Name: replaceInvalidValuesWithNull
                               Description: This method replaces invalid values i.e. '?' with null, as discussed in EDA.

                               Written By: iNeuron Intelligence
                               Version: 1.0
                               Revisions: None

                                       """

        for column in data.columns:
            count = data[column][data[column] == '?'].count()
            if count != 0:
                data[column] = data[column].replace('?', np.nan)
        return data

    def is_null_present(self, data):
        """
                                Method Name: is_null_present
                                Description: This method checks whether there are null values present in the pandas Dataframe or not.
                                Output: Returns True if null values are present in the DataFrame, False if they are not present and
                                        returns the list of columns for which null values are present.
                                On Failure: Raise Exception

                                Written By: iNeuron Intelligence
                                Version: 1.0
                                Revisions: None

                        """
        self.logger_object.log(
            self.file_object,
            'Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            self.null_counts = data.isna().sum(
            )  # check for the count of null values per column
            for i in range(len(self.null_counts)):
                if self.null_counts[i] > 0:
                    self.null_present = True
                    self.cols_with_missing_values.append(self.cols[i])
            if (self.null_present
                ):  # write the logs to see which columns have null values
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(
                    data.isna().sum())
                self.dataframe_with_null.to_csv(
                    'preprocessing_data/null_values.csv'
                )  # storing the null column information to file
            self.logger_object.log(
                self.file_object,
                'Finding missing values is a success. Data written to the null values file. Exited the is_null_present method of the Preprocessor class'
            )
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in is_null_present method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Finding missing values failed. Exited the is_null_present method of the Preprocessor class'
            )
            raise Exception()

    def encodeCategoricalValues(self, data):
        """
                                        Method Name: encodeCategoricalValues
                                        Description: This method encodes all the categorical values in the training set.
                                        Output: A Dataframe which has all the categorical values encoded.
                                        On Failure: Raise Exception

                                        Written By: iNeuron Intelligence
                                        Version: 1.0
                                        Revisions: None
                     """
        data["class"] = data["class"].map({'p': 1, 'e': 2})

        for column in data.drop(['class'], axis=1).columns:
            data = pd.get_dummies(data, columns=[column])

        return data

    def encodeCategoricalValuesPrediction(self, data):
        """
                                               Method Name: encodeCategoricalValuesPrediction
                                               Description: This method encodes all the categorical values in the prediction set.
                                               Output: A Dataframe which has all the categorical values encoded.
                                               On Failure: Raise Exception

                                               Written By: iNeuron Intelligence
                                               Version: 1.0
                                               Revisions: None
                            """

        for column in data.columns:
            data = pd.get_dummies(data, columns=[column])

        return data

    # def handleImbalanceDataset(self,X,Y):
    #     """
    #                                                   Method Name: handleImbalanceDataset
    #                                                   Description: This method handles the imbalance in the dataset by oversampling.
    #                                                   Output: A Dataframe which is balanced now.
    #                                                   On Failure: Raise Exception
    #
    #                                                   Written By: iNeuron Intelligence
    #                                                   Version: 1.0
    #                                                   Revisions: None
    #                                """
    #
    #
    #
    #     rdsmple = RandomOverSampler()
    #     x_sampled, y_sampled = rdsmple.fit_sample(X, Y)
    #
    #     return x_sampled,y_sampled

    def impute_missing_values(self, data, cols_with_missing_values):
        """
                                        Method Name: impute_missing_values
                                        Description: This method replaces all the missing categorical values in the DataFrame using the CategoricalImputer.
                                        Output: A Dataframe which has all the missing values imputed.
                                        On Failure: Raise Exception

                                        Written By: iNeuron Intelligence
                                        Version: 1.0
                                        Revisions: None
                     """
        self.logger_object.log(
            self.file_object,
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self.logger_object.log(
                self.file_object,
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in impute_missing_values method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
            )
            raise Exception()

    def get_columns_with_zero_std_deviation(self, data):
        """
                                                Method Name: get_columns_with_zero_std_deviation
                                                Description: This method finds out the columns which have a standard deviation of zero.
                                                Output: List of the columns with standard deviation of zero
                                                On Failure: Raise Exception

                                                Written By: iNeuron Intelligence
                                                Version: 1.0
                                                Revisions: None
                             """
        self.logger_object.log(
            self.file_object,
            'Entered the get_columns_with_zero_std_deviation method of the Preprocessor class'
        )
        self.columns = data.columns
        self.data_n = data.describe()
        self.col_to_drop = []
        try:
            for x in self.columns:
                if (self.data_n[x]['std'] == 0
                    ):  # check if standard deviation is zero
                    self.col_to_drop.append(
                        x
                    )  # prepare the list of columns with standard deviation zero
            self.logger_object.log(
                self.file_object,
                'Column search for Standard Deviation of Zero Successful. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class'
            )
            return self.col_to_drop

        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in get_columns_with_zero_std_deviation method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Column search for Standard Deviation of Zero Failed. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class'
            )
            raise Exception()
Example #22
class Preprocessor:
    """
        This class shall  be used to clean and transform the data before training.

        """
    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object

    def remove_unwanted_spaces(self, data):
        """
                        Method Name: remove_unwanted_spaces
                        Description: This method removes the unwanted spaces from a pandas dataframe.
                        Output: A pandas DataFrame after removing the spaces.
                        On Failure: Raise Exception

                """
        self.logger_object.log(
            self.file_object,
            'Entered the remove_unwanted_spaces method of the Preprocessor class'
        )
        self.data = data

        try:
            self.df_without_spaces = self.data.apply(
                lambda x: x.str.strip() if x.dtype == "object" else x
            )  # strip leading/trailing spaces from every object column
            self.logger_object.log(
                self.file_object,
                'Unwanted spaces removal Successful. Exited the remove_unwanted_spaces method of the Preprocessor class'
            )
            return self.df_without_spaces
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in remove_unwanted_spaces method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'unwanted space removal Unsuccessful. Exited the remove_unwanted_spaces method of the Preprocessor class'
            )
            raise Exception()

    def remove_columns(self, data, columns):
        """
                Method Name: remove_columns
                Description: This method removes the given columns from a pandas dataframe.
                Output: A pandas DataFrame after removing the specified columns.
                On Failure: Raise Exception
        """
        self.logger_object.log(
            self.file_object,
            'Entered the remove_columns method of the Preprocessor class')
        self.data = data
        self.columns = columns
        try:
            self.useful_data = self.data.drop(
                labels=self.columns,
                axis=1)  # drop the labels specified in the columns
            self.logger_object.log(
                self.file_object,
                'Column removal Successful. Exited the remove_columns method of the Preprocessor class'
            )
            return self.useful_data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in remove_columns method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Column removal Unsuccessful. Exited the remove_columns method of the Preprocessor class'
            )
            raise Exception()

    def separate_label_feature(self, data, label_column_name):
        """
                        Method Name: separate_label_feature
                        Description: This method separates the features and the label columns.
                        Output: Returns two separate DataFrames, one containing the features and the other containing the labels.
                        On Failure: Raise Exception
                """
        self.logger_object.log(
            self.file_object,
            'Entered the separate_label_feature method of the Preprocessor class'
        )
        try:
            self.X = data.drop(
                labels=label_column_name, axis=1
            )  # drop the columns specified and separate the feature columns
            self.Y = data[label_column_name]  # Filter the Label columns
            self.logger_object.log(
                self.file_object,
                'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class'
            )
            return self.X, self.Y
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in separate_label_feature method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class'
            )
            raise Exception()

    def is_null_present(self, data):
        """
                                Method Name: is_null_present
                                Description: This method checks whether there are null values present in the pandas Dataframe or not.
                                Output: Returns True if null values are present in the DataFrame, False if they are not present and
                                        returns the list of columns for which null values are present.
                                On Failure: Raise Exception

                        """
        self.logger_object.log(
            self.file_object,
            'Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            self.null_counts = data.isna().sum(
            )  # check for the count of null values per column
            for i in range(len(self.null_counts)):
                if self.null_counts[i] > 0:
                    self.null_present = True
                    self.cols_with_missing_values.append(self.cols[i])
            if (self.null_present
                ):  # write the logs to see which columns have null values
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(
                    data.isna().sum())
                self.dataframe_with_null.to_csv(
                    'preprocessing_data/null_values.csv'
                )  # storing the null column information to file
            self.logger_object.log(
                self.file_object,
                'Finding missing values is a success. Data written to the null values file. Exited the is_null_present method of the Preprocessor class'
            )
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in is_null_present method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Finding missing values failed. Exited the is_null_present method of the Preprocessor class'
            )
            raise Exception()

    def impute_missing_values(self, data, cols_with_missing_values):
        """
                                        Method Name: impute_missing_values
                                        Description: This method replaces all the missing categorical values in the DataFrame using the CategoricalImputer.
                                        Output: A Dataframe which has all the missing values imputed.
                                        On Failure: Raise Exception
                     """
        self.logger_object.log(
            self.file_object,
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self.logger_object.log(
                self.file_object,
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in impute_missing_values method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
            )
            raise Exception()

    def scale_numerical_columns(self, data):
        """
                                                        Method Name: scale_numerical_columns
                                                        Description: This method scales the numerical values using the Standard scaler.
                                                        Output: A dataframe with scaled
                                                        On Failure: Raise Exception

                                     """
        self.logger_object.log(
            self.file_object,
            'Entered the scale_numerical_columns method of the Preprocessor class'
        )

        self.data = data

        try:
            self.num_df = self.data.select_dtypes(include=['int64']).copy()
            self.scaler = StandardScaler()
            self.scaled_data = self.scaler.fit_transform(self.num_df)
            self.scaled_num_df = pd.DataFrame(data=self.scaled_data,
                                              columns=self.num_df.columns)

            self.logger_object.log(
                self.file_object,
                'scaling for numerical values successful. Exited the scale_numerical_columns method of the Preprocessor class'
            )
            return self.scaled_num_df

        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in scale_numerical_columns method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'scaling for numerical columns Failed. Exited the scale_numerical_columns method of the Preprocessor class'
            )
            raise Exception()

    def encode_categorical_columns(self, data):
        """
                                                Method Name: encode_categorical_columns
                                                Description: This method encodes the categorical values to numeric values.
                                                Output: only the columns with categorical values converted to numerical values
                                                On Failure: Raise Exception
                             """
        self.logger_object.log(
            self.file_object,
            'Entered the encode_categorical_columns method of the Preprocessor class'
        )

        try:
            self.cat_df = data.select_dtypes(include=['object']).copy()
            # Use dummy encoding to convert the categorical columns to numerical ones
            for col in self.cat_df.columns:
                self.cat_df = pd.get_dummies(self.cat_df,
                                             columns=[col],
                                             prefix=[col],
                                             drop_first=True)

            self.logger_object.log(
                self.file_object,
                'encoding for categorical values successful. Exited the encode_categorical_columns method of the Preprocessor class'
            )
            return self.cat_df

        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in encode_categorical_columns method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'encoding for categorical columns Failed. Exited the encode_categorical_columns method of the Preprocessor class'
            )
            raise Exception()

    def handle_imbalanced_dataset(self, x, y):
        """
        Method Name: handle_imbalanced_dataset
        Description: This method handles the imbalanced dataset to make it a balanced one.
        Output: new balanced feature and target columns
        On Failure: Raise Exception
                                     """
        self.logger_object.log(
            self.file_object,
            'Entered the handle_imbalanced_dataset method of the Preprocessor class'
        )

        try:
            self.rdsmple = RandomOverSampler()
            # fit_sample was renamed fit_resample in newer imbalanced-learn releases
            self.x_sampled, self.y_sampled = self.rdsmple.fit_sample(x, y)
            self.logger_object.log(
                self.file_object,
                'dataset balancing successful. Exited the handle_imbalanced_dataset method of the Preprocessor class'
            )
            return self.x_sampled, self.y_sampled

        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in handle_imbalanced_dataset method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'dataset balancing Failed. Exited the handle_imbalanced_dataset method of the Preprocessor class'
            )
            raise Exception()
"""Deal With Missing Data
The missingno library provides a neat way to showcase which variables have
missing data. This is done below using a bar chart. I will then proceed to use
Pandas fillna method to fill the two columns that have missing data (Item_Weight, Outlet_Size)
"""
msno.bar(train_data)
msno.bar(test_data)

train_data['Item_Weight'].fillna(train_data['Item_Weight'].mean(),
                                 inplace=True)
test_data['Item_Weight'].fillna(test_data['Item_Weight'].mean(), inplace=True)

outlet_size_tr = train_data['Outlet_Size']
outlet_size_ts = test_data['Outlet_Size']
imputer1 = CategoricalImputer()
outlet_size_tr = imputer1.fit_transform(outlet_size_tr)
# note: this refits on the test column; imputer1.transform would avoid test-set leakage
outlet_size_ts = imputer1.fit_transform(outlet_size_ts)

train_data = train_data.drop(['Outlet_Size'], axis=1)
train_data.insert(8, 'Outlet_Size', outlet_size_tr)

test_data = test_data.drop(['Outlet_Size'], axis=1)
test_data.insert(8, 'Outlet_Size', outlet_size_ts)

# Let's see if there are any columns we can drop

cor = train_data.corr()
cor["Item_Outlet_Sales"].sort_values(ascending=False)

# The year that an outlet was established has a very low correlation figure
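A natural follow-up, given the low correlation just noted (the column name below is an assumption based on the standard BigMart dataset):

# Assumed column name; drop the weakly correlated establishment-year column
train_data = train_data.drop(['Outlet_Establishment_Year'], axis=1)
test_data = test_data.drop(['Outlet_Establishment_Year'], axis=1)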
Example #24
def fill_empty(frame):
    imputer = CategoricalImputer()

    return frame.apply(lambda x: imputer.fit_transform(x), axis=0)
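A quick usage sketch with illustrative data:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': ['x', np.nan, 'x'], 'b': ['u', 'v', np.nan]})
fill_empty(df)   # each column's NaN is replaced by that column's mode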
Example #25
sns.heatmap(df_vis.isnull(), cbar=False)
df_vis.isnull().sum()
df_vis.isna().sum()

df_vis['HandsetPrice'] = df_vis['HandsetPrice'].replace('Unknown', -1)
df_vis['HandsetPrice']
df_vis['HandsetPrice'] = df_vis['HandsetPrice'].astype('int')
df_vis['HandsetPrice'] = df_vis['HandsetPrice'].replace(-1, np.nan)
df_vis['HandsetPrice']
df_vis.Churn.value_counts()

# Missing values imputation
temp = df_vis
temp = temp.fillna(temp.mean(numeric_only=True))
imputer = CategoricalImputer(missing_values='NaN', strategy='most_frequent')
# Assign the result back; fit_transform does not modify the column in place.
temp['ServiceArea'] = imputer.fit_transform(temp['ServiceArea'])
temp = temp.apply(lambda x: x.fillna(x.value_counts().index[0]))
temp.isna().sum()

# Statistical analysis: chi-square test of each categorical column vs Churn
nr, nc = temp.shape
for j in range(nc):
    # Comparing a dtype against np.number never matches a concrete dtype;
    # pandas' is_numeric_dtype is the reliable check for numeric columns.
    if not pd.api.types.is_numeric_dtype(temp.iloc[:, j]):
        xx = temp.iloc[:, j]
        yy = temp['Churn']
        ct = pd.crosstab(xx, yy)
        ch = chi2_contingency(ct)
        print('Chi-square result for', temp.columns[j], 'is', ch)
    else:
        print(temp.columns[j], 'is numeric, skipping the chi-square test')
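# chi2_contingency returns (statistic, p-value, dof, expected frequencies).
# A sketch of keeping only the columns whose p-value clears a threshold;
# the 0.05 cutoff is an assumed convention, not from the original analysis:
significant = []
for col in temp.columns:
    if col != 'Churn' and not pd.api.types.is_numeric_dtype(temp[col]):
        _, p, _, _ = chi2_contingency(pd.crosstab(temp[col], temp['Churn']))
        if p < 0.05:
            significant.append(col)
print('Columns associated with Churn at the 5% level:', significant)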
def doprediction():
    info = request.data
    json_data = json.loads(info)
    meldrange = json_data["meldrange"]
    meldrange = float(meldrange)
    donor_data = json_data["donor"]
    dolen = len(donor_data)
    allrecip_data = json_data["allrecip"]
    allrecip_len = len(allrecip_data)
    donor_df = pd.DataFrame(data=donor_data[1:dolen], columns=donor_data[0])
    allrecip_df = pd.DataFrame(data=allrecip_data[1:allrecip_len],
                               columns=allrecip_data[0])

    filename = 'datafile/donorfile.csv'
    filename2 = 'datafile/recipfile.csv'
    silentremove(filename)
    silentremove(filename2)
    donor_df.to_csv(filename, encoding='utf-8')
    allrecip_df.to_csv(filename2, encoding='utf-8')
    # start to impute --------------------------------------

    donor_df = pd.read_csv('datafile/donorfile.csv', index_col=0)
    recipient_df = pd.read_csv('datafile/recipfile.csv', index_col=0)
    id_df = pd.DataFrame(
        recipient_df[['recipient_id', 'FINAL_MELD_PELD_LAB_SCORE']])
    X_cf_r = recipient_df.select_dtypes(include=['object'])
    X_ncf_r = recipient_df.select_dtypes(exclude=['object'])

    X_cf_d = donor_df.select_dtypes(include=['object'])
    X_ncf_d = donor_df.select_dtypes(exclude=['object'])

    # CategoricalImputer handles a single column at a time, so impute the
    # categorical frames column-wise instead of passing a whole 2-D array.
    imp_cat = CategoricalImputer()
    X_cf_r = X_cf_r.apply(imp_cat.fit_transform, axis=0)

    imp_cat = CategoricalImputer()
    X_cf_d = X_cf_d.apply(imp_cat.fit_transform, axis=0)

    # sklearn's old Imputer was removed in scikit-learn 0.22; SimpleImputer
    # is the current equivalent for column-wise mean imputation.
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(X_ncf_r)
    X_ncf_r = pd.DataFrame(imp.transform(X_ncf_r), columns=X_ncf_r.columns)

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(X_ncf_d)
    X_ncf_d = pd.DataFrame(imp.transform(X_ncf_d), columns=X_ncf_d.columns)

    recipient_df = pd.merge(X_ncf_r, X_cf_r, left_index=True, right_index=True)
    # donor_df = pd.merge(X_ncf_d, X_cf_d, left_index=True, right_index=True)

    if meldrange != 200:
        id_df = id_df.loc[(id_df['FINAL_MELD_PELD_LAB_SCORE'] < meldrange) & (
            id_df['FINAL_MELD_PELD_LAB_SCORE'] >= meldrange - 20)]
        recipient_df = recipient_df.loc[
            (recipient_df['FINAL_MELD_PELD_LAB_SCORE'] < meldrange)
            & (recipient_df['FINAL_MELD_PELD_LAB_SCORE'] >= meldrange - 20.0)]

    X_cf_r = recipient_df.select_dtypes(include=['object'])

    X_ncf_r = recipient_df.select_dtypes(exclude=['object'])

    min_max_scaler = preprocessing.MinMaxScaler()
    header = X_ncf_d.columns
    X_ncf_d = min_max_scaler.fit_transform(X_ncf_d)
    X_ncf_d = pd.DataFrame(X_ncf_d, columns=header)

    min_max_scaler = preprocessing.MinMaxScaler()
    header = X_ncf_r.columns
    X_ncf_r = min_max_scaler.fit_transform(X_ncf_r)

    X_ncf_r = pd.DataFrame(X_ncf_r, columns=header)
    X_ncf_r.index = X_cf_r.index
    recipient_df = pd.merge(X_ncf_r, X_cf_r, left_index=True, right_index=True)
    print("recipdf", recipient_df)
    donor_df = pd.merge(X_ncf_d, X_cf_d, left_index=True, right_index=True)

    filename = 'datafile/donorfile.csv'
    filename2 = 'datafile/recipfile.csv'
    filename3 = 'datafile/recipidfile.csv'
    silentremove(filename)
    silentremove(filename2)
    silentremove(filename3)
    donor_df.to_csv(filename, encoding='utf-8')

    print("meldrange", meldrange)

    id_df = pd.DataFrame(id_df['recipient_id'], columns=['recipient_id'])

    recipient_df.to_csv(filename2, encoding='utf-8')
    id_df.to_csv(filename3, encoding='utf-8')

    import prediction
    match_score = prediction.matching()
    predict_score = prediction.predictscore()
    return json.dumps({'match': match_score, 'predict': predict_score})
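# The handler above expects a JSON body shaped like the following sketch
# (values are illustrative; the real donor/recipient column headers are not
# shown here, but each table arrives as a header row followed by data rows):
#
#   {
#     "meldrange": "200",
#     "donor":    [["donor_col_1", "donor_col_2"], ["value", 0.5]],
#     "allrecip": [["recipient_id", "FINAL_MELD_PELD_LAB_SCORE"], ["R1", 23.0]]
#   }
#
# meldrange == 200 disables the MELD-score window filter applied above.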
class Preprocessor:
    """
        This class shall  be used to clean and transform the data before training.

        """
    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object

    def remove_unwanted_spaces(self, data):
        """
                        Method Name: remove_unwanted_spaces
                        Description: This method removes the unwanted spaces from a pandas dataframe.
                        Output: A pandas DataFrame after removing the spaces.
                        On Failure: Raise Exception

                """
        self.logger_object.log(
            self.file_object,
            'Entered the remove_unwanted_spaces method of the Preprocessor class'
        )
        self.data = data

        try:
            self.df_without_spaces = self.data.apply(
                lambda x: x.str.strip() if x.dtype == "object" else x
            )  # strip leading/trailing spaces from string columns
            self.logger_object.log(
                self.file_object,
                'Unwanted spaces removal successful. Exited the remove_unwanted_spaces method of the Preprocessor class'
            )
            return self.df_without_spaces
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in remove_unwanted_spaces method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'unwanted space removal Unsuccessful. Exited the remove_unwanted_spaces method of the Preprocessor class'
            )
            raise Exception()

    def remove_columns(self, data, columns):
        """
                Method Name: remove_columns
                Description: This method removes the given columns from a pandas dataframe.
                Output: A pandas DataFrame after removing the specified columns.
                On Failure: Raise Exception

        """
        self.logger_object.log(
            self.file_object,
            'Entered the remove_columns method of the Preprocessor class')
        self.data = data
        self.columns = columns
        try:
            self.useful_data = self.data.drop(
                labels=self.columns,
                axis=1)  # drop the labels specified in the columns
            self.logger_object.log(
                self.file_object,
                'Column removal successful. Exited the remove_columns method of the Preprocessor class'
            )
            return self.useful_data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in remove_columns method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Column removal Unsuccessful. Exited the remove_columns method of the Preprocessor class'
            )
            raise Exception()

    def separate_label_feature(self, data, label_column_name):
        """
                        Method Name: separate_label_feature
                        Description: This method separates the features and a Label Coulmns.
                        Output: Returns two separate Dataframes, one containing features and the other containing Labels .
                        On Failure: Raise Exception

                """
        self.logger_object.log(
            self.file_object,
            'Entered the separate_label_feature method of the Preprocessor class'
        )
        try:
            self.X = data.drop(
                labels=label_column_name, axis=1
            )  # drop the columns specified and separate the feature columns
            self.Y = data[label_column_name]  # Filter the Label columns
            self.logger_object.log(
                self.file_object,
                'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class'
            )
            return self.X, self.Y
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in separate_label_feature method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class'
            )
            raise Exception()

    def is_null_present(self, data):
        """
                                Method Name: is_null_present
                                Description: This method checks whether there are null values present in the pandas Dataframe or not.
                                Output: Returns True if null values are present in the DataFrame, False if they are not present and
                                        returns the list of columns for which null values are present.
                                On Failure: Raise Exception

                        """
        self.logger_object.log(
            self.file_object,
            'Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            self.null_counts = data.isna().sum()  # count of null values per column
            for col, count in self.null_counts.items():
                if count > 0:
                    self.null_present = True
                    self.cols_with_missing_values.append(col)
            if self.null_present:  # record which columns have null values
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(
                    data.isna().sum())
                self.dataframe_with_null.to_csv(
                    'preprocessing_data/null_values.csv'
                )  # storing the null column information to file
            self.logger_object.log(
                self.file_object,
                'Finding missing values is a success. Data written to the null values file. Exited the is_null_present method of the Preprocessor class'
            )
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in is_null_present method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Finding missing values failed. Exited the is_null_present method of the Preprocessor class'
            )
            raise Exception()

    def impute_missing_values(self, data, cols_with_missing_values):
        """
                                        Method Name: impute_missing_values
                                        Description: This method replaces all the missing values in the Dataframe using KNN Imputer.
                                        Output: A Dataframe which has all the missing values imputed.
                                        On Failure: Raise Exception

                     """
        self.logger_object.log(
            self.file_object,
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self.logger_object.log(
                self.file_object,
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in impute_missing_values method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
            )
            raise Exception()

    def scale_numerical_columns(self, data):
        """
                                                        Method Name: scale_numerical_columns
                                                        Description: This method scales the numerical values using the Standard scaler.
                                                        Output: A dataframe with scaled values
                                                        On Failure: Raise Exception

                                     """
        self.logger_object.log(
            self.file_object,
            'Entered the scale_numerical_columns method of the Preprocessor class'
        )

        self.data = data
        self.num_df = self.data[[
            'months_as_customer', 'policy_deductable', 'umbrella_limit',
            'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
            'number_of_vehicles_involved', 'bodily_injuries', 'witnesses',
            'injury_claim', 'property_claim', 'vehicle_claim'
        ]]

        try:

            self.scaler = StandardScaler()
            self.scaled_data = self.scaler.fit_transform(self.num_df)
            self.scaled_num_df = pd.DataFrame(data=self.scaled_data,
                                              columns=self.num_df.columns,
                                              index=self.data.index)
            self.data.drop(columns=self.scaled_num_df.columns, inplace=True)
            self.data = pd.concat([self.scaled_num_df, self.data], axis=1)

            self.logger_object.log(
                self.file_object,
                'scaling for numerical values successful. Exited the scale_numerical_columns method of the Preprocessor class'
            )
            return self.data

        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in scale_numerical_columns method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'scaling for numerical columns Failed. Exited the scale_numerical_columns method of the Preprocessor class'
            )
            raise Exception()

    def encode_categorical_columns(self, data):
        """
                                                Method Name: encode_categorical_columns
                                                Description: This method encodes the categorical values to numeric values.
                                                Output: dataframe with categorical values converted to numerical values
                                                On Failure: Raise Exception

                             """
        self.logger_object.log(
            self.file_object,
            'Entered the encode_categorical_columns method of the Preprocessor class'
        )

        self.data = data
        try:
            self.cat_df = self.data.select_dtypes(include=['object']).copy()
            self.cat_df['policy_csl'] = self.cat_df['policy_csl'].map({
                '100/300': 1,
                '250/500': 2.5,
                '500/1000': 5
            })
            self.cat_df['insured_education_level'] = self.cat_df[
                'insured_education_level'].map({
                    'JD': 1,
                    'High School': 2,
                    'College': 3,
                    'Masters': 4,
                    'Associate': 5,
                    'MD': 6,
                    'PhD': 7
                })
            self.cat_df['incident_severity'] = self.cat_df[
                'incident_severity'].map({
                    'Trivial Damage': 1,
                    'Minor Damage': 2,
                    'Major Damage': 3,
                    'Total Loss': 4
                })
            self.cat_df['insured_sex'] = self.cat_df['insured_sex'].map({
                'FEMALE': 0,
                'MALE': 1
            })
            self.cat_df['property_damage'] = self.cat_df[
                'property_damage'].map({
                    'NO': 0,
                    'YES': 1
                })
            self.cat_df['police_report_available'] = self.cat_df[
                'police_report_available'].map({
                    'NO': 0,
                    'YES': 1
                })
            try:
                # Training data: the target column is present, so map it too.
                self.cat_df['fraud_reported'] = self.cat_df[
                    'fraud_reported'].map({'N': 0, 'Y': 1})
                self.cols_to_drop = [
                    'policy_csl', 'insured_education_level',
                    'incident_severity', 'insured_sex', 'property_damage',
                    'police_report_available', 'fraud_reported'
                ]
            except KeyError:
                # Prediction data: no target column to map or drop.
                self.cols_to_drop = [
                    'policy_csl', 'insured_education_level',
                    'incident_severity', 'insured_sex', 'property_damage',
                    'police_report_available'
                ]
            # Using the dummy encoding to encode the categorical columns to numerical ones

            for col in self.cat_df.drop(columns=self.cols_to_drop).columns:
                self.cat_df = pd.get_dummies(self.cat_df,
                                             columns=[col],
                                             prefix=[col],
                                             drop_first=True)

            self.data.drop(
                columns=self.data.select_dtypes(include=['object']).columns,
                inplace=True)
            self.data = pd.concat([self.cat_df, self.data], axis=1)
            self.logger_object.log(
                self.file_object,
                'encoding for categorical values successful. Exited the encode_categorical_columns method of the Preprocessor class'
            )
            return self.data

        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in encode_categorical_columns method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'encoding for categorical columns Failed. Exited the encode_categorical_columns method of the Preprocessor class'
            )
            raise Exception()

    def handle_imbalanced_dataset(self, x, y):
        """
        Method Name: handle_imbalanced_dataset
        Description: This method handles the imbalanced dataset to make it a balanced one.
        Output: new balanced feature and target columns
        On Failure: Raise Exception

                                     """
        self.logger_object.log(
            self.file_object,
            'Entered the handle_imbalanced_dataset method of the Preprocessor class'
        )

        try:
            self.rdsmple = RandomOverSampler()
            # imblearn renamed fit_sample to fit_resample; the old name was
            # removed in imbalanced-learn 0.8.
            self.x_sampled, self.y_sampled = self.rdsmple.fit_resample(x, y)
            self.logger_object.log(
                self.file_object,
                'dataset balancing successful. Exited the handle_imbalanced_dataset method of the Preprocessor class'
            )
            return self.x_sampled, self.y_sampled

        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in handle_imbalanced_dataset method of the Preprocessor class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'dataset balancing Failed. Exited the handle_imbalanced_dataset method of the Preprocessor class'
            )
            raise Exception()
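# A minimal usage sketch for the Preprocessor above. The project's file and
# logger objects are not shown in this snippet, so a trivial console logger
# stands in for them, and `df` is assumed to be the already-loaded insurance
# claims DataFrame:
class _ConsoleLogger:
    def log(self, file_object, message):
        print(message)

preprocessor = Preprocessor(file_object=None, logger_object=_ConsoleLogger())
df = preprocessor.remove_unwanted_spaces(df)
null_present, missing_cols = preprocessor.is_null_present(df)
if null_present:
    df = preprocessor.impute_missing_values(df, missing_cols)
df = preprocessor.scale_numerical_columns(df)
df = preprocessor.encode_categorical_columns(df)
X, y = preprocessor.separate_label_feature(df, label_column_name='fraud_reported')
X, y = preprocessor.handle_imbalanced_dataset(X, y)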
    def transform(self, X):
        # Impute each configured categorical feature with its most frequent value.
        for var in config.CAT_FEATURES:
            imputer = CategoricalImputer()
            X[var] = imputer.fit_transform(X[var])
        return X
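# For context, a sketch of the transformer class such a method usually sits
# in; the class name and fit signature are assumptions, since only transform()
# survives in this excerpt, and config.CAT_FEATURES is assumed to be a list of
# categorical column names:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn_pandas import CategoricalImputer

class MostFrequentCategoricalImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Nothing to learn up front; an imputer is fitted per column below.
        return self

    def transform(self, X):
        X = X.copy()  # avoid mutating the caller's frame
        for var in config.CAT_FEATURES:
            imputer = CategoricalImputer()
            X[var] = imputer.fit_transform(X[var])
        return X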
Example #29
0
    def encodeCategoricalValuesPrediction(self,data):
        """
                                               Method Name: encodeCategoricalValuesPrediction
                                               Description: This method encodes all the categorical values in the prediction set.
                                               Output: A Dataframe which has all the categorical values encoded.
                                               On Failure: Raise Exception

                                               Written By: Ajinkya Abhang
                                               Version: 1.0
                                               Revisions: None
                            """

        # We can impute the categorical values like below:
        features_nan = [feature for feature in data.columns if
                        data[feature].isnull().sum() > 0 and data[feature].dtypes == 'O']

        imputer = CategoricalImputer()

        if len(features_nan) != 0:
            for cat_feature in features_nan:
                data[cat_feature] = imputer.fit_transform(data[cat_feature])

        # We can impute the non-categorical values like below:
        numerical_with_nan = [feature for feature in data.columns if
                              data[feature].isnull().sum() > 0 and data[feature].dtypes != 'O']

        if len(numerical_with_nan) != 0:
            imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
            data[numerical_with_nan] = imputer.fit_transform(data[numerical_with_nan])

        # Manually build the dummy (one-hot) columns that the trained model expects
        df_new = pd.DataFrame({
            'laundry_options_1': [np.nan] * data.shape[0],
            'laundry_options_2': [np.nan] * data.shape[0],
            'laundry_options_3': [np.nan] * data.shape[0],
            'laundry_options_4': [np.nan] * data.shape[0],
            'parking_options_1': [np.nan] * data.shape[0],
            'parking_options_2': [np.nan] * data.shape[0],
            'parking_options_3': [np.nan] * data.shape[0],
            'parking_options_4': [np.nan] * data.shape[0],
            'parking_options_5': [np.nan] * data.shape[0],
            'parking_options_6': [np.nan] * data.shape[0]
        })

        dat = pd.concat([data, df_new], axis=1)

        # Fill the dummy columns from lookup tables rather than row-by-row
        # if/elif chains; .loc assignment also avoids the chained-assignment
        # pattern dat['col'][i] = value, which pandas does not guarantee to
        # write through.
        laundry_cols = ['laundry_options_1', 'laundry_options_2',
                        'laundry_options_3', 'laundry_options_4']
        laundry_map = {
            'laundry on site':    [1, 0, 0, 0],
            'no laundry on site': [0, 1, 0, 0],
            'w/d hookups':        [0, 0, 1, 0],
            'w/d in unit':        [0, 0, 0, 1],
            'laundry in bldg':    [0, 0, 0, 0],
        }
        for option, pattern in laundry_map.items():
            dat.loc[dat['laundry_options'] == option, laundry_cols] = pattern

        parking_cols = ['parking_options_1', 'parking_options_2',
                        'parking_options_3', 'parking_options_4',
                        'parking_options_5', 'parking_options_6']
        parking_map = {
            'carport':            [1, 0, 0, 0, 0, 0],
            'detached garage':    [0, 1, 0, 0, 0, 0],
            'no parking':         [0, 0, 1, 0, 0, 0],
            'off-street parking': [0, 0, 0, 1, 0, 0],
            'street parking':     [0, 0, 0, 0, 1, 0],
            'valet parking':      [0, 0, 0, 0, 0, 1],
            'attached garage':    [0, 0, 0, 0, 0, 0],
        }
        for option, pattern in parking_map.items():
            dat.loc[dat['parking_options'] == option, parking_cols] = pattern

        dat.drop(['laundry_options', 'parking_options'], axis=1, inplace=True)

        return dat
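# Design note: pd.get_dummies on the two raw columns would be shorter, but at
# prediction time the dummy columns must match the training schema even when a
# category is absent from the incoming batch. A reindex sketch (the
# training_dummy_columns list is assumed, not part of the original code):
#
#   dummies = pd.get_dummies(data[['laundry_options', 'parking_options']])
#   dummies = dummies.reindex(columns=training_dummy_columns, fill_value=0)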
Example #30
0
# Extract the month of purchase as a separate feature; the raw date column
# is discarded below.
data['MonthOfPurchase'] = pd.DatetimeIndex(data['PurchDate']).month

# Dropping: a) Attributes not providing actual information.
#           b) Attributes with high missing values.
#           c) Some of the highly correlated attributes.
data.drop(["RefId", "PurchDate", "VehYear", "Model", \
  "SubModel", "WheelType", \
  "PRIMEUNIT", "AUCGUART"], axis = 1, inplace = True)

# Imputing categorical columns with their most frequent values.
categorical_feature_mask = data.dtypes == object
categorical_cols = data.columns[categorical_feature_mask].tolist()
catImputer = CategoricalImputer(strategy='most_frequent')
for col in categorical_cols:
    data[col] = catImputer.fit_transform(data[col])

# Imputing numerical columns with median values. The -1 round trip below
# casts the data to int64 first (NaN cannot be stored in an integer column)
# and then restores the missing entries before imputation.
numerical_cols = data.columns.drop(categorical_cols).tolist()
data[numerical_cols] = data[numerical_cols].fillna(-1)
data[numerical_cols] = data[numerical_cols].astype(np.int64)
data[numerical_cols] = data[numerical_cols].replace(-1, np.nan)
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

# One-hot encoding categorical data to dummy attributes; passing the full
# frame with columns= keeps the numerical attributes alongside the dummies.
data = pd.get_dummies(data, columns=categorical_cols)

# Standardizing our data, so as to follow normal distribution with
# zero mean and unit variance. This primarily helps when applying