def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        """Impute missing values in the categorical columns of ``inputs``.

        Operates on the columns selected by the ``use_columns`` hyperparam,
        filtered to CATEGORICALS.  Missing entries are identified by the
        empty string ``""``.  Columns where the configured strategy fails
        (all values missing) are retried with the ``"constant"`` strategy,
        unless ``error_on_empty`` is set, in which case the original
        ``ValueError`` propagates.

        Returns a ``CallResult`` wrapping a copy of ``inputs`` with imputed
        columns, or the unmodified input when no columns apply.
        """
        logger.debug(f"Running {__name__}")

        # determine columns to operate on
        cols = distil_utils.get_operating_columns(
            inputs, self.hyperparams["use_columns"], CATEGORICALS)

        logger.debug(f"Found {len(cols)} categorical columns to evaluate")

        # Nothing to impute -- pass the input through untouched.
        # (Fixed: original used `is 0`, an identity check on an int that only
        # works because CPython interns small integers; use value equality.)
        if len(cols) == 0:
            return base.CallResult(inputs)

        imputer = CategoricalImputer(
            strategy=self.hyperparams["strategy"],
            fill_value=self.hyperparams["fill_value"],
            missing_values="",
            tie_breaking="first",
        )
        outputs = inputs.copy()
        failures: List[int] = []
        for c in cols:
            input_col = inputs.iloc[:, c]
            try:
                imputer.fit(input_col)
                result = imputer.transform(input_col)
                outputs.iloc[:, c] = result
            except ValueError:
                # value error gets thrown when all data is missing; record
                # the column for the fallback pass below unless the caller
                # asked for a hard failure
                if self.hyperparams["error_on_empty"]:
                    raise  # bare raise preserves the original traceback
                failures.append(c)

        # for columns that failed using the configured strategy, try again
        # using 'constant', which can always supply a fill value.  `failures`
        # is only ever non-empty when error_on_empty is False, so testing it
        # directly also skips the pointless imputer construction.
        if failures:
            imputer = CategoricalImputer(
                strategy="constant",
                fill_value=self.hyperparams["fill_value"],
                missing_values="",
                tie_breaking="first",
            )
            for f in failures:
                outputs_col = outputs.iloc[:, f]
                imputer.fit(outputs_col)
                result = imputer.transform(outputs_col)
                outputs.iloc[:, f] = result

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
# --- Example #2 (scraped-snippet separator) ---
# Load the Titanic test split; the train split (`titanic_train`) is assumed
# to be loaded earlier in the script -- TODO confirm.
titanic_test = pd.read_csv("titanic_test.csv")
titanic_test.shape

# Stack train and test so preprocessing sees all rows at once.
titanic_all = pd.concat([titanic_train, titanic_test])
titanic_all.shape
titanic_all.info()

# impute missing values for continuous features (mean strategy is the
# sklearn Imputer default)
imputable_cont_features = ['Age','Fare']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_all[imputable_cont_features])
titanic_all[imputable_cont_features] = cont_imputer.transform(titanic_all[imputable_cont_features])

# impute missing values for categorical features (most-frequent value)
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_all['Embarked'])
titanic_all['Embarked'] = cat_imputer.transform(titanic_all['Embarked'])

# FamilySize = siblings/spouses + parents/children + the passenger themself
titanic_all['FamilySize'] = titanic_all['SibSp'] +  titanic_all['Parch'] + 1

def convert_family_size(size):
    """Bucket a numeric family size into a categorical label.

    1 -> 'Single', 2-3 -> 'Small', 4-6 -> 'Medium', anything larger
    -> 'Large'.
    """
    if size == 1:
        return 'Single'
    if size <= 3:
        return 'Small'
    if size <= 6:
        return 'Medium'
    return 'Large'
# Derive a categorical family-size feature from the numeric one.
titanic_all['FamilyCategory'] = titanic_all['FamilySize'].map(convert_family_size)
# NOTE(review): hard-coded absolute Windows path -- portability hazard.
titanic_train = pd.read_csv(
    "C:/Users/tauseef.ur.rahman/Desktop/Python-Docs/Titanic/train.csv")
print(titanic_train.info())

# preprocessing stage
# impute missing values for continuous features
# NOTE(review): `preprocessing.Imputer` was removed from recent scikit-learn
# releases -- confirm the pinned version supports it.
imputable_cont_features = ['Age']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_train[imputable_cont_features])
print(cont_imputer.statistics_)
titanic_train[imputable_cont_features] = cont_imputer.transform(
    titanic_train[imputable_cont_features])

# impute missing values for categorical features (most-frequent value)
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_train['Embarked'])
print(cat_imputer.fill_)
titanic_train['Embarked'] = cat_imputer.transform(titanic_train['Embarked'])

# label-encode Embarked to integer codes
le_embarked = preprocessing.LabelEncoder()
le_embarked.fit(titanic_train['Embarked'])
print(le_embarked.classes_)
titanic_train['Embarked'] = le_embarked.transform(titanic_train['Embarked'])

# label-encode Sex to integer codes
le_sex = preprocessing.LabelEncoder()
le_sex.fit(titanic_train['Sex'])
print(le_sex.classes_)
titanic_train['Sex'] = le_sex.transform(titanic_train['Sex'])

# assemble the model feature matrix
features = ['Pclass', 'Parch', 'SibSp', 'Age', 'Fare', 'Embarked', 'Sex']
X_train = titanic_train[features]
# --- Example #4 (scraped-snippet separator) ---
        #return Weight
# Impute Item_Weight with the column mean.
print ('Orignal #missing: %d'%sum(data['Item_Weight'].isnull()))
#data['Item_Weight'] = data[['Item_Weight','Item_Identifier']].apply(impute_weight,axis=1).astype(float)
imputer= Imputer(missing_values='NaN',strategy='mean',axis=0)
imputer = imputer.fit(data[['Item_Weight']])
print(data[['Item_Weight']])
data[['Item_Weight']] = imputer.transform(data[['Item_Weight']])

print ('Final #missing: %d'%sum(data['Item_Weight'].isnull()))
       
#Imputing Outlet_Size missing values with the mode
#Determining the mode
# NOTE(review): `outlet_size_mode` is computed but never used below --
# the CategoricalImputer fills with the global mode instead.
outlet_size_mode= data.pivot_table(values='Outlet_Size',columns='Outlet_Type',aggfunc=lambda x:x.mode())
imputer1= CategoricalImputer(missing_values='NaN',strategy='most_frequent')
imputer1 = imputer1.fit(data['Outlet_Size'])
print(data['Outlet_Size'])
data[['Outlet_Size']] = imputer1.transform(data[['Outlet_Size']])
print(data['Outlet_Size'])

#Feature Engineering
#checking whether we should combine Outlet_Type or not

data.pivot_table(values='Item_Outlet_Sales',columns='Outlet_Type')

#values are significantly different so leave these

#Considering 0 item_visibility as missing we should impute missing values for these data
print(data[['Item_Visibility']])
#print ('Final #zeros: %d'%sum(data['Item_Visibility'] == 0))
# NOTE(review): result of `replace` is not assigned back and `inplace` is not
# set, so this line has no effect on `data` -- confirm intent.
data['Item_Visibility'].replace([data['Item_Visibility']==0],np.NaN)
# --- Example #5 (scraped-snippet separator) ---
#Import dataset
df_train = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
df_test = pd.read_csv('test_Y3wMUE5_7gLdaTN.csv')

# Split into feature matrices / target vector; column 0 (an ID column,
# presumably) is dropped and column 12 of the train file is the label --
# TODO confirm against the CSV schema.
X_train = df_train.iloc[:, 1:-1].values
y_train = df_train.iloc[:, 12].values
X_test = df_test.iloc[:, 1:].values

#Missing values
#--------------training set---------
from sklearn_pandas import CategoricalImputer

# Impute the three categorical columns (0, 1, 4) with their most
# frequent values.
imputer_train_cat = CategoricalImputer()
imputer_train_cat = imputer_train_cat.fit(X_train[:, [0, 1, 4]])
X_train[:, [0, 1, 4]] = imputer_train_cat.transform(X_train[:, [0, 1, 4]])

# Column 2 encodes dependents; convert the '3+' sentinel string to the
# number 3.  NOTE(review): 614 is hard-coded -- presumably the train row
# count; `range(len(X_train))` would be safer.
for i in range(0, 614):
    if X_train[:, 2][i] == '3+':
        X_train[:, 2][i] = 3
    else:
        continue

from sklearn.preprocessing import Imputer

# Mean-impute the numeric columns (2, 7, 8, 9).
imputer_train_num = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer_train_num = imputer_train_num.fit(X_train[:, [2, 7, 8, 9]])
X_train[:, [2, 7, 8, 9]] = imputer_train_num.transform(X_train[:,
                                                               [2, 7, 8, 9]])
# --- Example #6 (scraped-snippet separator) ---
def impute_categorical_features(df, features):
    """Fill missing values in the given categorical columns, in place.

    Fits a CategoricalImputer on ``df[features]``, prints the learned
    fill value(s), and writes the imputed result back into ``df``.
    """
    imputer = CategoricalImputer()
    imputer.fit(df[features])
    print(imputer.fill_)
    df[features] = imputer.transform(df[features])
import numpy as np
import pandas as pd

train = pd.read_csv("train_ctrUa4K.csv")
test = pd.read_csv("test_lAUu6dG.csv")

# Columns 1-11 are features, column 12 of the train file is the label --
# presumably column 0 is an ID; confirm against the CSV schema.
X_train = train.iloc[:, 1:12].values
y_train = train.iloc[:, 12].values
X_test = test.iloc[:, 1:12].values

# NOTE(review): one-hot encoding is applied BEFORE missing values are
# imputed below, and to the whole matrix at once -- the later Imputer and
# CategoricalImputer calls then operate on the already-encoded array.
# Confirm this ordering is intentional.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
onehotencoder = OneHotEncoder()
X_train = onehotencoder.fit_transform(X_train).toarray()
labelencoder_y = LabelEncoder()
y_train = labelencoder_y.fit_transform(y_train)

# Taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X_train[:, 1:12])
X_train[:, 1:12] = imputer.transform(X_train[:, 1:12])

from sklearn_pandas import CategoricalImputer
# NOTE(review): `X_train[:, ]` selects every column -- the trailing comma
# looks accidental; confirm which columns were meant.
categorical_imputer = CategoricalImputer(missing_values='NaN',
                                         strategy='most_frequent')
categorical_imputer = categorical_imputer.fit(X_train[:, ])
# --- Example #8 (scraped-snippet separator) ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("ChronicKidneyDiseaseFull.csv")

# Inspect how many nulls each column carries.
nulls_per_column = df.isnull().sum()
nulls_per_column

# Compare the cost of simply dropping incomplete rows vs columns.
df_delete_rows = df.dropna(axis=0)
df_delete_rows.shape

df_delete_columns = df.dropna(axis=1)
df_delete_columns.shape

# Split column names by dtype: object columns are treated as categorical,
# everything else as numerical.
categorical_variables_mask = df.dtypes == object
categorical_variables = df.columns[categorical_variables_mask]
numerical_variables = df.columns[~categorical_variables_mask]

from sklearn_pandas import CategoricalImputer
from sklearn.preprocessing import Imputer

# Median-impute the numerical columns (result kept separately; `copy=True`
# leaves `df` untouched).
numerical_imputer = Imputer(missing_values="NaN", strategy="median", copy=True)
numerical_imputer.fit(df[numerical_variables])
df_numerical_imputed = numerical_imputer.transform(df[numerical_variables])

# Most-frequent-impute the categorical columns; the transform presumably
# follows after this chunk -- view may be truncated here.
categorical_imputer = CategoricalImputer(missing_values="NaN")
categorical_imputer.fit(df[categorical_variables])