Code example #1
def test_not_fitted():
    """
    If imputer is not fitted, NotFittedError is raised.
    """
    imp = CategoricalImputer()
    with pytest.raises(NotFittedError):
        imp.transform(np.array(['a', 'b', 'b', None]))
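
For contrast, a minimal sketch of the fitted path, assuming the sklearn_pandas CategoricalImputer that later examples import (the expected result noted in the comment is an assumption, not part of the original test):

import numpy as np
from sklearn_pandas import CategoricalImputer

# Fitting first means transform() no longer raises NotFittedError.
imp = CategoricalImputer()
data = np.array(['a', 'b', 'b', None], dtype=object)
imp.fit(data)
print(imp.transform(data))  # should fill None with 'b', the most frequent value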
Code example #2
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:

        logger.debug(f"Running {__name__}")

        # determine columns to operate on
        cols = distil_utils.get_operating_columns(
            inputs, self.hyperparams["use_columns"], CATEGORICALS)

        logger.debug(f"Found {len(cols)} categorical columns to evaluate")

        if len(cols) == 0:
            return base.CallResult(inputs)

        imputer = CategoricalImputer(
            strategy=self.hyperparams["strategy"],
            fill_value=self.hyperparams["fill_value"],
            missing_values="",
            tie_breaking="first",
        )
        outputs = inputs.copy()
        failures: List[int] = []
        for c in cols:
            input_col = inputs.iloc[:, c]
            try:
                imputer.fit(input_col)
                result = imputer.transform(input_col)
                outputs.iloc[:, c] = result
            except ValueError as e:
                # value error gets thrown when all data is missing
                if not self.hyperparams["error_on_empty"]:
                    failures.append(c)
                else:
                    raise e

        # for columns that failed using 'most_frequent' try again using 'constant'
        if not self.hyperparams["error_on_empty"]:
            imputer = CategoricalImputer(
                strategy="constant",
                fill_value=self.hyperparams["fill_value"],
                missing_values="",
                tie_breaking="first",
            )
            for f in failures:
                outputs_col = outputs.iloc[:, f]
                imputer.fit(outputs_col)
                result = imputer.transform(outputs_col)
                outputs.iloc[:, f] = result

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
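
The mode-then-constant fallback above can be illustrated outside the primitive's framework; a simplified sketch in plain pandas (this is not the primitive's actual code path, and it uses NaN rather than the empty-string missing marker the primitive is configured with):

import pandas as pd

def impute_with_fallback(df, columns, fill_value="unknown"):
    # Fill each column with its mode; fall back to a constant when the
    # column has no observed values at all, mirroring the second pass above.
    out = df.copy()
    for col in columns:
        mode = out[col].mode(dropna=True)
        if mode.empty:
            out[col] = fill_value
        else:
            out[col] = out[col].fillna(mode.iloc[0])
    return out

# Hypothetical usage with made-up data
frame = pd.DataFrame({"color": ["red", None, "red"], "shape": [None, None, None]})
print(impute_with_fallback(frame, ["color", "shape"]))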
Code example #3
def imputacion_variable_delegacion(X_train, X_test):
    " Esta funcion imputa la variable 'delegacion_inicio' con la moda "

    #For the training set
    X = X_train.delegacion_inicio.values.reshape(X_train.shape[0], 1)
    delegacionInicio_imputer = CategoricalImputer(strategy='most_frequent')
    X_train['delegacion_inicio'] = delegacionInicio_imputer.fit_transform(X)

    #For the test set
    X = X_test.delegacion_inicio.values.reshape(X_test.shape[0], 1)
    X_test['delegacion_inicio'] = delegacionInicio_imputer.transform(X)

    return X_train, X_test
Code example #4
File: stacking.py  Project: Gani024/Gani024
titanic_test.shape

titanic_all = pd.concat([titanic_train, titanic_test])
titanic_all.shape
titanic_all.info()

#impute missing values for continuous features
imputable_cont_features = ['Age','Fare']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_all[imputable_cont_features])
titanic_all[imputable_cont_features] = cont_imputer.transform(titanic_all[imputable_cont_features])

#impute missing values for categorical features
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_all['Embarked'])
titanic_all['Embarked'] = cat_imputer.transform(titanic_all['Embarked'])

titanic_all['FamilySize'] = titanic_all['SibSp'] + titanic_all['Parch'] + 1

def convert_family_size(size):
    if size == 1:
        return 'Single'
    elif size <= 3:
        return 'Small'
    elif size <= 6:
        return 'Medium'
    else:
        return 'Large'
titanic_all['FamilyCategory'] = titanic_all['FamilySize'].map(convert_family_size)

def extract_title(name):
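
The excerpt is cut off inside extract_title; for Titanic-style names ("Braund, Mr. Owen Harris") a typical implementation looks like the sketch below (an assumption, not the project's code):

def extract_title(name):
    # "Braund, Mr. Owen Harris" -> "Mr"
    return name.split(',')[1].split('.')[0].strip()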
Code example #5
print(titanic_train.info())

#preprocessing stage
#impute missing values for continuous features
imputable_cont_features = ['Age']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_train[imputable_cont_features])
print(cont_imputer.statistics_)
titanic_train[imputable_cont_features] = cont_imputer.transform(
    titanic_train[imputable_cont_features])

#impute missing values for categorical features
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_train['Embarked'])
print(cat_imputer.fill_)
titanic_train['Embarked'] = cat_imputer.transform(titanic_train['Embarked'])

le_embarked = preprocessing.LabelEncoder()
le_embarked.fit(titanic_train['Embarked'])
print(le_embarked.classes_)
titanic_train['Embarked'] = le_embarked.transform(titanic_train['Embarked'])

le_sex = preprocessing.LabelEncoder()
le_sex.fit(titanic_train['Sex'])
print(le_sex.classes_)
titanic_train['Sex'] = le_sex.transform(titanic_train['Sex'])

features = ['Pclass', 'Parch', 'SibSp', 'Age', 'Fare', 'Embarked', 'Sex']
X_train = titanic_train[features]
y_train = titanic_train['Survived']
#create an instance of decision tree classifier type
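
The excerpt stops at the classifier comment; the step it announces would typically look like the sketch below (an assumption about what follows, not the original code):

from sklearn import tree

# Create and fit a decision tree classifier on the preprocessed features
dt = tree.DecisionTreeClassifier()
dt.fit(X_train, y_train)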
Code example #6
#data['Item_Weight'] = data[['Item_Weight','Item_Identifier']].apply(impute_weight,axis=1).astype(float)
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(data[['Item_Weight']])
print(data[['Item_Weight']])
data[['Item_Weight']] = imputer.transform(data[['Item_Weight']])

print('Final #missing: %d' % sum(data['Item_Weight'].isnull()))
       
#Imputing Outlet_Size missing values with the mode
#Determining the mode

outlet_size_mode = data.pivot_table(values='Outlet_Size', columns='Outlet_Type', aggfunc=lambda x: x.mode())
imputer1 = CategoricalImputer(missing_values='NaN', strategy='most_frequent')
imputer1 = imputer1.fit(data['Outlet_Size'])
print(data['Outlet_Size'])
data[['Outlet_Size']] = imputer1.transform(data[['Outlet_Size']])
print(data['Outlet_Size'])

#Feature Engineering
#checking whether we should combine Outlet_Type or not

data.pivot_table(values='Item_Outlet_Sales',columns='Outlet_Type')

#the values differ significantly, so keep the outlet types separate

#Treat zero Item_Visibility as missing and impute those values
print(data[['Item_Visibility']])
#print ('Final #zeros: %d'%sum(data['Item_Visibility'] == 0))
data['Item_Visibility'] = data['Item_Visibility'].replace(0, np.nan)
imputer2 = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer2 = imputer2.fit(data[['Item_Visibility']])
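
The outlet_size_mode pivot above is computed but never used; below is a sketch (not the snippet's code) of imputing Outlet_Size with the mode of its own Outlet_Type group in plain pandas, with an assumed 'Unknown' fallback for groups that are entirely missing:

import pandas as pd

def impute_outlet_size_by_type(frame):
    # Fill missing Outlet_Size with the most common size within each
    # Outlet_Type group instead of one global mode.
    out = frame.copy()
    per_type_mode = out.groupby('Outlet_Type')['Outlet_Size'].agg(
        lambda s: s.mode().iloc[0] if not s.mode().empty else 'Unknown')
    missing = out['Outlet_Size'].isnull()
    out.loc[missing, 'Outlet_Size'] = out.loc[missing, 'Outlet_Type'].map(per_type_mode)
    return out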
Code example #7
#Import dataset
df_train = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
df_test = pd.read_csv('test_Y3wMUE5_7gLdaTN.csv')

#splitting dataset into training and test set
X_train = df_train.iloc[:, 1:-1].values
y_train = df_train.iloc[:, 12].values
X_test = df_test.iloc[:, 1:].values

#Missing values
#--------------training set---------
from sklearn_pandas import CategoricalImputer

imputer_train_cat = CategoricalImputer()
imputer_train_cat = imputer_train_cat.fit(X_train[:, [0, 1, 4]])
X_train[:, [0, 1, 4]] = imputer_train_cat.transform(X_train[:, [0, 1, 4]])

for i in range(X_train.shape[0]):
    if X_train[:, 2][i] == '3+':
        X_train[:, 2][i] = 3

from sklearn.preprocessing import Imputer

imputer_train_num = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer_train_num = imputer_train_num.fit(X_train[:, [2, 7, 8, 9]])
X_train[:, [2, 7, 8, 9]] = imputer_train_num.transform(X_train[:,
                                                               [2, 7, 8, 9]])

#--------------test set------------------
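
The snippet cuts off at the test-set marker. One possible continuation, reusing the imputers fitted on the training data so that no test-set statistics leak in (a sketch only, assuming the test file shares the training file's column layout; not the project's actual code):

X_test[:, [0, 1, 4]] = imputer_train_cat.transform(X_test[:, [0, 1, 4]])

# Replace the '3+' category, as done for the training set
for i in range(X_test.shape[0]):
    if X_test[:, 2][i] == '3+':
        X_test[:, 2][i] = 3

X_test[:, [2, 7, 8, 9]] = imputer_train_num.transform(X_test[:, [2, 7, 8, 9]])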
Code example #8
def impute_categorical_features(df, features):
    #impute missing values for categorical features
    cat_imputer = CategoricalImputer()
    cat_imputer.fit(df[features])
    print(cat_imputer.fill_)
    df[features] = cat_imputer.transform(df[features])
Code example #9
titanic_train = pd.read_csv(
    'C:\\Users\\tauseef.ur.rahman\\Desktop\\Python-Docs\\Titanic\\train.csv')
print(titanic_train.info())

#Continous Imputer
cont_impute_feature = ['Age', 'Fare']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_train[cont_impute_feature])
titanic_train[cont_impute_feature] = cont_imputer.transform(
    titanic_train[cont_impute_feature])

#Categorical Imputer
Cat_imputer = CategoricalImputer()
Cat_imputer.fit(titanic_train['Embarked'])
titanic_train['Embarked'] = Cat_imputer.transform(titanic_train['Embarked'])

#label Encoding
le_embarked = preprocessing.LabelEncoder()
le_embarked.fit(titanic_train['Embarked'])
titanic_train['Embarked'] = le_embarked.transform(titanic_train['Embarked'])

le_Sex = preprocessing.LabelEncoder()
le_Sex.fit(titanic_train['Sex'])
titanic_train['Sex'] = le_Sex.transform(titanic_train['Sex'])

le_Pclass = preprocessing.LabelEncoder()
le_Pclass.fit(titanic_train['Pclass'])
titanic_train['Pclass'] = le_Pclass.transform(titanic_train['Pclass'])

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
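
Note that preprocessing.Imputer was removed in scikit-learn 0.22. A sketch of a modern equivalent of the two imputation steps above using sklearn.impute.SimpleImputer, which also handles string columns with strategy='most_frequent' (it reuses the titanic_train frame and cont_impute_feature list from the snippet; not the original code):

from sklearn.impute import SimpleImputer

# Mean-impute the continuous features
cont_imputer = SimpleImputer(strategy='mean')
titanic_train[cont_impute_feature] = cont_imputer.fit_transform(
    titanic_train[cont_impute_feature])

# Mode-impute the categorical feature
cat_imputer = SimpleImputer(strategy='most_frequent')
titanic_train[['Embarked']] = cat_imputer.fit_transform(titanic_train[['Embarked']])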