# Imports for this example (DTypeSelector and CorrFilterHighTotalCorrelation
# are project-specific transformers assumed to be defined elsewhere):
from sklearn.cluster import KMeans
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from category_encoders import HashingEncoder


def pipeline(number_clusters=8):

    # Numeric columns: select, drop highly correlated features, impute, scale.
    numerical_pipeline = make_pipeline(
        DTypeSelector('number'),
        CorrFilterHighTotalCorrelation(),
        KNNImputer(n_neighbors=5),
        StandardScaler()
    )

    # Object (string) columns: fill missing values with the mode, then hash
    # the categories into 48 components.
    object_pipeline = make_pipeline(
        DTypeSelector('object'),
        SimpleImputer(strategy='most_frequent'),
        HashingEncoder(n_components=48)
    )

    return make_pipeline(
        make_union(
            numerical_pipeline,
            object_pipeline,
        ),
        KMeans(n_clusters=number_clusters,
               init='k-means++',
               n_init=10,
               max_iter=300,
               random_state=0)
    )
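
# DTypeSelector and CorrFilterHighTotalCorrelation are project-specific
# transformers not shown here. A minimal sketch of what a DTypeSelector-style
# transformer implied by the usage above (select columns by dtype kind) might
# look like, as an assumption for illustration rather than the original class:
from sklearn.base import BaseEstimator, TransformerMixin

class DTypeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype                         # e.g. 'number' or 'object'

    def fit(self, X, y=None):
        return self                                # nothing to learn

    def transform(self, X):
        # keep only the columns whose dtype matches the requested kind
        return X.select_dtypes(include=self.dtype)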
Example #2
# Imports for this example:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from category_encoders import HashingEncoder


def encode(df1, df2):

    # Split the training columns by dtype.
    numlist = df1.select_dtypes(include=["int64", "float64"]).columns
    objlist = df1.select_dtypes(include="object").columns
    # Known low-cardinality columns are one-hot encoded; every other object
    # column falls through to the hashing encoder.
    onehot_cat = [
        "category_id", "category_name", "address_city", "diet", "size",
        "storage_temp", "weekday", "quarter"
    ]
    diff = [x for x in objlist if x not in onehot_cat]

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(drop='first'))
    ])

    categorical_transformer2 = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', HashingEncoder())
    ])

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numlist),
        ('cat', categorical_transformer, onehot_cat),
        ('cat2', categorical_transformer2, diff)
    ])

    df1 = preprocessor.fit_transform(df1)
    df2 = preprocessor.transform(df2)

    df1 = pd.DataFrame(df1)
    df2 = pd.DataFrame(df2)

    return df1, df2
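
# Usage sketch for encode() (the frame names below are illustrative
# assumptions): the preprocessing is fitted on the first frame and only
# applied to the second, so train and test come back with identical columns.
# train_enc, test_enc = encode(train_df, test_df)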
Example #3
 def Hashing_Encoding(self, n_components: int = 8):
     """
     Hashing encoding: maps any number of categorical levels onto a given
     number of output columns according to a hashing rule. Feature hashing can
     cause collisions between features. The size and complexity of the hashing
     encoder do not grow as the number of categories grows.
     :param n_components: number of columns (bits) used to represent the features
     :return:
     """
     self.encoder = HashingEncoder(cols=self.cols,
                                   n_components=n_components)
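
# A small standalone sketch of the point made in the docstring above: the
# hashed output width is fixed by n_components, no matter how many distinct
# categories the column contains (the toy data is illustrative only).
import pandas as pd
from category_encoders import HashingEncoder

demo = pd.DataFrame({'color': ['red', 'green', 'blue', 'purple', 'red']})
hashed = HashingEncoder(cols=['color'], n_components=8).fit_transform(demo)
print(hashed.shape)  # (5, 8): eight hashed columns regardless of cardinality
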
def model(estimator, df1):
    numlist = df1.select_dtypes(include=["int64", "float64"]).columns
    objlist = df1.select_dtypes(include="object").columns
    onehot_cat = ["category_id", "category_name", "address_city", "diet", "size", "storage_temp", "weekday", "quarter"]
    diff = [x for x in objlist if x not in onehot_cat]
    
    categorical_transformer = Pipeline(steps=[
        ('imputer1', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder1', OneHotEncoder(drop='first'))
    ])
    categorical_transformer2 = Pipeline(steps=[
        ('imputer2', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder2', HashingEncoder())
    ])
    numeric_transformer = Pipeline(steps=[
        ('imputer3', SimpleImputer(strategy='most_frequent'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numlist),
        ('cat', categorical_transformer, onehot_cat),
        ('cat2', categorical_transformer2, diff)
    ])
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', estimator)])
    
    
    return clf
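
# Usage sketch for model(): wrap any scikit-learn estimator behind the shared
# preprocessing (the frames, targets and estimator below are illustrative
# assumptions, not part of the original example).
# from sklearn.ensemble import RandomForestClassifier
# clf = model(RandomForestClassifier(random_state=0), train_df)
# clf.fit(train_df, y_train)
# print(clf.score(test_df, y_test))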
Example #5
 def transform(X):
     # Hash X with a fresh HashingEncoder on each call (the hash is
     # deterministic, so no fitted state is needed), then transpose so the
     # result has shape (n_components, n_samples).
     new_encoder = HashingEncoder(hash_method=self.hash_method, n_components=self.n)
     return np.swapaxes(new_encoder.fit_transform(X).values, 0, 1)
clf.fit(Xtrain, ytrain)

print("RuleListClassifier Accuracy:", clf.score(Xtest, ytest),
      "Learned interpretable model:\n", clf)

###############################################################################

try:
    from category_encoders import HashingEncoder
except ImportError:
    raise Exception(
        "Please install category_encoders (pip install category_encoders) for comparing mixed data with Random Forests!"
    )
from sklearn import pipeline

cat_indices = []
for ft in range(len(columns)):
    if hepatitis_df.columns[ft] not in clf.discretized_features:
        cat_indices.append(ft)

ppl = pipeline.Pipeline([
    ('encoder',
     HashingEncoder(cols=[
         'LIVER_BIG', 'ANTIVIRALS', 'HISTOLOGY', 'SEX', 'STEROID', 'MALAISE',
         'FATIGUE', 'SPIDERS', 'VARICES', 'LIVER_FIRM', 'SPLEEN_PALPABLE',
         'ASCITES', 'ANOREXIA'
     ])), ('clf', RandomForestClassifier())
])

print("RandomForestClassifier Accuracy:",
      ppl.fit(Xtrain, ytrain).score(Xtest, ytest))
Example #7
data_ohe_test = onehot_enc.transform(X_test)
print('Features after OHE: \n', data_ohe_train.shape[1])

######### Logistic Regression
onehot_logit_score = get_score(logit, data_ohe_train, y_train, data_ohe_test,
                               y_test)
print('Logistic Regression score with One hot encoding:', onehot_logit_score)

######### Random Forest
onehot_rf_score = get_score(rf, data_ohe_train, y_train, data_ohe_test, y_test)
print('Random Forest score with One hot encoding:', onehot_rf_score)

###################################################################################################
######### Apply Hashing Encoding
from category_encoders import HashingEncoder
hashing_enc = HashingEncoder(n_components=10000, cols=X_Columns)
hashing_enc.fit(X_train, y_train)

print('Original number of features: \n', X_train.shape[1], "\n")
X_train_hashing = hashing_enc.transform(X_train.reset_index(drop=True))
X_test_hashing = hashing_enc.transform(X_test.reset_index(drop=True))
print('Features after hashing encoding: \n', X_train_hashing.shape[1])

######### Logistic Regression
hashing_logit_score = get_score(logit, X_train_hashing, y_train,
                                X_test_hashing, y_test)
print('Logistic Regression score with Hashing encoding:', hashing_logit_score)

######### Random Forest
hashing_rf_score = get_score(rf, X_train_hashing, y_train, X_test_hashing,
                             y_test)
print('Random Forest score with Hashing encoding:', hashing_rf_score)
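
# get_score() is not shown in this excerpt; a minimal sketch of a helper with
# the call signature used above might be (an assumption, not the original
# implementation):
def get_score(estimator, X_train, y_train, X_test, y_test):
    estimator.fit(X_train, y_train)            # fit on the training split
    return estimator.score(X_test, y_test)     # accuracy on the held-out split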
###############################################################################

Xtrain, Xtest, ytrain, ytest = train_test_split(hepatitis_df, y) # split

# train classifier (allow more iterations for better accuracy)
clf = RuleListClassifier(max_iter=10000, class1label="survival", verbose=False)
clf.fit(Xtrain, ytrain)

print("RuleListClassifier Accuracy:", clf.score(Xtest, ytest), "Learned interpretable model:\n", clf)

###############################################################################

try:
    from category_encoders import HashingEncoder
except ImportError:
    raise Exception("Please install category_encoders (pip install category_encoders) for comparing mixed data with Random Forests!")
from sklearn import pipeline

ppl = pipeline.Pipeline([
    ('encoder', HashingEncoder(cols=['LIVER_BIG', 'ANTIVIRALS', 'HISTOLOGY', 'SEX', 'STEROID', 'MALAISE', 'FATIGUE', 'SPIDERS', 'VARICES', 'LIVER_FIRM', 'SPLEEN_PALPABLE', 'ASCITES', 'ANOREXIA'])),
    ('clf', RandomForestClassifier())
])

# back to dataframes (for HashingEncoder)
Xtrain = pd.DataFrame(Xtrain)
Xtrain.columns = hepatitis_df.columns
Xtest = pd.DataFrame(Xtest)
Xtest.columns = hepatitis_df.columns

print("RandomForestClassifier Accuracy:", ppl.fit(Xtrain, ytrain).score(Xtest, ytest))