def pipeline(number_clusters=8):
    """Assemble an unfitted preprocessing + KMeans clustering pipeline.

    Two feature branches are built and joined with ``make_union``:

    * ``DTypeSelector('number')`` -> ``CorrFilterHighTotalCorrelation`` ->
      ``KNNImputer`` (5 neighbours) -> ``StandardScaler``
    * ``DTypeSelector('object')`` -> ``SimpleImputer`` (most frequent value)
      -> ``HashingEncoder`` (48 components)

    The union is followed by a ``KMeans`` step with a fixed random state.

    :param number_clusters: value passed to ``KMeans`` as ``n_clusters``.
    :return: the pipeline object produced by ``make_pipeline``.
    """
    number_steps = [
        DTypeSelector('number'),
        CorrFilterHighTotalCorrelation(),
        KNNImputer(n_neighbors=5),
        StandardScaler(),
    ]
    number_branch = make_pipeline(*number_steps)

    object_steps = [
        DTypeSelector('object'),
        SimpleImputer(strategy='most_frequent'),
        HashingEncoder(n_components=48),
    ]
    object_branch = make_pipeline(*object_steps)

    # Both branches see the same input; their outputs are combined by the union.
    combined_features = make_union(number_branch, object_branch)

    kmeans_step = KMeans(
        n_clusters=number_clusters,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=0,
    )
    return make_pipeline(combined_features, kmeans_step)
def encode(df1, df2):
    """Fit a column-wise preprocessor on ``df1`` and apply it to both frames.

    Column groups are derived from ``df1`` only:

    * ``int64``/``float64`` columns: imputed with the most frequent value.
    * object columns listed in ``onehot_cat``: missing values replaced by the
      constant ``'missing'``, then one-hot encoded with the first level dropped.
    * all remaining object columns: missing values replaced by ``'missing'``,
      then hash-encoded with ``HashingEncoder`` defaults.

    The preprocessor is fitted on ``df1`` and only applied (not refitted) to
    ``df2``.

    :param df1: DataFrame used for fitting; must contain every column named
        in ``onehot_cat``.
    :param df2: DataFrame transformed with the preprocessor fitted on ``df1``.
    :return: tuple ``(df1_encoded, df2_encoded)`` of DataFrames built from the
        transformed arrays; column labels are the default integer positions.
    """
    numlist = df1.select_dtypes(include=["int64", "float64"]).columns
    objlist = df1.select_dtypes(include="object").columns
    onehot_cat = [
        "category_id", "category_name", "address_city", "diet", "size",
        "storage_temp", "weekday", "quarter",
    ]
    # Object columns that are not one-hot encoded get hash-encoded instead.
    diff = [x for x in objlist if x not in onehot_cat]

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(drop='first')),
    ])
    categorical_transformer2 = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', HashingEncoder()),
    ])
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numlist),
            ('cat', categorical_transformer, onehot_cat),
            ('cat2', categorical_transformer2, diff),
        ],
        # OneHotEncoder emits a sparse matrix, so with the default threshold
        # the stacked result may be sparse; pd.DataFrame() cannot build a
        # proper numeric frame from that.  Force a dense ndarray.
        sparse_threshold=0,
    )

    encoded_1 = pd.DataFrame(preprocessor.fit_transform(df1))
    encoded_2 = pd.DataFrame(preprocessor.transform(df2))
    return encoded_1, encoded_2
def Hashing_Encoding(self, n_components: int = 8):
    """
    Hash encoding: maps an arbitrary number of variables onto a given number
    of variables according to a fixed rule. Feature hashing may cause
    collisions between features. The size and complexity of the hashing
    encoder do not grow as the number of categories in the data grows.

    :param n_components: number of bits used to represent the features
    :return: None; the configured encoder is stored in ``self.encoder``
    """
    hashing_encoder = HashingEncoder(cols=self.cols, n_components=n_components)
    self.encoder = hashing_encoder
def model(estimator, df1):
    """Wrap ``estimator`` in a pipeline that preprocesses columns by dtype.

    Column groups are taken from ``df1``: ``int64``/``float64`` columns are
    imputed with the most frequent value; the object columns named in the
    fixed one-hot list are filled with ``'missing'`` and one-hot encoded
    (first level dropped); every other object column is filled with
    ``'missing'`` and hash-encoded.

    :param estimator: final step of the returned pipeline (named
        ``'classifier'``).
    :param df1: DataFrame used only to determine the column groups.
    :return: an unfitted ``Pipeline`` with steps ``'preprocessor'`` and
        ``'classifier'``.
    """
    numeric_columns = df1.select_dtypes(include=["int64", "float64"]).columns
    object_columns = df1.select_dtypes(include="object").columns

    onehot_columns = [
        "category_id",
        "category_name",
        "address_city",
        "diet",
        "size",
        "storage_temp",
        "weekday",
        "quarter",
    ]
    # Remaining object columns are routed to the hashing encoder.
    hashed_columns = []
    for column in object_columns:
        if column not in onehot_columns:
            hashed_columns.append(column)

    onehot_branch = Pipeline(steps=[
        ('imputer1', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder1', OneHotEncoder(drop='first')),
    ])
    hashing_branch = Pipeline(steps=[
        ('imputer2', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder2', HashingEncoder()),
    ])
    numeric_branch = Pipeline(steps=[
        ('imputer3', SimpleImputer(strategy='most_frequent')),
    ])

    column_preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_branch, numeric_columns),
        ('cat', onehot_branch, onehot_columns),
        ('cat2', hashing_branch, hashed_columns),
    ])

    return Pipeline(steps=[
        ('preprocessor', column_preprocessor),
        ('classifier', estimator),
    ])
def transform(X):
    """Hash-encode ``X`` and return the encoded values with axes 0 and 1 swapped.

    A new ``HashingEncoder`` is constructed and fitted on every call, using
    ``self.hash_method`` and ``self.n`` as its configuration.

    :param X: input passed to ``HashingEncoder.fit_transform``.
    :return: array obtained by swapping the first two axes of the encoded
        values.
    """
    # NOTE(review): `self` is not a parameter of this function; it is
    # presumably captured from an enclosing method's scope -- confirm.
    hasher = HashingEncoder(hash_method=self.hash_method, n_components=self.n)
    encoded = hasher.fit_transform(X)
    encoded_values = encoded.values
    return np.swapaxes(encoded_values, 0, 1)
clf.fit(Xtrain, ytrain) print "RuleListClassifier Accuracy:", clf.score( Xtest, ytest), "Learned interpretable model:\n", clf ############################################################################### try: from category_encoders import HashingEncoder except: raise Exception( "Please install category_encoders (pip install category_encoders) for comparing mixed data with Random Forests!" ) from sklearn import pipeline cat_indices = [] for ft in range(len(columns)): if hepatitis_df.columns[ft] not in clf.discretized_features: cat_indices.append(ft) ppl = pipeline.Pipeline([ ('encoder', HashingEncoder(cols=[ 'LIVER_BIG', 'ANTIVIRALS', 'HISTOLOGY', 'SEX', 'STEROID', 'MALAISE', 'FATIGUE', 'SPIDERS', 'VARICES', 'LIVER_FIRM', 'SPLEEN_PALPABLE', 'ASCITES', 'ANOREXIA' ])), ('clf', RandomForestClassifier()) ]) print "RandomForestClassifier Accuracy:", ppl.fit(Xtrain, ytrain).score(Xtest, ytest)
# Apply the already-fitted one-hot encoder to the test split.
data_ohe_test = onehot_enc.transform(X_test)
print('Features after OHE: \n', data_ohe_train.shape[1])

######### Logistic Regression
onehot_logit_score = get_score(logit, data_ohe_train, y_train, data_ohe_test, y_test)
print('Logistic Regression score with One hot encoding:', onehot_logit_score)

######### Random Forest
onehot_rf_score = get_score(rf, data_ohe_train, y_train, data_ohe_test, y_test)
# Report the random-forest score (previously the logistic-regression score
# was printed here by mistake).
print('Random Forest score with One hot encoding:', onehot_rf_score)

###################################################################################################
######### Apply Hashing Encoding
from category_encoders import HashingEncoder

hashing_enc = HashingEncoder(n_components=10000, cols=X_Columns)
hashing_enc.fit(X_train, y_train)
print('Original number of features: \n', X_train.shape[1], "\n")

# The index is reset on both splits before transforming.
X_train_hashing = hashing_enc.transform(X_train.reset_index(drop=True))
X_test_hashing = hashing_enc.transform(X_test.reset_index(drop=True))
# Label corrected: this is the feature count after hashing, not after OHE.
print('Features after Hashing encoding: \n', X_train_hashing.shape[1])

######### Logistic Regression
hashing_logit_score = get_score(logit, X_train_hashing, y_train, X_test_hashing, y_test)
print('Logistic Regression score with Hashing encoding:', hashing_logit_score)

######### Random Forest
hashing_rf_score = get_score(rf, X_train_hashing, y_train, X_test_hashing, y_test)
############################################################################### Xtrain, Xtest, ytrain, ytest = train_test_split(hepatitis_df, y) # split # train classifier (allow more iterations for better accuracy) clf = RuleListClassifier(max_iter=10000, class1label="survival", verbose=False) clf.fit(Xtrain, ytrain) print "RuleListClassifier Accuracy:", clf.score(Xtest, ytest), "Learned interpretable model:\n", clf ############################################################################### try: from category_encoders import HashingEncoder except: raise Exception("Please install category_encoders (pip install category_encoders) for comparing mixed data with Random Forests!") from sklearn import pipeline ppl = pipeline.Pipeline([ ('encoder', HashingEncoder(cols=['LIVER_BIG', 'ANTIVIRALS', 'HISTOLOGY', 'SEX', 'STEROID', 'MALAISE', 'FATIGUE', 'SPIDERS', 'VARICES', 'LIVER_FIRM', 'SPLEEN_PALPABLE', 'ASCITES', 'ANOREXIA'])), ('clf', RandomForestClassifier()) ]) # back to dataframes (for HashingEncoder) Xtrain = pd.DataFrame(Xtrain) Xtrain.columns = hepatitis_df.columns Xtest = pd.DataFrame(Xtest) Xtest.columns = hepatitis_df.columns print "RandomForestClassifier Accuracy:", ppl.fit(Xtrain, ytrain).score(Xtest, ytest)