def test_ordinal_encoder(X):
    enc = OrdinalEncoder()
    exp = np.array([[0, 1, 0],
                    [1, 0, 0]], dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp.astype('float64'))
    enc = OrdinalEncoder(dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp)
def test_ordinal_encoder_raise_missing(X):
    ohe = OrdinalEncoder()
    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit_transform(X)

    ohe.fit(X[:1, :])

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.transform(X)
def test_ordinal_encoder_inverse():
    X = [['abc', 2, 55], ['def', 1, 55]]
    enc = OrdinalEncoder()
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape('Shape of the passed X data is not correct')
    assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OrdinalEncoder(categories=cats)
    exp = np.array([[0.], [1.]])
    assert_array_equal(enc.fit_transform(X), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)
def get_neighborhood_colors(column: str = "Neighborhood"):
    # Could also do column = 'MSZoning'
    x: pd.Series = features[column]
    enc = OrdinalEncoder()
    colors = enc.fit_transform(x.to_numpy().reshape([len(x), 1]))
    return colors
                          columns=housing_num.columns, index=housing_num.index)

# The housing_cat variable is the only categorical variable, so the book examines it
# and then converts it into a one-hot encoding of the various choices. I think it
# would be better to make it ordinal.

# In[55]:

housing[["ocean_proximity"]].head(10)

# In[56]:

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing[["ocean_proximity"]])

# In[57]:

housing_cat_encoded[:10]

# In[58]:

ordinal_encoder.categories_

# There's no easy way to change the categories after fitting: you either renumber the
# encoded values yourself, or pass an explicit `categories` list when constructing the
# encoder (see the sketch below).
#
# The book really wants us to use a one-hot array with individual values for each of
# the categories.

# In[59]:
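# Added sketch, referenced above: OrdinalEncoder accepts an explicit category order
# if you want control over the integer assignment. This assumes the five
# ocean_proximity values present in this dataset; adjust the list to taste.

ordered_cats = [['ISLAND', 'NEAR OCEAN', 'NEAR BAY', '<1H OCEAN', 'INLAND']]
custom_encoder = OrdinalEncoder(categories=ordered_cats)
# Categories are now numbered 0..4 in the order given above, not alphabetically.
housing_cat_custom = custom_encoder.fit_transform(housing[["ocean_proximity"]])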
# Since we already stored the incomplete rows in "sample_incomplete_rows", we're just
# checking to ensure those values were replaced with the median.
# Recall: ".loc" locates values in a Pandas DataFrame <-- see documentation
print(housing_tr.loc[sample_incomplete_rows.index.values])

# NOTE: For pushing "bare" repo to GitHub:
# $ git remote add origin https://github.com/MSilberberg0619/Machine_Learning_Practice.git

# "ocean_proximity" was left out because it's a text attribute, so the median can't be
# computed. To fix this, convert these categories from text to numbers using
# Scikit-Learn's OrdinalEncoder class.
housing_cat = housing[["ocean_proximity"]]
print(housing_cat.head(10))

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
print(housing_cat_encoded)

# One-hot encoding gives each category its own binary attribute, so the model doesn't
# assume some natural ordering in the data --> an assumed ordering could result in
# poor performance or unexpected results.
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
print(housing_cat_1hot)

housing_cat_1hot.toarray()
print(housing_cat_1hot)

# List of categories using the encoder's categories_ instance variable
print(cat_encoder.categories_)
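# A hedged sketch of inspecting the one-hot output above as a labeled DataFrame.
# Assumes a scikit-learn version that provides get_feature_names_out (>= 1.0);
# older releases used get_feature_names instead.
import pandas as pd

housing_cat_1hot_df = pd.DataFrame(
    housing_cat_1hot.toarray(),
    columns=cat_encoder.get_feature_names_out(["ocean_proximity"]),
    index=housing_cat.index,
)
print(housing_cat_1hot_df.head())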
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
housing_tr

### Data Type Conversion
# - Categorical to Ordinal
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(categories='auto')
ocean_proximity_cat = housing[["ocean_proximity"]]
ocean_proximity_cat.head(10)

ocean_proximity_ordinal = ordinal_encoder.fit_transform(ocean_proximity_cat)
ocean_proximity_ordinal[:10]
ordinal_encoder.categories_
pd.DataFrame(ocean_proximity_ordinal).value_counts()

# - One-hot encoding
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder()
ocean_proximity_onehot = onehot_encoder.fit_transform(ocean_proximity_cat)
ocean_proximity_onehot
ocean_proximity_onehot.toarray()
    for i in range(len(agg_labels)):
        if agg_labels[i] != focus_label:
            agg_labels[i] = "ANOTHER"
    print(agg_labels)
    return agg_labels


agg_labels_train = redifine_labels(agg_labels_train, focus_label)
agg_labels_dev = redifine_labels(agg_labels_dev, focus_label)

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder_train = OrdinalEncoder()
# Note: OrdinalEncoder expects a 2D array; a plain 1D list of labels would need
# np.array(agg_labels_train).reshape(-1, 1) first.
agg_labels_train_encoded = ordinal_encoder_train.fit_transform(agg_labels_train)

#%%
print(agg_labels_train_encoded[:10])
print(ordinal_encoder_train.categories_)

ordinal_encoder_dev = OrdinalEncoder()
agg_labels_dev_encoded = ordinal_encoder_dev.fit_transform(agg_labels_dev)

#%%
print(agg_labels_dev_encoded[:10])
print(ordinal_encoder_dev.categories_)

#%%
from time import time
def main():
    path = '../data/persons'
    data = pd.read_csv(f'{path}/person_data_clean.csv', header=0)

    cat_cols = [
        'person_type', 'trafficway_type', 'manner_of_collision', 'body_type',
        'seating_position', 'ejection', 'safety_equipment_use'
    ]
    binary_cols = ['sex', 'land_use_urban', 'rollover', 'air_bag_deployed']
    numeric_cols = ['age']

    data[cat_cols] = data[cat_cols].apply(lambda x: x.astype('category'))

    labels = data['fatality']
    features = data[cat_cols + binary_cols + numeric_cols]
    # features = pd.get_dummies(features, columns=cat_cols)
    # features.rename(columns={'manner_of_collision_Not Collision with Motor Vehicle in Transport (Not Necessarily in Transport for\n2005-2009)': 'manner_of_collision_Not Collision with Motor Vehicle in Transport'},
    #                 inplace=True)
    feature_names = features.columns

    oe = OrdinalEncoder()
    features = oe.fit_transform(features)

    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=0.2,
                                                        random_state=2020)

    print('Class Balance')
    print(y_test.value_counts())
    print()

    models = {
        'Random Forest': (RandomForestClassifier(n_estimators=100,
                                                 min_samples_leaf=5,
                                                 class_weight='balanced',
                                                 random_state=2020), 'rf'),
        'Logistic Regression': (LogisticRegressionCV(cv=5, scoring='f1',
                                                     class_weight='balanced',
                                                     max_iter=500,
                                                     random_state=2020), 'lr')
    }

    for name, (model, suffix) in models.items():
        print(name)
        print('-' * 20)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_probs = model.predict_proba(X_test)[:, 1]
        utils.print_metrics(y_test, y_pred)
        utils.roc_curve(y_test, y_probs, name, suffix)
        utils.feature_importance(model, feature_names, name, suffix)
        utils.permutation_importances(model, X_test, y_test, feature_names, name, suffix)
        # utils.permutation_importances(model, X_train, y_train, feature_names, name, suffix + '_ohe', dataset='train')
        print('#' * 50)
from typing import List, Tuple


def encode_categoricals(
        data: pd.DataFrame,
        group_cols: List[str]) -> Tuple[pd.DataFrame, OrdinalEncoder]:
    enc = OrdinalEncoder()
    data[group_cols] = enc.fit_transform(data[group_cols].values)
    return data, enc
scale_mapper = {"Low": 1, "Medium": 2, "High": 3} #특성을 정수로 변환 dataframe["Score"].replace(scale_mapper) dataframe = pd.DataFrame({ "Score": ["Low", "Low", "Medium", "Medium", "High", "Barely More Than Medium"] }) scale_mapper = {"Low": 1, "Medium": 2, "Barely More Than Medium": 3, "High": 4} dataframe["Score"].replace(scale_mapper) scale_mapper = { "Low": 1, "Medium": 2, "Barely More Than Medium": 2.1, "High": 3 } dataframe["Score"].replace(scale_mapper) from sklearn.preprocessing import OrdinalEncoder features = np.array([["Low", 10], ["High", 50], ["Medium", 3]]) ordinal_encoder = OrdinalEncoder() ordinal_encoder.fit_transform(features) ordinal_encoder.categories_
# Shuffling users
user_vector = shuffled_ratings['user_emb_id'].values
print('Users:', user_vector, ', shape =', user_vector.shape)

# Shuffling movies
movie_vector = shuffled_ratings['movie_emb_id'].values
print('Movies:', movie_vector, ', shape =', movie_vector.shape)

# Shuffling ratings
rating_vector = shuffled_ratings['rating'].values
print('Ratings:', rating_vector, ', shape =', rating_vector.shape)

enc = OrdinalEncoder()
ratings[['age_desc', 'occ_desc']] = enc.fit_transform(ratings[['age_desc', 'occ_desc']])

features = ratings[['age_desc', 'occ_desc', 'movie_id']].values
labels = ratings[['rating']].values

model = RandomForestRegressor()
model.fit(features, labels)

# Show the RMSE (note: computed on the training data itself)
y_pred = model.predict(features)
val_loss = np.sqrt(metrics.mean_squared_error(labels, y_pred))
print('Minimum RMSE {:f}'.format(val_loss))

save_obj(enc, model_path, 'feature_encoder')
save_obj(model, model_path, 'rf_recommender')
# %% [markdown]
# ## Encoding ordinal categories
#
# The most intuitive strategy is to encode each category with a different
# number. The `OrdinalEncoder` will transform the data in such a manner.
# We will start by encoding a single column to understand how the encoding
# works.

# %%
from sklearn.preprocessing import OrdinalEncoder

education_column = data_categorical[["education"]]

encoder = OrdinalEncoder()
education_encoded = encoder.fit_transform(education_column)
education_encoded

# %% [markdown]
# We see that each category in `"education"` has been replaced by a numeric
# value. We can check the mapping between the categories and the numerical
# values by inspecting the fitted attribute `categories_`.

# %%
encoder.categories_

# %% [markdown]
# Now, we can check the encoding applied to all categorical features.

# %%
data_encoded = encoder.fit_transform(data_categorical)
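# %% [markdown]
# A quick added sketch, assuming the `encoder` and `data_encoded` just computed:
# `inverse_transform` applies the `categories_` mapping in reverse and recovers
# the original strings from the codes.

# %%
encoder.inverse_transform(data_encoded[:5])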
    i for i, column in enumerate(train.columns)
    if not is_numeric_dtype(train[column])
]

# Encode categorical data
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value",
                                 unknown_value=np.nan)

X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

# Temporarily replace missing values with the string "NaN" so the encoder treats
# them as an ordinary category (a standalone sketch of the unknown-value behaviour
# follows this block)...
for col in categorical:
    X_train[col].fillna("NaN", inplace=True)
    X_test[col].fillna("NaN", inplace=True)

X_train[categorical] = ordinal_encoder.fit_transform(X_train[categorical])
X_test[categorical] = ordinal_encoder.transform(X_test[categorical])

# ...then restore np.nan wherever the original frames had missing values.
X_train[X_train_copy.isnull()] = np.nan
X_test[X_test_copy.isnull()] = np.nan

# Tune max_iter (number of trees)
params = {
    "categorical_features": cat_indices,
    "scoring": "neg_root_mean_squared_error"
}
max_iters_param_grid = {"max_iter": range(20, 120, 10)}
hgbr_1 = HistGradientBoostingRegressor(**params)
g_search = GridSearchCV(hgbr_1, max_iters_param_grid)
_ = g_search.fit(X_train, y_train)
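# A minimal standalone sketch of the unknown-category behaviour relied on above:
# with handle_unknown="use_encoded_value", categories never seen during fit are
# mapped to unknown_value instead of raising.
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

demo_enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
demo_enc.fit([["red"], ["green"]])
print(demo_enc.transform([["red"], ["blue"]]))  # [[1.], [nan]]: "blue" was never seen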
print(label)                        # inspect the resulting labels
print(le.fit_transform(y))          # fit_transform also works in a single step
print(le.inverse_transform(label))  # inverse_transform reverses the encoding

# In[]:
from sklearn.preprocessing import OrdinalEncoder

# .values is needed here: a pandas Series has no .reshape of its own
y = data_.iloc[:, -1].values.reshape(-1, 1)

# The categories_ attribute plays exactly the same role as LabelEncoder's classes_
enc = OrdinalEncoder()
enc.fit(y)
print(enc.categories_)

data_.iloc[:, -1] = enc.transform(y)
data_.iloc[:, 1:-1] = enc.fit_transform(data_.iloc[:, 1:-1])  # in a single step

# In[]:
from sklearn.preprocessing import OneHotEncoder

X = data_.iloc[:, 1:-1]

enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()

# fit_transform could still do this in one step; it is written in three steps here
# to show the fitted attributes of the model
OneHotEncoder(categories='auto').fit_transform(X).toarray()

# the encoding can still be inverted
pd.DataFrame(enc.inverse_transform(result))
def main():
    np.set_printoptions(threshold=10)  # Ndarray display threshold to avoid hiding some columns

    print('HOUSING_PATH=', HOUSING_PATH)
    print('HOUSING_URL=', HOUSING_URL)

    fetch_housing_data(HOUSING_URL, HOUSING_PATH)
    print('After fetch_housing_data')
    housing = load_housing_data(HOUSING_PATH)
    print('After load_housing_data')
    print(housing.head())

    # INFO statement
    print("\nINFO statement:")
    print(housing.info())

    # Value counts
    print("\nValue counts:")
    print(housing["ocean_proximity"].value_counts())

    # "describe" statement for summary
    print("\nDESCRIBE statement:")
    print(housing.describe())

    # Plot data
    #housing.hist(bins=50, figsize=(20, 15))
    #plt.show()

    # Test set sampling - random vs stratification
    housing["income_cat"] = pd.cut(housing["median_income"],
                                   bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf],
                                   labels=[1, 2, 3, 4, 5])
    housing["income_cat"].hist()
    #plt.show()

    # Random test set
    rand_train_set, rand_test_set = train_test_split(housing, test_size=0.2,
                                                     random_state=42)

    # Stratification of data
    print("\nStratify housing data:")
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    print(split.split(housing, housing["income_cat"]))
    print(len(list(split.split(housing, housing["income_cat"]))))
    ic = 0
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        ic += 1
        print("ic = ", ic)
        print(len(train_index), train_index)
        print(len(test_index), test_index)
        #sys.exit()
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
    strat_full_set = housing

    #rts = (rand_test_set["income_cat"].value_counts()/len(rand_test_set)).sort_index()
    #sts = (strat_test_set["income_cat"].value_counts()/len(strat_test_set)).sort_index()
    #sfs = (strat_full_set["income_cat"].value_counts()/len(strat_full_set)).sort_index()
    #print('rand_test: \n{0}'.format(rts))
    #print('strat_test: \n{0}'.format(sts))
    #print('strat_full: \n{0}'.format(sfs))

    # Separate predictors and labels
    print("\nRevert training set:")
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()
    housing_cat = housing[["ocean_proximity"]]
    housing_cat_head = housing_cat.head(10)
    print("housing_cat.head(10) = {}".format(housing_cat.head(10)))
    #print("housing_cat.head(10) = {}".format(housing_cat_head))

    # Simple imputer
    imputer = SimpleImputer(strategy="median")
    housing_num_only = housing.drop("ocean_proximity", axis=1)
    imputer.fit(housing_num_only)
    print("imputer.statistics_ = {0}".format(imputer.statistics_))
    print("housing_num_only.median() = {0}".format(housing_num_only.median()))
    X = imputer.transform(housing_num_only)
    housing_tr = pd.DataFrame(X, columns=housing_num_only.columns)
    print('housing_tr.info() : ')
    print(housing_tr.info())

    '''
    Encoding
    '''
    # Ordinal encoder : replaces categorical attributes with numbers.
    # The issue with this method is the implied "distance" between the numerical values.
    print("\nOrdinal encoder:")
    ordinal_encoder = OrdinalEncoder()
    housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
    print("housing_cat_encoded = {0}".format(housing_cat_encoded[:10]))
    print("ordinal_encoder.categories_ = {0}".format(ordinal_encoder.categories_))

    # One-hot encoder: splits the categories apart and labels each only 0 or 1.
    # This avoids the "distance" problem of the ordinal encoder.
    # Output is a SciPy sparse matrix. Use toarray() to convert to a NumPy array.
    print("\nOne-hot encoder:")
    cat_encoder = OneHotEncoder()
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
    print("housing_cat_1hot = {0}".format(housing_cat_1hot))
    print("housing_cat_1hot.toarray() = {0}".format(housing_cat_1hot.toarray()))

    # Attribute adder
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    # Transformation pipeline
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    print("num_pipeline = {0}".format(type(num_pipeline)))
    housing_num_tr = num_pipeline.fit_transform(housing_num_only)
    print("housing_num_tr = {0}".format(housing_num_tr))

    num_attribs = list(housing_num_only)
    cat_attribs = ["ocean_proximity"]

    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    housing_prepared = full_pipeline.fit_transform(housing)

    return
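# A small added sketch of the "distance" issue mentioned above, assuming the
# alphabetical order OrdinalEncoder produces for this dataset ('<1H OCEAN' -> 0,
# 'INLAND' -> 1, ..., 'NEAR OCEAN' -> 4): codes 0 and 1 look four times closer than
# codes 0 and 4, although no such ordering exists between the categories, while any
# two distinct one-hot rows are equidistant.
import numpy as np

codes = np.array([0., 1., 4.])
print(abs(codes[0] - codes[1]), abs(codes[0] - codes[2]))  # 1.0 vs 4.0: spurious ordering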
df = df[cat_cols].astype("category")
print("Revised column data types ", df.columns)

# revalue column values
# df['salary'] = df['salary'].map({'>50K': "above_50K", '<=50K': "less_equal_50K"})
# df['salary'].replace(to_replace={'>50K': 'above_50K', '<=50K': 'les_eq_50K'}, inplace=True)
# df['salary'].replace('>50K', 'above_50K', inplace=True)
# df['salary'].replace('<=50K', 'less_eq_50K', inplace=True)
# df['salary'].replace(['>50K', '<=50K'], ['above_50K', 'less_eq_50K'], inplace=True)
# print(df.head())

# check for highly correlated variables

# use ordinal encoder to convert categorical to numbers
enc = OrdinalEncoder()
df[cat_cols] = enc.fit_transform(df[cat_cols])
print(df.head())
print(enc.categories_)

# Model Building
# train_set, validate_set, test_set = train_validate_test_split(df)
# print("Train set: ", train_set.shape)
# print("Test set: ", test_set.shape)
# print("Validate set: ", validate_set.shape)
#
# print("Dataframe cols: ", df.columns)
# X_train_data = train_set[['workclass', 'education', 'marital-status', 'occupation',
#                           'relationship', 'race', 'sex', 'native-country']].copy()
# y_train_label = train_set[['salary']].copy()
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
print(X_train_minmax)
print("mean", np.mean(X_train_minmax, axis=0))
print("SD", np.std(X_train_minmax, axis=0))

X_test = np.array([[-3., -1., 4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax

# Task 3
import pandas as pd
import numpy as np

# convert categorical features to a numerical representation
X = pd.DataFrame(
    np.array([
        'M', 'O-', 'medium',
        'M', 'O-', 'high',
        'F', 'O+', 'high',
        'F', 'AB', 'low',
        'F', 'B+', 'NA'
    ]).reshape((5, 3)))
X.columns = ['sex', 'blood_type', 'edu_level']
print(X)

from sklearn.preprocessing import OrdinalEncoder  # requires scikit-learn >= 0.20: check your version!

encoder = OrdinalEncoder()
X.edu_level = encoder.fit_transform(X.edu_level.values.reshape(-1, 1))
print(X)
def compute_spatial_distribution(
    test_features,
    test_labs_true,
    test_labs_pred,
    base_features,
    base_labs,
    numerical_dist_metric=None,
    categorical_dist_metric=None,
    summary="mean",
):
    """Compute a summary of the pairwise distances between points.

    This computes pairwise distances between the test points and base points
    in feature space (i.e. how far each test point is from each base point),
    and returns a summary of the distance for each test point relative to
    each base class.

    Parameters
    ----------
    test_features : DataFrame
        Feature values for the test dataset.
    test_labs_true : Series
        True labels for the test dataset.
    test_labs_pred : Series
        Labels predicted by a model for the test dataset.
    base_features : DataFrame
        Feature values for the base dataset.
    base_labs : Series
        True labels for the base dataset.
    numerical_dist_metric : dict
        The metrics to use to measure distance between numerical
        (continuous)-valued columns. This should be a dict mapping column
        names to strings, each a named metric as accepted by
        `sklearn.metrics.pairwise_distances` appropriate for continuous data.
    categorical_dist_metric : dict
        The metrics to use to measure distance between categorical
        (discrete)-valued columns. This should be a dict mapping column
        names to strings, each a named metric as accepted by
        `sklearn.metrics.pairwise_distances` appropriate for discrete data.
    summary : str
        An aggregation function to apply to a pandas GroupBy object.

    Only columns listed in the distance metric dicts will be included in the
    distance computation.

    Returns
    -------
    SpatialDistributionResult
    """
    # Compute a DF of pairwise distances between base datapoints (rows)
    # and test datapoints (cols).
    pairwise_dist = None
    if numerical_dist_metric:
        # Normalize numeric features to reduce the effect of different scales
        num_cols = list(numerical_dist_metric.keys())
        scaler = StandardScaler()
        base_scaled = DataFrame(
            scaler.fit_transform(base_features[num_cols]), columns=num_cols
        )
        test_scaled = DataFrame(
            scaler.transform(test_features[num_cols]), columns=num_cols
        )
        pairwise_dist = _pairwise_dist(base_scaled, test_scaled, numerical_dist_metric)
    if categorical_dist_metric:
        categ_cols = list(categorical_dist_metric.keys())
        encoder = OrdinalEncoder()
        base_encoded = DataFrame(
            encoder.fit_transform(base_features[categ_cols]), columns=categ_cols
        )
        test_encoded = DataFrame(
            encoder.transform(test_features[categ_cols]), columns=categ_cols
        )
        pairwise_dist_categ = _pairwise_dist(
            base_encoded, test_encoded, categorical_dist_metric
        )
        if pairwise_dist is None:
            pairwise_dist = pairwise_dist_categ
        else:
            pairwise_dist += pairwise_dist_categ

    df_dist = DataFrame(
        pairwise_dist,
        index=base_labs.index,
        columns=test_labs_true.index,
    )

    # Summarize distances within each base dataset class separately for each
    # test datapoint.
    # Result is a m x k DF with 1 row for each test datapoint and 1 column for
    # each base class.
    df_summ = df_dist.groupby(base_labs).agg(summary).transpose()
    # Add the test labels to the index for easy reference.
    df_summ = df_summ.set_index(
        MultiIndex.from_arrays([test_labs_true, test_labs_pred, df_summ.index])
    )

    return SpatialDistributionResult(
        vals=df_summ,
        dist_metrics_num=numerical_dist_metric,
        dist_metrics_categ=categorical_dist_metric,
        summary=summary,
    )
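# The helper _pairwise_dist is defined elsewhere in this module. A minimal sketch of
# what it plausibly does, assuming one sklearn.metrics.pairwise_distances call per
# column with that column's metric, summed across columns (hypothetical
# reconstruction, not the project's actual implementation):
import numpy as np
from sklearn.metrics import pairwise_distances


def _pairwise_dist_sketch(base_df, test_df, metrics_by_col):
    total = None
    for col, metric in metrics_by_col.items():
        # n_base x n_test distances for this single column
        d = pairwise_distances(
            base_df[[col]].to_numpy(), test_df[[col]].to_numpy(), metric=metric
        )
        total = d if total is None else total + d
    return total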
class NumericTransformer(object):
    """General purpose numeric conversion for pandas dataframes.

    All categorical data and levels must be passed to .fit().
    If new categorical series or levels are present in .transform(), it won't work!

    Currently datetimes cannot be inverse_transformed back to datetime.

    Args:
        na_strings (list): list of strings to replace as pd.NA
        categorical_fillna (str): how to fill NaN for categorical variables
            (numeric NaN are unaltered)
            "ffill" - uses forward and backward filling to supply na values
            "indicator" or anything else currently results in all missing
            replaced with str "missing_value"
        handle_unknown (str): passed through to scikit-learn OrdinalEncoder
        verbose (int): greater than 0 to print some messages
    """

    def __init__(
        self,
        na_strings: list = ['', ' '],  # 'NULL', 'NA', 'NaN', 'na', 'nan'
        categorical_fillna: str = "ffill",
        handle_unknown: str = 'use_encoded_value',
        verbose: int = 0,
    ):
        self.na_strings = na_strings
        self.verbose = verbose
        self.categorical_fillna = categorical_fillna
        self.handle_unknown = handle_unknown
        self.categorical_flag = False
        self.needs_transformation = True

    def _fit(self, df):
        """Fit categorical to numeric."""
        # test if any columns aren't numeric
        if not isinstance(df, pd.DataFrame):
            # basically just Series inputs
            df = pd.DataFrame(df)
        if df.shape[1] == df.select_dtypes(include=np.number).shape[1]:
            self.needs_transformation = False
            if self.verbose > 2:
                print("All data is numeric, skipping NumericTransformer")
        if self.needs_transformation:
            # replace some common nan datatypes from strings to nan
            df.replace(self.na_strings, np.nan, inplace=True)  # pd.NA in future
            # convert series to numeric which can be readily converted.
            df = df.apply(pd.to_numeric, errors='ignore')

            # record which columns are which dtypes
            self.column_order = df.columns
            self.numeric_features = df.select_dtypes(
                include=[np.number]
            ).columns.tolist()
            self.categorical_features = list(
                set(df.columns.tolist()) - set(self.numeric_features)
            )
            if len(self.categorical_features) > 0:
                self.categorical_flag = True
            if self.categorical_flag:
                from sklearn.preprocessing import OrdinalEncoder

                df_enc = df[self.categorical_features]
                if self.categorical_fillna == "ffill":
                    df_enc = df_enc.fillna(method='ffill').fillna(method='bfill')
                df_enc = df_enc.fillna('missing_value')

                self.cat_transformer = OrdinalEncoder(
                    handle_unknown=self.handle_unknown, unknown_value=np.nan
                )
                # the + 1 makes it compatible with remove_leading_zeroes
                df_enc = self.cat_transformer.fit_transform(df_enc) + 1
                # df_enc = self.cat_transformer.transform(df_enc) + 1
                self.cat_max = df_enc.max(axis=0)
                self.cat_min = df_enc.min(axis=0)
                if self.verbose > 0:
                    print("Categorical features converted to numeric")
                df = pd.concat(
                    [
                        pd.DataFrame(
                            df[self.numeric_features], columns=self.numeric_features
                        ),
                        pd.DataFrame(
                            df_enc, columns=self.categorical_features, index=df.index
                        ),
                    ],
                    axis=1,
                )[self.column_order]
        return df.astype(float)

    def fit(self, df):
        """Learn behavior of data to change.

        Args:
            df (pandas.DataFrame): input dataframe
        """
        self._fit(df)
        return self

    def fit_transform(self, df):
        """Fits and Returns *Magical* DataFrame.

        Args:
            df (pandas.DataFrame): input dataframe
        """
        return self._fit(df)

    def transform(self, df):
        """Convert categorical dataset to numeric."""
        if self.needs_transformation:
            if not isinstance(df, pd.DataFrame):
                df = pd.DataFrame(df)
            df.replace(self.na_strings, np.nan, inplace=True)
            df = df.apply(pd.to_numeric, errors='ignore')
            if self.categorical_flag:
                df_enc = (df[self.categorical_features]).fillna(method='ffill')
                df_enc = df_enc.fillna(method='bfill').fillna('missing_value')
                df_enc = self.cat_transformer.transform(df_enc) + 1
                df = pd.concat(
                    [
                        pd.DataFrame(
                            df[self.numeric_features], columns=self.numeric_features
                        ),
                        pd.DataFrame(
                            df_enc, columns=self.categorical_features, index=df.index
                        ),
                    ],
                    axis=1,
                )[self.column_order]
        try:
            df = df.astype(float)
        except ValueError as e:
            raise ValueError(
                f"NumericTransformer.transform() could not convert data to float. {str(e)}."
            )
        return df

    def inverse_transform(self, df, convert_dtypes: bool = False):
        """Convert numeric back to categorical.

        Args:
            df (pandas.DataFrame): df
            convert_dtypes (bool): whether to use pd.convert_dtypes after inverse
        """
        if self.categorical_flag:
            if not isinstance(df, pd.DataFrame):
                # basically just Series inputs
                df = pd.DataFrame(df)
            df_enc = (
                df[self.categorical_features].clip(
                    upper=self.cat_max, lower=self.cat_min, axis=1
                )
                - 1
            )
            df_enc = self.cat_transformer.inverse_transform(df_enc)
            df = pd.concat(
                [
                    pd.DataFrame(
                        df[self.numeric_features], columns=self.numeric_features
                    ),
                    pd.DataFrame(
                        df_enc, columns=self.categorical_features, index=df.index
                    ),
                ],
                axis=1,
            )[self.column_order]
        if convert_dtypes:
            df = df.convert_dtypes()
        return df
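# A minimal usage sketch for the class above (assuming pandas/numpy are imported as
# pd/np in this module): a mixed-type frame in, an all-float frame out, and
# inverse_transform restores the strings.
demo = pd.DataFrame({'temp': [1.0, 2.0, 3.0], 'color': ['red', 'blue', 'red']})
nt = NumericTransformer(verbose=1)
demo_num = nt.fit_transform(demo)            # 'color' becomes 1.0/2.0 codes (+1 offset)
demo_back = nt.inverse_transform(demo_num)   # codes mapped back to 'red'/'blue'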
from sklearn import tree

# Test w/ Iris dataset using my class
dataset = load_iris()
X, y = dataset.data, dataset.target
clf_iris = Decision_Tree(max_depth=5)

# Test to make target class strings instead of integers
y = ["one" if val == 1 or val == 2 else "zero" for val in y]
y = np.array(y)

# Need to ordinally encode strings to integers
if "int" not in str(y.dtype):
    # Reshape y array so it works w/ ordinal encoder
    y = y.reshape(-1, 1)
    encoder = OrdinalEncoder()
    y = encoder.fit_transform(y)
    y = y.astype(int)
    y = y.reshape(y.size,)

clf_iris.fit(X, y)

temp1 = np.array([[3, 2, 1, .5]])
temp2 = np.array([[4, 2.9, 1.3, .2]])
temp3 = np.array([[3.8, 3, 1.4, .4]])
temp4 = np.array([[7.7, 2.8, 6.7, 2]])

# temp1
print("------------------------------------------------------")
print(f"My Iris prediction for {temp1}:\n", clf_iris.predict(temp1))
print("------------------------------------------------------")

# Test w/ Iris dataset using sklearn
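# Side note on the ordinal encoding block above, offered as an alternative sketch
# rather than a required change: for a 1-D target like y, scikit-learn's LabelEncoder
# does the same string-to-integer mapping without the reshape dance that
# OrdinalEncoder (which expects 2-D feature arrays) needs.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_codes = label_encoder.fit_transform(["zero", "one", "one", "zero"])  # e.g. [1, 0, 0, 1]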
def cluster_clients(k=None, save_centroids=True, save_clusters=True):
    '''
    Runs k-prototypes clustering algorithm on preprocessed dataset
    :param k: Desired number of clusters
    :param save_centroids: Boolean indicating whether to save cluster centroids
    :param save_clusters: Boolean indicating whether to save client cluster assignments
    :return: A KPrototypes object that describes the best clustering of all the runs
    '''
    cfg = yaml.full_load(open(os.getcwd() + "/config.yml", 'r'))

    # Load preprocessed client data
    try:
        client_df = pd.read_csv(cfg['PATHS']['CLIENT_DATA'])
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['CLIENT_DATA'] +
              ". Running preprocessing of client data.")
        raw_df = load_raw_data(cfg)
        client_df = prepare_for_clustering(cfg, raw_df, save_df=False)
    excluded_feats = cfg['K-PROTOTYPES']['FEATS_TO_EXCLUDE']
    client_df.drop(excluded_feats, axis=1, inplace=True)  # Features we don't want to see in clustering
    client_feats_df = client_df.copy()
    client_ids = client_df.pop('CONTRACT_ACCOUNT').tolist()
    cat_feats = [f for f in cfg['DATA']['CATEGORICAL_FEATS'] if f not in excluded_feats]
    bool_feats = [f for f in cfg['DATA']['BOOLEAN_FEATS'] if f not in excluded_feats]
    ordinal_encoder = OrdinalEncoder()
    client_df[cat_feats] = ordinal_encoder.fit_transform(client_df[cat_feats])
    X = np.array(client_df)

    # Get list of categorical feature indices. Boolean feats are considered categorical for clustering
    cat_feat_idxs = [client_df.columns.get_loc(c) for c in cat_feats + bool_feats if c in client_df]
    numcl_feat_idxs = [i for i in range(len(client_df.columns)) if i not in cat_feat_idxs]

    # Normalize noncategorical features
    X_noncat = X[:, numcl_feat_idxs]
    std_scaler = StandardScaler().fit(X_noncat)
    X_noncat = std_scaler.transform(X_noncat)
    X[:, numcl_feat_idxs] = X_noncat

    # Run k-prototypes algorithm on all clients and obtain cluster assignment (range [1, K]) for each client
    if k is None:
        k = cfg['K-PROTOTYPES']['K']
    k_prototypes = KPrototypes(n_clusters=k, verbose=1,
                               n_init=cfg['K-PROTOTYPES']['N_RUNS'],
                               n_jobs=cfg['K-PROTOTYPES']['N_JOBS'],
                               init='Cao', num_dissim=euclidean_dissim,
                               cat_dissim=matching_dissim)
    client_clusters = k_prototypes.fit_predict(X, categorical=cat_feat_idxs)
    k_prototypes.samples = X
    k_prototypes.labels = client_clusters
    k_prototypes.dist = lambda x0, x1: \
        k_prototypes.num_dissim(np.expand_dims(x0[numcl_feat_idxs], axis=0),
                                np.expand_dims(x1[numcl_feat_idxs], axis=0)) + \
        k_prototypes.gamma * k_prototypes.cat_dissim(np.expand_dims(x0[cat_feat_idxs], axis=0),
                                                     np.expand_dims(x1[cat_feat_idxs], axis=0))
    client_clusters += 1  # Enforce that cluster labels are integer range of [1, K]
    clusters_df = pd.DataFrame({'CONTRACT_ACCOUNT': client_ids, 'Cluster Membership': client_clusters})
    clusters_df = clusters_df.merge(client_feats_df, on='CONTRACT_ACCOUNT', how='left')
    clusters_df.set_index('CONTRACT_ACCOUNT')  # (note: result is unassigned, so clusters_df is unchanged here)

    # Get centroids of clusters
    cluster_centroids = np.empty((k_prototypes.cluster_centroids_[0].shape[0],
                                  k_prototypes.cluster_centroids_[0].shape[1] +
                                  k_prototypes.cluster_centroids_[1].shape[1]))
    cluster_centroids[:, numcl_feat_idxs] = k_prototypes.cluster_centroids_[0]  # Numerical features
    cluster_centroids[:, cat_feat_idxs] = k_prototypes.cluster_centroids_[1]    # Categorical features

    # Scale noncategorical features of the centroids back to original range
    centroid_noncat_feats = cluster_centroids[:, numcl_feat_idxs]
    centroid_noncat_feats = std_scaler.inverse_transform(centroid_noncat_feats)
    cluster_centroids[:, numcl_feat_idxs] = centroid_noncat_feats

    # Create a DataFrame of cluster centroids
    centroids_df = pd.DataFrame(cluster_centroids, columns=list(client_df.columns))
    for i in range(len(cat_feats)):
        ordinal_dict = {j: ordinal_encoder.categories_[i][j]
                        for j in range(len(ordinal_encoder.categories_[i]))}
        centroids_df[cat_feats[i]] = centroids_df[cat_feats[i]].map(ordinal_dict)
    centroids_df[bool_feats] = centroids_df[bool_feats].round()
    cluster_num_series = pd.Series(np.arange(1, cluster_centroids.shape[0] + 1))
    centroids_df.insert(0, 'Cluster', cluster_num_series)

    # Get fraction of clients in each cluster
    cluster_freqs = np.bincount(client_clusters) / float(client_clusters.shape[0])
    centroids_df.insert(1, '% of Clients', cluster_freqs[1:] * 100)

    # Save centroid features and cluster assignments to spreadsheet
    if save_centroids:
        centroids_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CENTROIDS'] +
                            datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                            index_label=False, index=False)
    if save_clusters:
        clusters_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CLUSTERS'] +
                           datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                           index_label=False, index=False)
    return k_prototypes
print('Removing redundant columns:', redundant_columns)
print('Removing useless targets:', other_targets)
print('Removing misc columns:', misc_columns)

columns_to_remove = redundant_columns + other_targets + misc_columns
df.drop(axis='columns', columns=columns_to_remove, inplace=True)

###############################################################################
### Remove NaN columns (with a lot of NaN values)
df, log = remove_nan_columns(df, 1 / 2, verbose=False)
print(log)

###############################################################################
### Encode categorical features
print('Encoding categorical features (ordinal encoding).')
# Note: each fit_transform below refits the same encoder, so my_encoder only
# retains the categories of the last column ('state') afterwards. A sketch of
# encoding all columns with a single fit follows this block.
my_encoder = OrdinalEncoder()
df['flgs'] = my_encoder.fit_transform(df['flgs'].values.reshape(-1, 1))
df['proto'] = my_encoder.fit_transform(df['proto'].values.reshape(-1, 1))
df['sport'] = my_encoder.fit_transform(df['sport'].astype(str).values.reshape(-1, 1))
df['dport'] = my_encoder.fit_transform(df['dport'].astype(str).values.reshape(-1, 1))
df['state'] = my_encoder.fit_transform(df['state'].values.reshape(-1, 1))
print('Objects:', list(df.select_dtypes(['object']).columns))

# In[5]:
###############################################################################
## Quick sanity check
###############################################################################
display_general_information(df)
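# A hedged sketch of the single-fit alternative flagged above, meant to replace the
# per-column calls rather than run after them: one encoder fitted on all five columns
# keeps a per-column mapping in categories_ and supports inverse_transform later.
cat_columns = ['flgs', 'proto', 'sport', 'dport', 'state']
df[['sport', 'dport']] = df[['sport', 'dport']].astype(str)
single_encoder = OrdinalEncoder()
df[cat_columns] = single_encoder.fit_transform(df[cat_columns])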
df = pd.read_csv(filename, encoding='gbk')
# print(df.columns.tolist())

# 2. Feature engineering
# 1) Identify/select features
intCols = ['年龄', '收入', '家庭人数', '开通月数']
catCols = ['居住地', '婚姻状况', '教育水平', '性别']

target = '套餐类型'
y = df[target]

# 2) Digitize the categorical variables
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(dtype='int')
X_ = enc.fit_transform(df[catCols])
dfCats = pd.DataFrame(X_, columns=catCols)

# 3) Merge
X = pd.concat([dfCats, df[intCols]], axis=1)
cols = X.columns.tolist()

# 3. Train the model
from xgboost import XGBClassifier

model = XGBClassifier(
    learning_rate=0.01,
    # n_estimators=3000,
    max_depth=4,
    min_child_weight=5,
def transform(self, X):
    X = pd.DataFrame(X, columns=self.column_names)
    # Caution: fitting a fresh encoder inside transform() means each call derives
    # its own category-to-integer mapping from the incoming batch, so codes are
    # not consistent across calls (see the sketch below).
    enc = OrdinalEncoder()
    X[self.categorical_cols] = enc.fit_transform(X[self.categorical_cols])
    return X
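# A hedged sketch of the usual fix, assuming this method lives on a
# scikit-learn-style transformer with a fit() step: fit the encoder once, reuse it
# in transform(). Hypothetical restructuring, not the original author's code.
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder


class OrdinalColumnEncoder:
    def __init__(self, column_names, categorical_cols):
        self.column_names = column_names
        self.categorical_cols = categorical_cols

    def fit(self, X, y=None):
        X = pd.DataFrame(X, columns=self.column_names)
        self.enc_ = OrdinalEncoder()
        self.enc_.fit(X[self.categorical_cols])  # learn the mapping once
        return self

    def transform(self, X):
        X = pd.DataFrame(X, columns=self.column_names)
        # reuse the fitted mapping, so codes stay consistent across calls
        X[self.categorical_cols] = self.enc_.transform(X[self.categorical_cols])
        return X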
csv_save = os.path.join("..", "data", "LINKED_DATA", "TSR_ALL", "TSR_ALL1", "TSR_ALL1_y_TRAIN.csv")
pd.DataFrame(tsr_train_y).to_csv(csv_save, index=False)

csv_save = os.path.join("..", "data", "LINKED_DATA", "TSR_ALL", "TSR_ALL1", "TSR_ALL1_y_VALIDATION.csv")
pd.DataFrame(tsr_validation_y).to_csv(csv_save, index=False)

csv_save = os.path.join("..", "data", "LINKED_DATA", "TSR_ALL", "TSR_ALL1", "TSR_ALL1_y_TEST.csv")
pd.DataFrame(tsr_test_y).to_csv(csv_save, index=False)

## scale G_X_train
scaler = MinMaxScaler()
tsr_train_x[continuous] = scaler.fit_transform(tsr_train_x[continuous])

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=99)
tsr_train_x[ordinal_features] = encoder.fit_transform(tsr_train_x[ordinal_features])

ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
nominal_train = ohe.fit_transform(tsr_train_x[nominal_features])
tsr_train_x = pd.concat([tsr_train_x, pd.DataFrame(nominal_train)], axis=1)
tsr_train_x = tsr_train_x.drop(nominal_features, axis=1)
tsr_train_x.columns = column_names

csv_save = os.path.join("..", "data", "LINKED_DATA", "TSR_ALL", "TSR_ALL1", "TSR_ALL1_X_TRAIN.csv")
tsr_train_x.to_csv(csv_save, index=False)

## scale G_X_validation
tsr_validation_x[continuous] = scaler.transform(tsr_validation_x[continuous])
tsr_validation_x[ordinal_features] = encoder.transform(tsr_validation_x[ordinal_features])
"Temperature", axis=1) # Opción 2, eliminamos el atributo que contiene valores nulos mean_temp = dataframe["Temperature"].mean() dataframe_op3 = dataframe["Temperature"].fillna( mean_temp) # Opción 3, asignamos el valor medio en los valores nulos """Iniciamos el preprocesamiento de los atributos con valores de texto""" color_cat = dataframe[['Color']] spectral_cat = dataframe[['Spectral_Class']] print(color_cat.head(10)) print(spectral_cat.head(10)) """Importamos la funcionalidad de Scikit-learn""" from sklearn.preprocessing import OrdinalEncoder ordinal_encoder = OrdinalEncoder() color_cat_encoded = ordinal_encoder.fit_transform(color_cat) print(color_cat_encoded[:10]) print(ordinal_encoder.categories_) """Importamos lo necesario para realizar el One Hot Encoding""" from sklearn.preprocessing import OneHotEncoder one_hot_encoder = OneHotEncoder() color_cat_one_hot = one_hot_encoder.fit_transform(color_cat) print(color_cat_one_hot) print(color_cat_one_hot.toarray().shape) print(color_cat_one_hot.toarray()) """Ejemplos de normalización de valores de atributos """
import pandas as pd
from tensorflow import keras
import streamlit as st
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

df = pd.read_csv('2020_chennai.csv')
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

enc = OrdinalEncoder()
df[['Wind']] = enc.fit_transform(df[['Wind']])


def changesymbol(x):
    # Strip a trailing unit symbol, e.g. "75F" -> "75"
    symbol = ['F', '%', 'mph', 'in']
    for i in symbol:
        if i in x:
            return x[:x.find(i)]


columns = [
    'Temperature', 'Dew Point', 'Humidity', 'Wind Speed', 'Wind Gust',
    'Pressure'
]
for clmn in columns:
    df[clmn] = df[clmn].apply(changesymbol)

df[['Temperature', 'Dew Point', 'Humidity', 'Wind', 'Wind Speed',
    'Pressure']] = df[[
        'Temperature', 'Dew Point', 'Humidity', 'Wind', 'Wind Speed', 'Pressure'
# read csv
def load_data():
    df = pd.read_csv("LifeExpectancy.csv")
    target = df[["Life Expectancy"]]
    features = df[[
        "Gender", "Residential", "Physical Activity (times per week)",
        "Happiness"
    ]]
    return features, target


features, target = load_data()

# create an encoder object to convert the non-numeric values in the csv into numbers
ordinal_encoder = OrdinalEncoder()
features[["Gender", "Residential"]] = ordinal_encoder.fit_transform(
    features[["Gender", "Residential"]])

# normalize the data
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

# convert the non-numeric target in the csv into numbers
one_hot_encoder = OneHotEncoder(sparse=False)  # so we get a dense array
target = one_hot_encoder.fit_transform(target)

# split into train data and test data
train_data, test_data, train_target, test_target = train_test_split(
    features, target, test_size=0.2)

# model architecture
layer = {
def HP_regress(data, target, outdir, dataset, ABType=True):
    ################################################################################
    # preprocessing
    ################################################################################
    df = pd.read_csv(data, index_col=0)
    X = df.iloc[:, :7]
    y = df[target]
    if ABType == True:
        #ohe = OneHotEncoder()
        #X = ohe.fit_transform(X)
        ore = OrdinalEncoder(categories=[["H1", "H2", "H3", "H4"], ["A", "B"],
                                         [8, 16, 32, 64], [10, 100, 1000],
                                         [1, 2, 4], ["fixed", "max"],
                                         ["L0", "L2", "L1"]])
        X = ore.fit_transform(X)
    else:
        X = X.drop(columns=["ABType"])
        #ohe = OneHotEncoder()
        #X = ohe.fit_transform(X)
        ore = OrdinalEncoder(categories=[["H1", "H2", "H3", "H4"],
                                         [8, 16, 32, 64], [10, 100, 1000],
                                         [1, 2, 4], ["fixed", "max"],
                                         ["L0", "L2", "L1"]])
        X = ore.fit_transform(X)

    ################################################################################
    # RF
    ################################################################################
    gsc = GridSearchCV(estimator=RandomForestRegressor(),
                       param_grid={"max_depth": range(5, 11),
                                   "n_estimators": (500, 1000)},
                       cv=5, scoring="neg_mean_absolute_error", verbose=0, n_jobs=-1)
    gs_result = gsc.fit(X, y)
    best_params = gs_result.best_params_
    print(best_params)
    rfr = RandomForestRegressor(max_depth=best_params["max_depth"],
                                n_estimators=best_params["n_estimators"],
                                random_state=42)
    cv_scores = cross_val_score(rfr, X, y, cv=10,
                                scoring="neg_mean_absolute_error")
    rfr.fit(X, y)

    ################################################################################
    # feature importances
    ################################################################################
    importances = rfr.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rfr.estimators_], axis=0)
    imp_data = [tree.feature_importances_ for tree in rfr.estimators_]
    #ci = stats.sem(imp_data) * stats.t.ppf(1.95/2., len(imp_data)-1)
    indices = np.argsort(importances)[::-1]
    if ABType == True:
        xlab = np.where(indices == 0, "ont. constr.", indices)
        xlab = np.where(xlab == "1", "rel. norm.", xlab)
        xlab = np.where(xlab == "2", "dimension", xlab)
        xlab = np.where(xlab == "3", "learn. rate", xlab)
        xlab = np.where(xlab == "4", "margin", xlab)
        xlab = np.where(xlab == "5", "magnitude", xlab)
        xlab = np.where(xlab == "6", "method", xlab)
    else:
        xlab = np.where(indices == 0, "ont. constr.", indices)
        xlab = np.where(xlab == "1", "dimension", xlab)
        xlab = np.where(xlab == "2", "learn. rate", xlab)
        xlab = np.where(xlab == "3", "margin", xlab)
        xlab = np.where(xlab == "4", "magnitude", xlab)
        xlab = np.where(xlab == "5", "method", xlab)

    fig1, ax1 = plt.subplots(figsize=(3.6, 3.6))
    ax1.set_title("10-fold CV NMAE = %.2f, std = %.2f"
                  % (np.mean(cv_scores), np.std(cv_scores)), size=10)
    ax1.bar(range(X.shape[1]), importances[indices], color="#777777",
            yerr=std[indices], align="center")
    ax1.set_xticks(list(range(X.shape[1])))
    ax1.set_xticklabels(xlab, size=9)
    ax1.tick_params(axis="x", rotation=45)
    ax1.set_xlim([-1, X.shape[1]])
    ax1.set_ylabel("relative feature importance")
    ax1.spines["top"].set_visible(False)
    ax1.spines["bottom"].set_visible(False)
    ax1.spines["left"].set_visible(False)
    ax1.spines["right"].set_visible(False)
    ax1.grid(linestyle=":", color="#777777")
    fig1.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9,
                         hspace=0.2, wspace=0.2)

    ################################################################################
    # partial dependence plot
    ################################################################################
    X_df = pd.DataFrame(X)
    df2 = pd.concat([X_df, y], axis=1, sort=False)
    if ABType == True:
        df2.columns = ["HType", "ABType", "dimension", "learnFac", "margin",
                       "constr", "LType", target]
        mod_feats = ["HType", "ABType", "dimension", "learnFac", "margin",
                     "constr", "LType"]
    else:
        df2.columns = ["HType", "dimension", "learnFac", "margin", "constr",
                       "LType", target]
        mod_feats = ["HType", "dimension", "learnFac", "margin", "constr", "LType"]

    # HType
    pdp_rfr = pdp.pdp_isolate(model=rfr,
                              dataset=df2,
                              model_features=mod_feats,
                              feature="HType")
    fig2, ax2 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
                             feature_name="ontology constraints",
                             center=False, plot_lines=False,
                             plot_pts_dist=False, figsize=(3.6, 3.6))
    ax2["pdp_ax"].set_xticklabels(["H+T", "H+TI", "H+TID", "H+TIDF"])
    ax2["pdp_ax"].set_xticks([0, 1, 2, 3])
    if target != "mrank":
        ax2["pdp_ax"].set_ylim([0, 1])
    ax2["pdp_ax"].set_ylabel("predicted metric score")
    ax2["pdp_ax"].grid(color="#777777")
    fig2.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9,
                         hspace=0.2, wspace=0.2)

    # dimension
    pdp_rfr = pdp.pdp_isolate(model=rfr,
                              dataset=df2,
                              model_features=mod_feats,
                              feature="dimension")
    fig3, ax3 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
                             feature_name="dimension $k$",
                             center=False, plot_lines=False,
                             plot_pts_dist=False, figsize=(3.6, 3.6))
    ax3["pdp_ax"].set_xticklabels(["8", "16", "32", "64"])
    ax3["pdp_ax"].set_xticks([0, 1, 2, 3])
    if target != "mrank":
        ax3["pdp_ax"].set_ylim([0, 1])
    ax3["pdp_ax"].set_ylabel("predicted metric score")
    ax3["pdp_ax"].grid(color="#777777")
    fig3.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9,
                         hspace=0.2, wspace=0.2)

    # learnFac
    pdp_rfr = pdp.pdp_isolate(model=rfr,
                              dataset=df2,
                              model_features=mod_feats,
                              feature="learnFac",
                              num_grid_points=3)
    fig4, ax4 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
                             feature_name="learning rate \u03BB",
                             center=False, plot_lines=False,
                             plot_pts_dist=False, figsize=(3.6, 3.6))
    ax4["pdp_ax"].set_xticklabels(["0.1", "0.01", "0.001"])
    ax4["pdp_ax"].set_xticks([0, 1, 2])
    if target != "mrank":
        ax4["pdp_ax"].set_ylim([0, 1])
    ax4["pdp_ax"].set_ylabel("predicted metric score")
    ax4["pdp_ax"].grid(color="#777777")
    fig4.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9,
                         hspace=0.2, wspace=0.2)

    # margin
    pdp_rfr = pdp.pdp_isolate(model=rfr,
                              dataset=df2,
                              model_features=mod_feats,
                              feature="margin",
                              num_grid_points=3)
    fig5, ax5 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
                             feature_name="margin $\gamma$",
                             center=False, plot_lines=False,
                             plot_pts_dist=False, figsize=(3.6, 3.6))
    ax5["pdp_ax"].set_xticklabels(["1", "2", "4"])
    ax5["pdp_ax"].set_xticks([0, 1, 2])
    if target != "mrank":
        ax5["pdp_ax"].set_ylim([0, 1])
    ax5["pdp_ax"].set_ylabel("predicted metric score")
    ax5["pdp_ax"].grid(color="#777777")
    fig5.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9,
                         hspace=0.2, wspace=0.2)

    # constr
    pdp_rfr = pdp.pdp_isolate(model=rfr,
                              dataset=df2,
                              model_features=mod_feats,
                              feature="constr")
    fig6, ax6 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
                             feature_name="regularisation magnitude",
                             center=False, plot_lines=False,
                             plot_pts_dist=False, figsize=(3.6, 3.6))
    ax6["pdp_ax"].set_xticklabels(["surface", "space"])
    ax6["pdp_ax"].set_xticks([0, 1])
    if target != "mrank":
        ax6["pdp_ax"].set_ylim([0, 1])
    ax6["pdp_ax"].set_ylabel("predicted metric score")
    ax6["pdp_ax"].grid(color="#777777")
    fig6.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9,
                         hspace=0.2, wspace=0.2)

    # LType
    pdp_rfr = pdp.pdp_isolate(model=rfr,
                              dataset=df2,
                              model_features=mod_feats,
                              feature="LType",
                              num_grid_points=3)
    fig7, ax7 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
                             feature_name="training method",
                             center=False, plot_lines=False,
                             plot_pts_dist=False, figsize=(3.6, 3.6))
    ax7["pdp_ax"].set_xticklabels(["linear", "projection", "hybrid"])
    ax7["pdp_ax"].set_xticks([0, 1, 2])
    if target != "mrank":
        ax7["pdp_ax"].set_ylim([0, 1])
    ax7["pdp_ax"].set_ylabel("predicted metric score")
    ax7["pdp_ax"].grid(color="#777777")
    fig7.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9,
                         hspace=0.2, wspace=0.2)

    # ABType (if true)
    if ABType == True:
        pdp_rfr = pdp.pdp_isolate(model=rfr,
                                  dataset=df2,
                                  model_features=mod_feats,
                                  feature="ABType")
        fig8, ax8 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
                                 feature_name="relation normalisation",
                                 center=False, plot_lines=False,
                                 plot_pts_dist=False, figsize=(3.6, 3.6))
        ax8["pdp_ax"].set_xticklabels(["False", "True"])
        ax8["pdp_ax"].set_xticks([0, 1])
        if target != "mrank":
            ax8["pdp_ax"].set_ylim([0, 1])
        ax8["pdp_ax"].set_ylabel("predicted metric score")
        ax8["pdp_ax"].grid(color="#777777")
        fig8.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9,
                             hspace=0.2, wspace=0.2)

    #plt.show()

    ################################################################################
    # save figures
    ################################################################################
    subdir = "/".join([outdir, dataset, target])
    if not os.path.exists(subdir):
        os.makedirs(subdir)
    fig1_path = "/".join([subdir, "_".join([dataset, target, "RF_importance.png"])])
    fig1.savefig(fig1_path)
    fig2_path = "/".join([subdir, "_".join([dataset, target, "PDP_HType.png"])])
    fig2.savefig(fig2_path)
    fig3_path = "/".join([subdir, "_".join([dataset, target, "PDP_dimension.png"])])
    fig3.savefig(fig3_path)
    fig4_path = "/".join([subdir, "_".join([dataset, target, "PDP_learnFac.png"])])
    fig4.savefig(fig4_path)
    fig5_path = "/".join([subdir, "_".join([dataset, target, "PDP_margin.png"])])
    fig5.savefig(fig5_path)
    fig6_path = "/".join([subdir, "_".join([dataset, target, "PDP_constr.png"])])
    fig6.savefig(fig6_path)
    fig7_path = "/".join([subdir, "_".join([dataset, target, "PDP_LType.png"])])
    fig7.savefig(fig7_path)
    if ABType == True:
        fig8_path = "/".join([subdir, "_".join([dataset, target, "PDP_ABType.png"])])
        fig8.savefig(fig8_path)
def redifine_labels(agg_labels, focus_label):
    for i in range(len(agg_labels)):
        if agg_labels[i] != focus_label:
            agg_labels[i] = "OTHER"
    print(agg_labels)
    return agg_labels


focus_label = 'OAG'
agg_labels = redifine_labels(agg_labels, focus_label)

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
# Note: OrdinalEncoder expects a 2D array; a plain 1D list of labels would need
# np.array(agg_labels).reshape(-1, 1) first.
agg_labels_encoded = ordinal_encoder.fit_transform(agg_labels)

#%%
print(agg_labels_encoded[:10])
print(ordinal_encoder.categories_)

#%%
from pprint import pprint
from time import time

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

#%%
# In[15]:

# fill the missing values and remove the non-numeric attribute
X[X == np.inf] = np.nan
X.fillna(X.mean(), inplace=True)

# In[16]:

X.head()

# In[17]:

from sklearn.preprocessing import OrdinalEncoder

ord_encode = OrdinalEncoder()
X_cat = X[["IsHoliday_y"]]
X_cat_encoded = ord_encode.fit_transform(X_cat)
X_cat_encoded

# In[18]:

X_num = X.drop("IsHoliday_y", axis=1)
X_num.head()

# In[20]:

X["IsHoliday"] = X_cat_encoded

# In[21]:

X = X.drop("IsHoliday_y", axis=1)
X.head()