def test_ordinal_encoder(X):
    enc = OrdinalEncoder()
    exp = np.array([[0, 1, 0],
                    [1, 0, 0]], dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp.astype('float64'))
    enc = OrdinalEncoder(dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp)
def test_ordinal_encoder_raise_categories_shape():
    X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T
    cats = ['Low', 'Medium', 'High']
    enc = OrdinalEncoder(categories=cats)
    msg = ("Shape mismatch: if categories is an array,")

    with pytest.raises(ValueError, match=msg):
        enc.fit(X)
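# A minimal sketch (my addition) of the shape the error above is guarding:
# `categories` must hold one list of categories *per feature*, so a single
# ordinal column needs its category list nested inside an outer list.
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T
enc = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])  # note the nesting
print(enc.fit_transform(X).ravel())  # [0. 1. 2. 1. 0.]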
def test_ordinal_encoder_inverse():
    X = [['abc', 2, 55], ['def', 1, 55]]
    enc = OrdinalEncoder()
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape('Shape of the passed X data is not correct')
    assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OrdinalEncoder(categories=cats)
    exp = np.array([[0.], [1.]])
    assert_array_equal(enc.fit_transform(X), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)
def test_ordinal_encoder_raise_missing(X):
    enc = OrdinalEncoder()  # renamed from `ohe`: this is an OrdinalEncoder
    with pytest.raises(ValueError, match="Input contains NaN"):
        enc.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        enc.fit_transform(X)

    enc.fit(X[:1, :])
    with pytest.raises(ValueError, match="Input contains NaN"):
        enc.transform(X)
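# A minimal sketch (my addition), assuming string-typed categoricals: since
# the OrdinalEncoder exercised above rejects NaN, missing values are typically
# imputed first, e.g. with a sentinel category that becomes its own code.
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

X = np.array([['a'], [np.nan], ['b']], dtype=object)
pipe = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encode', OrdinalEncoder()),
])
print(pipe.fit_transform(X).ravel())  # [0. 2. 1.] -- 'missing' is its own category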
CLASSIFIERS.append(("XGBClassifier", xgboost.XGBClassifier))
CLASSIFIERS.append(("LGBMClassifier", lightgbm.LGBMClassifier))
# CLASSIFIERS.append(('CatBoostClassifier', catboost.CatBoostClassifier))

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])
categorical_transformer_low = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoding", OneHotEncoder(handle_unknown="ignore", sparse=False)),
])
categorical_transformer_high = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    # 'OrdinalEncoder' raises a ValueError when it encounters an unknown value.
    # See https://github.com/scikit-learn/scikit-learn/pull/13423
    ("encoding", OrdinalEncoder()),
])


# Helper function
def get_card_split(df, cols, n=11):
    """
    Splits categorical columns into 2 lists based on cardinality
    (i.e. # of unique values)

    Parameters
    ----------
    df : Pandas DataFrame
        DataFrame from which the cardinality of the columns is calculated.
    cols : list-like
        Categorical columns to list
    n : int, optional (default=11)
        Cardinality threshold: columns with fewer than `n` unique values are
        treated as low-cardinality.
# 1. sklearn.preprocessing.OrdinalEncoder - Takes an array-like of strings or
#    integers and creates an encoder to transform the data into an array of
#    integer categories.
#    sklearn.preprocessing.OneHotEncoder - Takes nominal data in an array-like
#    and encodes it into a binary array with one place per feature.

# Extended Exercise
# 1. Unsure, though it looks like if you fit() a dataset that is NOT already
#    ordered correctly, the encoder will still categorise the data, but not
#    necessarily in the order you want.
# 2. Using consumer survey data for the value of a primary residence from the
#    University of Michigan.
%run setup.ipy

import quandl
import my_secrets

quandl.ApiConfig.api_key = my_secrets.QUANDL_API_KEY
housing_prices = quandl.get(
    "UMICH/SOC22-University-of-Michigan-Consumer-Survey-Current-Market-Value-of-Primary-Residence")
housing_prices = housing_prices.iloc[:, 0:6]  # Trim data down to price categories only
housing_prices

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

ord_enc = OrdinalEncoder()
ord_enc.fit(housing_prices)
ord_prices = ord_enc.transform(housing_prices)

hot_enc = OneHotEncoder(categories='auto')
hot_enc.fit(housing_prices)
hot_enc.transform(housing_prices).toarray()
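# A small standalone check (my addition, unrelated to the Quandl data) that
# answers the question above: by default OrdinalEncoder sorts categories
# lexicographically, so a meaningful order must be passed via `categories`.
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

sizes = np.array([['small'], ['large'], ['medium']], dtype=object)

default_enc = OrdinalEncoder().fit(sizes)
print(default_enc.categories_)  # [array(['large', 'medium', 'small'], ...)] -- alphabetical

ordered_enc = OrdinalEncoder(categories=[['small', 'medium', 'large']]).fit(sizes)
print(ordered_enc.transform(sizes).ravel())  # [0. 2. 1.]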
# Transform and pipeline
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
])
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(KNeighborsRegressor(n_neighbors=3))),
    ('scaler', StandardScaler()),
])
preprocess = ColumnTransformer(transformers=[
    ('onehot', onehot_transformer, onehot_columns),
    ('num', numeric_transformer, numeric_columns),
    ('binary', OrdinalEncoder(), binary_columns),
])
pipeline = Pipeline(steps=[('preprocess', preprocess), ('mod', model)])

# Fit the current model
cv = RepeatedKFold(n_splits=5, n_repeats=10)
full_pipeline = GridSearchCV(pipeline, param_grid=params, cv=cv,
                             n_jobs=-1, verbose=1)
print('Fitting', name, 'model **************************************************')
full_pipeline.fit(X, Y)
def __init__(
    self,
    regs_path: Path,
    chag_path: Path,
    regs_min: int = 4,
) -> None:
    # Read dataframe
    regs_df = pd.read_csv(regs_path)

    # Print information
    print(f"Read dataset in {regs_path}")
    print(f"Original regs shape: {regs_df.shape}")

    # Get counting information
    regs_counts = regs_df['registant'].value_counts()
    chag_counts = regs_df['challengeId'].value_counts()
    print(f"Original registants size: {regs_counts.size}")
    print(f"Original challenges size: {chag_counts.size}")

    # Remove sparse items in counts
    regs_counts = regs_counts[regs_counts >= regs_min]

    # Remove sparse items
    regs_df = regs_df[regs_df['registant'].isin(regs_counts.index)]
    print(f"Filter dataframe shape: {regs_df.shape}")

    # Add previous and period columns
    regs_df = regs_df.sort_values(by=['registant', 'date'])
    regs_df['previousId'] = regs_df['challengeId']
    regs_df['period'] = regs_df['date'].str[:7]

    # Shift previous column
    regs_df['previousId'] = regs_df['previousId'].shift(
        periods=1).fillna(0).astype('int64')

    # Set the first item per user to the default id (-1)
    regs_df = regs_df.sort_values(by=['registant', 'date'])
    first_mask = regs_df.duplicated(subset=['registant'], keep='first')
    regs_df['previousId'] = regs_df['previousId'].where(first_mask, -1)

    # Read attr dataframe
    chag_df: pd.DataFrame = regs_df[['challengeId', 'period']]
    chag_df = chag_df.drop_duplicates(subset=['challengeId'])
    attr_df = pd.read_csv(chag_path, converters={
        'technologies': literal_eval,
        'platforms': literal_eval
    })
    chag_df = pd.merge(left=chag_df, right=attr_df,
                       how='inner', on=['challengeId'])

    # Add default row
    print(chag_df.columns)
    chag_df.loc[-1] = (-1, '2005-01', '2005-01-01', 0, [], [])
    chag_df = chag_df.sort_values(by=['date'])

    # Add encoders
    chag_encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    regs_encoder = OneHotEncoder(categories='auto', handle_unknown='error')
    period_encoder = OrdinalEncoder(categories='auto')
    tech_binarizer = MultiLabelBinarizer(sparse_output=True)
    plat_binarizer = MultiLabelBinarizer(sparse_output=True)
    chag_encoder.fit(regs_df[['challengeId']])
    regs_encoder.fit(regs_df[['registant']])
    period_encoder.fit(chag_df[['period']])
    tech_binarizer.fit(chag_df['technologies'].tolist())
    plat_binarizer.fit(chag_df['platforms'].tolist())

    # Split dataset into train, valid, test
    regs_df = regs_df.sort_values(by=['date'])
    last_mask = regs_df.duplicated(subset=['registant'], keep='last')
    remain_df = regs_df[last_mask]
    test_df = regs_df[~last_mask]
    last_mask = remain_df.duplicated(subset=['registant'], keep='last')
    train_df = remain_df[last_mask]
    valid_df = remain_df[~last_mask]

    # Add default config
    self.config_db()
    self._chag_df = chag_df
    self._df_dict: Dict[str, pd.DataFrame] = {
        'train': train_df,
        'valid': valid_df,
        'test': test_df
    }
    self._regs_encoder = regs_encoder
    self._chag_encoder = chag_encoder
    self._period_encoder = period_encoder
    self._tech_binarizer = tech_binarizer
    self._plat_binarizer = plat_binarizer

    regs_size = regs_encoder.categories_[0].size
    chag_size = chag_encoder.categories_[0].size
    seq_size = tech_binarizer.classes_.size + plat_binarizer.classes_.size
    self.feat_dim = regs_size + 2 * chag_size + 2 * seq_size
    self.user_size = regs_size
test_original = mlib.csv_to_df(path)
test_df = test_original.copy()

# Create list of features desired for training
feature_list = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']
target_list = ['Survived']

# Define Numeric Pipeline
num_pipe = Pipeline([('imputer_mean', SimpleImputer(strategy='mean')),
                     ('std_scalar', StandardScaler())])

# Define Categorical Pipeline
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # ('ohe', OneHotEncoder()),
    ('oe', OrdinalEncoder())
])

# Combining Pipes into full pipeline - Train Data
full_pipeline, train_features, target_features, post_trans_train_feature = mlib.Full_PipeLine(
    train_df, feature_list, target_list, num_pipe, cat_pipe)

# Combining Pipes into full pipeline - Test Data
full_pipeline_test, test_features, empty, post_transform_test_features = mlib.Full_PipeLine(
    test_df, feature_list, [], num_pipe, cat_pipe)

# Transform data using final combined pipeline - Train
train_features_prep = full_pipeline.fit_transform(train_features)

# Transform data using final combined pipeline - Test
# (transform only, so the test set reuses the statistics and category
# mappings fitted on the training data instead of being re-fitted)
test_features_prep = full_pipeline.transform(test_features)
def transform_columns(X_train,
                      X_test,
                      column_dict,
                      cat_trans="onehot_encoding",
                      num_trans="standard_scaling"):
    """
    Transforms categorical and numerical features based on user input.

    Arguments
    ---------
    X_train: pandas.core.frame.DataFrame
        A pandas dataframe for the training set
    X_test: pandas.core.frame.DataFrame
        A pandas dataframe for the test set
    column_dict: dictionary
        A dictionary with keys = 'numeric', 'categorical' and
        values = a list of columns that fall into each respective category.
    cat_trans: str
        Transformation method for categorical features
        (default - 'onehot_encoding')
    num_trans: str
        Transformation method for numerical features
        (default - 'standard_scaling')

    Returns
    -------
    dict
        A python dictionary with the transformed training and test sets,
        with keys 'X_train' and 'X_test' respectively.

    Examples
    --------
    df_train = pd.DataFrame({'a': [1, 2, 3],
                             'b': [1.2, 3.4, 3.0],
                             'c': ['A', 'B', 'C']})
    df_test = pd.DataFrame({'a': [6, 2],
                            'b': [0.5, 9.2],
                            'c': ['B', 'B']})
    transform_columns(df_train, df_test,
                      {'numeric': ['a', 'b'], 'categorical': ['c']})
    """
    # checking user inputs
    # assertions for test and train set inputs
    assert isinstance(X_train, pd.DataFrame), "X_train should be a DataFrame"
    assert isinstance(X_test, pd.DataFrame), "X_test should be a DataFrame"
    assert not isinstance(X_train.columns, pd.RangeIndex), \
        "column names must be strings"
    assert not isinstance(X_test.columns, pd.RangeIndex), \
        "column names must be strings"

    # assertions for dictionary input
    assert isinstance(column_dict, dict), \
        "column_dict should be a python dictionary"
    assert len(column_dict) == 2, \
        "column_dict should have 2 keys - 'numeric' and 'categorical'"
    for key in column_dict.keys():
        assert key in ['numeric', 'categorical'], \
            "column_dict keys can be only 'numeric' and 'categorical'"

    # assertions for transformation inputs
    assert isinstance(num_trans, str), "num_trans should be a string"
    assert isinstance(cat_trans, str), "cat_trans should be a string"
    assert num_trans == "standard_scaling" or num_trans == "minmax_scaling", \
        "transformation method for numeric columns can only" \
        " be 'minmax_scaling' or 'standard_scaling'"
    assert cat_trans == "onehot_encoding" or cat_trans == "label_encoding", \
        "transformation method for categorical columns can only be" \
        " 'label_encoding' or 'onehot_encoding'"

    # Check train set and test set columns are the same
    assert np.array_equal(X_train.columns, X_test.columns), \
        "X_train and X_test must have the same columns"
    for key, values in column_dict.items():
        for column in values:
            assert column in X_train.columns, \
                "columns in dictionary must be in dataframe"

    numeric = column_dict['numeric']
    categorical = column_dict['categorical']

    if cat_trans == 'onehot_encoding':
        if num_trans == "standard_scaling":
            preprocessor = ColumnTransformer(transformers=[
                ("stand_scaler", StandardScaler(), numeric),
                ("ohe", OneHotEncoder(drop='first'), categorical)
            ], sparse_threshold=0)
        if num_trans == "minmax_scaling":
            preprocessor = ColumnTransformer(transformers=[
                ("minmax_scaler", MinMaxScaler(), numeric),
                ("ohe", OneHotEncoder(drop='first'), categorical)
            ], sparse_threshold=0)

        # Applying transformations to the training data set
        X_train = pd.DataFrame(
            preprocessor.fit_transform(X_train),
            index=X_train.index,
            columns=numeric + list(
                preprocessor.named_transformers_['ohe'].get_feature_names(
                    categorical)))

        # Applying transformations to the test set
        X_test = pd.DataFrame(preprocessor.transform(X_test),
                              index=X_test.index,
                              columns=X_train.columns)

    if cat_trans == "label_encoding":
        if num_trans == "standard_scaling":
            preprocessor = ColumnTransformer(transformers=[
                ("stand_scaler", StandardScaler(), numeric),
                ("ordinal", OrdinalEncoder(), categorical)
            ], sparse_threshold=0)
        if num_trans == "minmax_scaling":
            preprocessor = ColumnTransformer(transformers=[
                ("minmax_scaler", MinMaxScaler(), numeric),
                ("ordinal", OrdinalEncoder(), categorical)
            ], sparse_threshold=0)

        # Applying transformations to the training data set
        X_train = pd.DataFrame(preprocessor.fit_transform(X_train),
                               index=X_train.index,
                               columns=numeric + categorical)

        # Applying transformations to the test set
        X_test = pd.DataFrame(preprocessor.transform(X_test),
                              index=X_test.index,
                              columns=X_train.columns)

    transformed_dict = {'X_train': X_train, 'X_test': X_test}
    return transformed_dict
# - use a `RandomizedSearchCV` to find the best set of hyper-parameters by
#   tuning the following parameters: `learning_rate`, `l2_regularization`,
#   `max_leaf_nodes`, and `min_samples_leaf`.

# %%
ordinal_encoding_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'race', 'native-country', 'sex'
]

categories = [
    data[column].unique() for column in data[ordinal_encoding_columns]
]

preprocessor = ColumnTransformer(
    [('ordinal-encoder', OrdinalEncoder(categories=categories),
      ordinal_encoding_columns)],
    remainder='passthrough', sparse_threshold=0)

model = Pipeline([('preprocessor', preprocessor),
                  ('gbrt', HistGradientBoostingClassifier(max_iter=50))])

param_distributions = {
    'gbrt__learning_rate': expon(loc=0.001, scale=0.5),
    'gbrt__l2_regularization': uniform(loc=0, scale=0.5),
    'gbrt__max_leaf_nodes': randint(5, 30),
    'gbrt__min_samples_leaf': randint(5, 30)
}

model_grid_search = RandomizedSearchCV(model,
                                       param_distributions=param_distributions,
                                       n_iter=10,
def __init__(self):
    # NOTE: np.int was deprecated and then removed in NumPy 1.24;
    # plain int is used for the encoder dtype instead.
    self.age = Pipeline([
        ("scale", MinMaxScaler()),
        ("input", SimpleImputer(strategy="constant", fill_value=-1,
                                add_indicator=True))])
    self.fnlwgt = Pipeline([
        ("scale", MinMaxScaler()),
        ("input", SimpleImputer(strategy="constant", fill_value=-1,
                                add_indicator=True))])
    self.education_num = Pipeline([
        ("scale", MinMaxScaler()),
        ("input", SimpleImputer(strategy="constant", fill_value=-1,
                                add_indicator=True))])
    self.capital_net = Pipeline([
        ("scale", MinMaxScaler()),
        ("input", SimpleImputer(strategy="constant", fill_value=-1,
                                add_indicator=True))])
    self.hours_per_week = Pipeline([
        ("scale", MinMaxScaler()),
        ("input", SimpleImputer(strategy="constant", fill_value=-1,
                                add_indicator=True))])
    self.workclass = Pipeline([
        ("encode", OrdinalEncoder(dtype=int)),
        ("input", SimpleImputer(strategy="constant", fill_value=-1,
                                add_indicator=True)),
        ("scale", MinMaxScaler())])
    self.marital_status = Pipeline([
        ("encode", OrdinalEncoder(dtype=int)),
        ("input", SimpleImputer(strategy="constant", fill_value=-1,
                                add_indicator=True)),
        ("scale", MinMaxScaler())])
    self.occupation = Pipeline([
        ("encode", OrdinalEncoder(dtype=int)),
        ("input", SimpleImputer(strategy="constant", fill_value=-1,
                                add_indicator=True)),
        ("scale", MinMaxScaler())])
    self.relationship = Pipeline([
        ("encode", OrdinalEncoder(dtype=int)),
        ("input", SimpleImputer(strategy="constant", fill_value=-1,
                                add_indicator=True)),
        ("scale", MinMaxScaler())])
    self.race = Pipeline([
        ("encode", OrdinalEncoder(dtype=int)),
        ("input", SimpleImputer(strategy="constant", fill_value=-1,
                                add_indicator=True)),
        ("scale", MinMaxScaler())])
    self.sex = Pipeline([
        ("encode", OrdinalEncoder(dtype=int)),
        ("input", SimpleImputer(strategy="constant", fill_value=-1,
                                add_indicator=True)),
        ("scale", MinMaxScaler())])
    self.native_country = Pipeline([
        ("encode", OrdinalEncoder(dtype=int)),
        ("input", SimpleImputer(strategy="constant", fill_value=-1,
                                add_indicator=True)),
        ("scale", MinMaxScaler())])
    for i in range(len(agg_labels)):
        if agg_labels[i] != focus_label:
            agg_labels[i] = "OTHER"
    print(agg_labels)
    return agg_labels


"""For *AG: only NAG, CAG and OAG focus labels are valid.
For *GEN: only NGEN and GEN focus labels are valid, but redefining the
labels might not be necessary."""
focus_label = 'CAG'
agg_labels = redifine_labels(agg_labels, focus_label)

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
# OrdinalEncoder expects a 2D array, so the flat label list is reshaped
agg_labels_encoded = ordinal_encoder.fit_transform(
    np.array(agg_labels).reshape(-1, 1))

# %%
print(agg_labels_encoded[:10])
print(ordinal_encoder.categories_)

# %%
from pprint import pprint
from time import time

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import NuSVC
# models will be covered in more detail in a future module.
#
# For tree-based models, the handling of numerical and categorical variables is
# simpler than for linear models:
#
# * we do **not need to scale the numerical features**
# * using an **ordinal encoding for the categorical variables** is fine even if
#   the encoding results in an arbitrary ordering
#
# Therefore, for `HistGradientBoostingClassifier`, the preprocessing pipeline
# is slightly simpler than the one we saw earlier for the `LogisticRegression`:

# %%
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)

preprocessor = ColumnTransformer(
    [('categorical', categorical_preprocessor, categorical_columns)],
    remainder="passthrough")

model = make_pipeline(preprocessor, HistGradientBoostingClassifier())

# %% [markdown]
# Now that we created our model, we can check its generalization performance.

# %%
# %%time
_ = model.fit(data_train, target_train)

# %%
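# %% [markdown]
# A quick standalone illustration (added here, not part of the original
# notebook) of what `handle_unknown="use_encoded_value"` buys us: categories
# unseen at fit time are mapped to `unknown_value` instead of raising at
# transform time. This option requires scikit-learn >= 0.24.

# %%
import numpy as np

demo_enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
demo_enc.fit(np.array([['cat'], ['dog']], dtype=object))
demo_enc.transform(np.array([['dog'], ['fish']], dtype=object))
# array([[ 1.], [-1.]]) -- 'fish' was never seen during fit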
def test(housing):
    # Inspect the basic characteristics of the data (visualisation can help)
    housing.info()
    housing.describe()
    # The data shows a long-tailed distribution
    # housing.hist(bins=50, figsize=(20, 15))
    train_set, test_set = train_test_split(housing,
                                           test_size=0.2,
                                           random_state=42)
    # Convert income into category labels
    housing["income_cat"] = pd.cut(housing["median_income"],
                                   bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                                   labels=[1, 2, 3, 4, 5])
    # housing["income_cat"].hist()

    # Split the data
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
    for set_ in (strat_train_set, strat_test_set):
        set_.drop("income_cat", axis=1, inplace=True)
    housing = strat_train_set.copy()
    '''
    # Note: when plotting, the relevant columns are selected via x and y
    # housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
    # marker size s = population, colour c = house price
    housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
                 s=housing["population"] / 100, label="population",
                 figsize=(10, 7), c="median_house_value",
                 cmap=plt.get_cmap("jet"), colorbar=True)
    plt.legend()
    '''
    # Correlations
    corr_matrix = housing.corr()
    corr_matrix["median_house_value"].sort_values(ascending=False)
    attributes = [
        "median_house_value", "median_income", "total_rooms",
        "housing_median_age"
    ]
    # scatter_matrix(housing[attributes], figsize=(12, 8))
    # housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()

    # Data cleaning
    # Three options for handling NA values
    housing.dropna(subset=["total_bedrooms"])    # option 1
    housing.drop("total_bedrooms", axis=1)       # option 2
    median = housing["total_bedrooms"].median()  # option 3
    housing["total_bedrooms"].fillna(median, inplace=True)

    # sklearn's imputer for filling missing values
    imputer = SimpleImputer(strategy="median")
    housing_num = housing.drop("ocean_proximity", axis=1)
    imputer.fit(housing_num)
    imputer.statistics_
    X = imputer.transform(housing_num)
    housing_tr = pd.DataFrame(X,
                              columns=housing_num.columns,
                              index=housing_num.index)

    # Encoding: OrdinalEncoder is essentially the matrix version of
    # LabelEncoder and can encode several features at once
    housing_cat = housing[["ocean_proximity"]]
    ordinal_encoder = OrdinalEncoder()
    housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
    cat_encoder = OneHotEncoder()
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)

    # ColumnTransformer applies a transformer to each group of pandas columns
    # Note: list(housing_num) returns the column names of housing_num
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    # The dataframe column names (attribute names) are passed in so that each
    # transformer is applied to the right columns
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    # The one-hot + continuous output returned here is a dense matrix; each
    # category becomes a 0/1 column. The parts could also be transformed
    # separately and assembled with sparse.hstack, as was done for Titanic
    housing_prepared = full_pipeline.fit_transform(housing)

    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)
    print("Predictions:", lin_reg.predict(some_data_prepared))
    print("Labels:", list(some_labels))

    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print(lin_rmse)
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(housing_prepared, housing_labels)
    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    # The error here is 0 because it is measured on the training set:
    # the decision tree is badly overfitting
    print(tree_rmse)

    # With K-fold cross-validation the decision tree performs worse than
    # before; cv is the number of folds
    scores = cross_val_score(tree_reg,
                             housing_prepared,
                             housing_labels,
                             scoring="neg_mean_squared_error",
                             cv=10)
    tree_rmse_scores = np.sqrt(-scores)
    print(tree_rmse_scores)

    # cross_val_score reports the negative of the usual loss
    lin_scores = cross_val_score(lin_reg,
                                 housing_prepared,
                                 housing_labels,
                                 scoring="neg_mean_squared_error",
                                 cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)
    display_scores(lin_rmse_scores)

    forest_reg = RandomForestRegressor()
    forest_reg.fit(housing_prepared, housing_labels)
    forest_scores = cross_val_score(forest_reg,
                                    housing_prepared,
                                    housing_labels,
                                    scoring="neg_mean_squared_error",
                                    cv=10)
    forest_rmse_scores = np.sqrt(-forest_scores)
    display_scores(forest_rmse_scores)

    # Automatic hyperparameter search
    param_grid = [
        {
            'n_estimators': [3, 10, 30],
            'max_features': [2, 4, 6, 8]
        },
        {
            'bootstrap': [False],
            'n_estimators': [3, 10],
            'max_features': [2, 3, 4]
        },
    ]
    forest_reg = RandomForestRegressor()
    grid_search = GridSearchCV(forest_reg,
                               param_grid,
                               cv=5,
                               scoring='neg_mean_squared_error',
                               return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    final_model = grid_search.best_estimator_
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()
    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    print(final_rmse)

    # 95% confidence interval
    # The usual t statistic is t = (estimate - hypothesised value) / standard
    # error, and it follows a t distribution with (n - 2) degrees of freedom
    confidence = 0.95
    squared_errors = (final_predictions - y_test)**2
    np.sqrt(
        stats.t.interval(confidence,
                         len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))
# df = df[df['APIKEY'].isin(api_list)]
df_train['LABEL'] = df_train['LABEL'].apply(lambda x: 0 if x == 'NOT ANAMOLY' else 1)
df_test['LABEL'] = df_test['LABEL'].apply(lambda x: 0 if x == 'NOT ANAMOLY' else 1)

cat = ['APIKEY', 'DAY', 'TIMEBIN']
cont = list(set(df_train.columns) - set(cat) - set(['ANAMOLYDISTNUM']))

## Transformations of df_train
df_cat_train = df_train[cat]
df_cont_train = df_train[cont]

enc = OrdinalEncoder()
cat_transformed = enc.fit_transform(df_cat_train)
df_cat_train = pd.DataFrame(cat_transformed.astype(int),
                            columns=df_cat_train.columns)

df = pd.DataFrame()
for col in df_cat_train.columns:
    df[col] = df_cat_train[col]
for col in df_cont_train.columns:
    df[col] = df_cont_train[col]
df = df.dropna()
df_train = df[list(set(df.columns) - set(['LABEL']))]

## Transformations of df_test
KNN = KNeighborsClassifier(n_neighbors=4).fit(train[["price", "latitude"]],
                                              train["room_type"])

# In[37]:

# predict new data
newdata = KNN.predict([[23, 1.45], [18, 1.31]])
print(newdata)
print()

csf = KNN.predict(test[["price", "latitude"]])
accuracy = accuracy_score(test["room_type"], csf)
array = data[['room_type']]
array = OrdinalEncoder().fit_transform(array)
print("ACC : %.2f" % accuracy)

# In[25]:

n = 30
accuracy = np.zeros((n - 1))
for i in range(1, n):
    KNN = KNeighborsClassifier(n_neighbors=i).fit(train[["price", "latitude"]],
                                                  train["room_type"])
    classification = KNN.predict(test[["price", "latitude"]])
    accuracy[i - 1] = accuracy_score(test["room_type"], classification)
print("Best ACC : %.2f" % accuracy.max(), ", with k = ", accuracy.argmax() + 1)
from typing import List, Tuple

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder


def encode_categoricals(
        data: pd.DataFrame,
        group_cols: List[str]) -> Tuple[pd.DataFrame, OrdinalEncoder]:
    enc = OrdinalEncoder()
    data[group_cols] = enc.fit_transform(data[group_cols].values)
    return data, enc
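# A hypothetical usage sketch (the DataFrame and column names below are made
# up for illustration): the returned encoder keeps the fitted category
# mappings, so it can transform new data later or invert the encoding.
df = pd.DataFrame({'city': ['Oslo', 'Bergen', 'Oslo'], 'sales': [3, 1, 2]})
df, enc = encode_categoricals(df, ['city'])
print(df['city'].tolist())             # [1.0, 0.0, 1.0] -- Bergen=0, Oslo=1
print(enc.categories_[0])              # ['Bergen' 'Oslo']
print(enc.inverse_transform([[0.0]]))  # [['Bergen']]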
# The above graph shows the age of the passengers, their class and whether
# they survived. From the graph, it is noticed that there is a lower
# proportion of third-class survivors than first and second class.

# ## Step 4: Pre-Processing the Data
#
# The data with words, such as the gender/sex, is converted to numbers.
# The data is then standardized and the features are scaled.
# Logistic regression is then used to determine which features are the most
# useful in predicting survival.

# In[12]:

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
Gender_encoded = ordinal_encoder.fit_transform(X_train[["Sex"]])
Gender_encoded[:5]

X_train["Sex"] = Gender_encoded
X_train

# In[13]:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X_train[["Sex", "Age", "Siblings/Spouses Aboard",
                                  "Parents/Children Aboard", "Fare", "Pclass"]])
data = df.drop(columns=target_name)
df_train, df_test, target_train, target_test = train_test_split(
    data, target, random_state=42)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'race', 'native-country', 'sex'
]
categories = [data[column].unique() for column in data[categorical_columns]]

categorical_preprocessor = OrdinalEncoder(categories=categories)

preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor, categorical_columns)],
    remainder='passthrough', sparse_threshold=0)

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline

model = make_pipeline(preprocessor,
                      HistGradientBoostingClassifier(random_state=42))

# %% [markdown]
# TODO: write your solution here
verbs = textfeatures(verbs)
named_ents = textfeatures(named_ents)
adjs = textfeatures(adjs)

# TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
nouns = pd.DataFrame(tfidf.fit_transform(nouns).toarray())
# verbs = pd.DataFrame(tfidf.fit_transform(verbs).toarray())
# named_ents = pd.DataFrame(tfidf.fit_transform(named_ents).toarray())
# adjs = pd.DataFrame(tfidf.fit_transform(adjs).toarray())
# noun_phrases = pd.DataFrame(tfidf.fit_transform(noun_phrases).toarray())

## ordinal encoder
from sklearn.preprocessing import OrdinalEncoder

labels = pd.DataFrame(OrdinalEncoder().fit_transform(
    labels.to_numpy().reshape(-1, 1)))
print("encoded")

## joining
X = pd.concat([
    count_nouns, count_named_ents, count_verbs, count_adjs,
    count_noun_phrases, unique_words, stopwords, articles, punc,
    mean_word_len, non_voc,
            drops.append(col)
            # print(col, '=> min:', min(dataset[col].values), '-- max:', max(dataset[col].values), end='')
            # print(' -- values count', len(dataset[col].value_counts()))
        elif len(dataset[col].value_counts()) == 2:
            bins.append(col)
        else:
            cats.append(col)
    else:
        dataset.drop(drops, axis=1, inplace=True)

    cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value=-99))  # This is for training
    ohe_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))  # This is for testing
    num_si_step = ('si', SimpleImputer(strategy='constant'))
    sc_step = ('sc', StandardScaler())
    bin_oe_step = ('le', OrdinalEncoder())  # defined but not used in bin_pipe below
    bin_si_step = ('si', SimpleImputer(strategy='most_frequent'))

    cat_pipe = Pipeline([cat_si_step, ohe_step])
    num_pipe = Pipeline([num_si_step, sc_step])
    bin_pipe = Pipeline([bin_si_step])

    transformers = [
        ('cat', cat_pipe, []),
        ('num', num_pipe, cats),
        ('bin', bin_pipe, bins),
    ]
    ct = ColumnTransformer(transformers=transformers)
    # X_transformed = ct.fit_transform(dataset)
    # print(X_transformed)
    num for num in X_test.columns if X_test[num].dtype in ['int64', 'float64']
]
cat_col_train = [cat for cat in X.columns if X[cat].dtype == 'object']
cat_col_test = [cat for cat in X_test.columns if X_test[cat].dtype == 'object']

X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      test_size=0.2,
                                                      train_size=0.8,
                                                      random_state=1)
cat_col = ['Sex']
X_test2 = X_test.copy()
X_train2 = X_train.copy()
X_valid2 = X_valid.copy()

encoder = OrdinalEncoder()
# Fit the encoder on the training data only, then reuse the same fitted
# mapping for the validation and test sets (the original re-fitted on X_test,
# which can silently assign different codes to the same category).
X_train2[cat_col] = encoder.fit_transform(X_train[cat_col])
X_valid2[cat_col] = encoder.transform(X_valid[cat_col])
X_test2[cat_col] = encoder.transform(X_test[cat_col])

X_test1 = X_test2.copy()
X_train1 = X_train2.copy()
X_valid1 = X_valid2.copy()

num = ['Fare']
simple = SimpleImputer()
# Likewise, fit the imputer on the training data and only transform the
# validation and test sets; passing index/columns keeps row alignment.
X_train1[num] = pd.DataFrame(simple.fit_transform(X_train2[num]),
                             index=X_train2.index, columns=num)
X_valid1[num] = pd.DataFrame(simple.transform(X_valid2[num]),
                             index=X_valid2.index, columns=num)
X_test1[num] = pd.DataFrame(simple.transform(X_test2[num]),
                            index=X_test2.index, columns=num)
from typing import Iterable, Tuple

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


def _encode_categorical_values(
        features: pd.DataFrame,
        category_names: Iterable[str],
        is_ohe: bool = False) -> Tuple[pd.DataFrame, Iterable[str]]:
    encoder = (OneHotEncoder(dtype='uint8', sparse=False)
               if is_ohe else OrdinalEncoder(dtype='uint8'))
    mask_category = features.columns.isin(category_names)
# Input variable encoding
# float variables: NEP_score, GEK_score
# categorical variables: Sex, Age, Marital_status, Place_of_residence,
#   Education_level, Current_occupation, Monthly_net_income, Household_size,
#   Financial_situation, Satisfacion_with_life, Diet, Social_media,
#   Religious_practices, Next_election

# Numeric features - standardization
numeric_features = ['NEP_score', 'GEK_score']

# Change integer to float
for f in numeric_features:
    data_df[f] = data_df[f].astype(np.float64)

scaler = StandardScaler()
ordinal = OrdinalEncoder()
onehot = OneHotEncoder()

# TODO (translated from Polish): rework this part so that a Pipeline holding
# the model is not needed (i.e. the Pipeline should at most transform the
# variables), or so that the transformers are swapped inside loops.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Categorical features - transformation into binary
categorical_features = ['Sex', 'Marital_status', 'Current_occupation',
                        'Diet', 'Religious_practices', 'Next_election']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
def eval_wrapper(setting_name, proj_root, train_files, test_files=[],
                 label_column='bin_label', text_vars=[], categ_vars=[],
                 num_vars=[], bin_vars=[], nn_settings=None):
    print(f"/// New config run: '{setting_name}' ///")

    # > Load data
    # |- Train Data
    times = {}
    times['start'] = time.time()
    print("> Loading TRAINING data...")
    train_loader = BalancedData(proj_root=proj_root, file_paths=train_files)
    train_loader.load_data()
    train_df = train_loader.get_all_data_df()
    train_df['bin_label'] = train_df.apply(lambda x: NNA.binarize_label(x),
                                           axis=1)
    times['train-data-loaded'] = time.time()

    # |- Test Data
    if len(test_files) > 0:
        print("> Loading TEST data...")
        test_loader = BalancedData(proj_root=proj_root, file_paths=test_files)
        test_loader.load_data()
        test_df = test_loader.get_all_data_df()
        test_df['bin_label'] = test_df.apply(lambda x: NNA.binarize_label(x),
                                             axis=1)
    else:
        print("> Splitting df into test and train set.")
        train_df, test_df = train_test_split(train_df,
                                             test_size=0.3,
                                             random_state=42)
        train_df = train_df.copy()  # Avoid SettingWithCopy warning from pandas
        test_df = test_df.copy()
    times['test-data-loaded'] = time.time()

    N_CLASSES = len(train_df[label_column].unique())
    print(f"  Found {N_CLASSES} classes to predict.")

    # > Preprocess data
    # |- Encode labels
    print("> Encoding labels...")
    oe = OrdinalEncoder()
    # oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    np_labels = train_df[label_column].to_numpy().reshape(-1, 1)
    oe.fit(np_labels)
    train_df[label_column] = oe.transform(np_labels)
    test_df[label_column] = oe.transform(
        test_df[label_column].to_numpy().reshape(-1, 1))
    test_df.head()

    # |- Create TF datasets
    print("> Creating tf.DataSets...")
    train_ds = NNA.df_to_dataset(train_df, label_column, batch_size=128)
    test_ds = NNA.df_to_dataset(test_df, label_column, batch_size=128)
    times['data-ready'] = time.time()

    # > Setup model
    print("> Setting up model...")
    # |- Combine features
    encoded_features, all_inputs = NNA.combine_input_pipelines(
        categ_vars, text_vars, num_vars, bin_vars, train_ds, nn_settings)
    # |- Create model
    model = NNA.create_model(encoded_features, all_inputs, N_CLASSES,
                             n_units=nn_settings['classifier']['n_units'],
                             n_layers=nn_settings['classifier']['n_layers'],
                             dropout=nn_settings['classifier']['dropout'])
    times['model-ready'] = time.time()

    # > Train model
    print("> Training model...")
    NNA.fit_model(model, train_ds, test_ds,
                  epochs=nn_settings['fitting']['n_epochs'],
                  early_stopping=True)
    times['model-trained'] = time.time()

    # > Save model
    print("> Saving model...")
    model.save(path.join(proj_root, "models", setting_name))
    times['model-saved'] = time.time()

    # > Eval model on test data
    print("> Evaluating model on test data...")
    eval_dict = NNA.create_eval_dict(test_df, label_column, oe, model)
    times['model-tested'] = time.time()
    eval_dict['times'] = times

    # > Save eval metrics
    eval_path = path.join(proj_root, "models", f"{setting_name}_eval.json")
    print(f"> Saving eval metrics under {eval_path}...")
    with open(eval_path, "w") as f:
        json.dump(eval_dict, f, indent=4, default=convert)

    print(f"## Work done for current setting '{setting_name}' ##")
    return
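# Side note (my addition): OrdinalEncoder is designed for 2D feature arrays,
# which is why the label column above needs the reshape(-1, 1) dance. For a
# 1D target, sklearn's LabelEncoder is the more idiomatic equivalent:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = ['spam', 'ham', 'spam']
print(le.fit_transform(y))           # [1 0 1]
print(le.inverse_transform([0, 1]))  # ['ham' 'spam']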
def prepare_inputs(X_train, X_test):
    oe = OrdinalEncoder()
    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    X_test_enc = oe.transform(X_test)
    return X_train_enc, X_test_enc
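# A caveat worth flagging (my addition): because the encoder is fitted on
# X_train only, transform(X_test) raises a ValueError whenever the test set
# contains a category never seen in training. A tolerant variant -- a sketch,
# assuming scikit-learn >= 0.24 -- maps unknowns to a sentinel instead:
def prepare_inputs_tolerant(X_train, X_test):
    oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    oe.fit(X_train)
    return oe.transform(X_train), oe.transform(X_test)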
def encode_catogory_features(X_train, X_valid, columns):
    oe = OrdinalEncoder()
    oe.fit(X_train[columns])
    X_train.loc[:, columns] = oe.transform(X_train[columns])
    X_valid.loc[:, columns] = oe.transform(X_valid[columns])
    return X_train, X_valid
# %% [markdown]
# ## Strategies to encode categories
#
# ### Encoding ordinal categories
#
# The most intuitive strategy is to encode each category with a different
# number. The `OrdinalEncoder` will transform the data in such a manner.
# We will start by encoding a single column to understand how the encoding
# works.

# %%
from sklearn.preprocessing import OrdinalEncoder

education_column = data_categorical[["education"]]

encoder = OrdinalEncoder()
education_encoded = encoder.fit_transform(education_column)
education_encoded

# %% [markdown]
# We see that each category in `"education"` has been replaced by a numeric
# value. We could check the mapping between the categories and the numerical
# values by checking the fitted attribute `categories_`.

# %%
encoder.categories_

# %% [markdown]
# Now, we can check the encoding applied on all categorical features.

# %%
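# %% [markdown]
# As an aside (added here, not in the original notebook): the same fitted
# attribute powers `inverse_transform`, so the integer codes can always be
# mapped back to the original strings -- a category's code is simply its
# index in `categories_[0]`. Assuming the fitted `"education"` column has at
# least three distinct categories:

# %%
import numpy as np

encoder.inverse_transform(np.array([[0.0], [2.0]]))
# returns the 1st and 3rd entries of encoder.categories_[0]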
            reference_variable=config.model_config.drop_features,
        ),
    ),
    (
        "rare_label_encoder",
        RareLabelCategoricalEncoder(
            tol=config.model_config.rare_label_tol,
            n_categories=config.model_config.rare_label_n_categories,
            variables=config.model_config.categorical_vars,
        ),
    ),
    (
        "categorical_encoder",
        pp.SklearnTransformerWrapper(
            variables=config.model_config.categorical_vars,
            transformer=OrdinalEncoder(),
        ),
    ),
    (
        "drop_features",
        pp.DropUnecessaryFeatures(
            variables_to_drop=config.model_config.drop_features,
        ),
    ),
    (
        "gb_model",
        GradientBoostingRegressor(
            loss=config.model_config.loss,
            random_state=config.model_config.random_state,
            n_estimators=config.model_config.n_estimators,
        ),
plot_scaling_result(data, stand_scaled, 'Standard Scaling', (-7, 7))

# %%
from sklearn.preprocessing import RobustScaler

robust_scaled = RobustScaler().fit_transform(data)
plot_scaling_result(data, robust_scaled, 'Robust Scaling', (-7, 7))

# %%
property_type = np.array(['House', 'Unit', 'Townhouse',
                          'House', 'Unit']).reshape(-1, 1)

# %%
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder().fit(property_type)
labels = enc.transform(property_type)
labels.flatten()

# %%
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(sparse=False).fit(property_type)
one_hots = enc.transform(property_type)
one_hots

# %%
n_rooms = np.array([1, 2, 1, 4, 6, 7, 12, 20])
pd.cut(n_rooms, bins=[0, 3, 8, 100], labels=["small", "medium", "large"])

# %%
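# %%
# A follow-up sketch (my addition): the binned labels from `pd.cut` come back
# as an ordered pandas Categorical, so their integer codes already respect the
# small < medium < large ordering -- no separate OrdinalEncoder fit is needed.
binned = pd.cut(n_rooms, bins=[0, 3, 8, 100],
                labels=["small", "medium", "large"])
binned.codes  # array([0, 0, 0, 1, 1, 1, 2, 2], dtype=int8)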
def noprep(dataset, dirt, numeric_features, categorical_features,
           delim=',', indexdrop=False):
    index_features = ['_dmIndex_', '_PartInd_']
    data = pd.read_csv(dirt + dataset + '.csv', delimiter=delim)  # pandas.DataFrame
    print(data.columns)
    data = data.astype({'_dmIndex_': 'int', '_PartInd_': 'int'})
    numeric_features = list(
        set(data.select_dtypes(include=["number"])) - set(index_features) -
        set(['y']))
    categorical_features = list(
        set(data.select_dtypes(exclude=["number"])) - set(['y']))

    index_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=-1))])
    y_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
        ('orden', OrdinalEncoder())])
    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='median'))])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(sparse=False))])
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('y', y_transformer, ['y']),
        ('index', index_transformer, index_features)])

    data = preprocessor.fit_transform(data)
    data = pd.DataFrame(data)
    col = data.columns.values
    print(col)
    X = data.drop(col[-3:], axis=1)
    X_train = data[data[col[-1]] > 0].drop(col[-3:], axis=1)
    # pd.DataFrame(X).to_csv('X_vanilla.csv')
    X_test = data[data[col[-1]] == 0].drop(col[-3:], axis=1)
    print(data.shape)

    # y = data["y"]
    # lb = preprocessing.LabelBinarizer()
    # y = lb.fit_transform(y)
    # data["y"] = data.where(data["y"] == 'yes', 1)
    # data["y"] = data.where(data["y"] == 'no', 0)
    y = data[col[-3]]
    y_train = data[data[col[-1]] > 0][col[-3]]
    y_test = data[data[col[-1]] == 0][col[-3]]

    feat_type = []
    xcol = X.columns.values
    for cl in xcol:
        if cl in categorical_features:
            feat_type.append(1)
        else:
            feat_type.append(0)
    features = numeric_features + categorical_features
    # X_train, X_test, y_train, y_test = \
    #     sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
    return data, X, y, X_train, y_train, X_test, y_test, feat_type, features
---

# Predictive Analysis

We will now begin the predictive analysis, using machine learning techniques
to estimate whether a passenger could have survived this tragedy.

The algorithm selected for this analysis is Random Forest. I chose this
algorithm because it is simple to implement, works well for classification,
and also because it offers an "explanation" of the decisions made and of the
most important features (we can inspect the feature importances and see which
features carry the most weight in the decision making).

To start, let's convert the categorical data to numeric, because machines
like numbers!
"""

# DATASET/TRAIN - Converting categorical data to numeric
df_treino['sexo'] = OrdinalEncoder().fit_transform(
    df_treino['sexo'].values.reshape((-1, 1)))
df_treino['poltrona'] = OrdinalEncoder().fit_transform(
    df_treino['poltrona'].values.reshape((-1, 1)))
df_treino['local_de_embarque'] = OrdinalEncoder().fit_transform(
    df_treino['local_de_embarque'].values.reshape((-1, 1)))
df_treino['sobrevivente'] = OrdinalEncoder().fit_transform(
    df_treino['sobrevivente'].values.reshape((-1, 1)))

# Converting float to int
# Age is kept as float, since estimated ages are in the xx.5 format
df_treino['sexo'] = df_treino['sexo'].apply(lambda x: int(x))
df_treino['poltrona'] = df_treino['poltrona'].apply(lambda x: int(x))
df_treino['local_de_embarque'] = df_treino['local_de_embarque'].apply(
    lambda x: int(x))
df_treino['sobrevivente'] = df_treino['sobrevivente'].apply(lambda x: int(x))
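# A compact alternative (a sketch; the behaviour should be identical, since
# OrdinalEncoder encodes each column independently): the four repeated blocks
# above collapse into a single fit_transform plus an integer cast.
from sklearn.preprocessing import OrdinalEncoder

cat_cols = ['sexo', 'poltrona', 'local_de_embarque', 'sobrevivente']
df_treino[cat_cols] = OrdinalEncoder().fit_transform(
    df_treino[cat_cols]).astype(int)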