def process(
    naive_file,
    treated_file,
    metadata_file,
    resistance_files,
    outfile,
    subtype="All",
    truncate=(41, 235),
):
    """Build the encoded treatment/resistance dataset and write it as TSV.

    Pipeline: read sequences + metadata, keep the requested subtype(s),
    fill gaps with consensus amino acids, one-hot encode, drop consensus
    features, derive label/DRM/resistance columns, join resistance scores
    and write everything to *outfile*.

    Parameters
    ----------
    naive_file, treated_file : sequence inputs passed to ``reader``.
    metadata_file : per-sequence metadata table for ``read_metadata``.
    resistance_files : input(s) for ``get_resistance_scores``.
    outfile : destination path for the tab-separated output.
    subtype : subtype(s) to keep; "All" keeps everything.
    truncate : (start, end) positions handed to ``reader``.
        Fix: was a mutable list default ``[41, 235]`` shared across calls;
        now an immutable tuple with the same values.
    """
    print("reading sequences and metadata")
    raw_sequences, consensus = reader(naive_file, treated_file, truncate)
    metadata = read_metadata(metadata_file)

    print(f"choosing {subtype} subtype(s)")
    chosen_sequences, dataset_subtypes = choose_subtype(
        raw_sequences, metadata, subtype)

    print("Filling with consensus AAs")
    AA_sequences = fill_consensus_AAs(chosen_sequences, consensus)
    freqs = get_single_AA_freqs(AA_sequences.drop("label", axis=1))
    single_AA_sequences = get_single_AAs(AA_sequences, freqs)

    print("OneHot encoding")
    columns_to_encode = single_AA_sequences.columns.drop("label")
    encoder = OneHotEncoder(use_cat_names=True,
                            handle_unknown="ignore",
                            cols=columns_to_encode.tolist())
    encoded_sequences = encoder.fit_transform(single_AA_sequences)

    print("removing consensus features")
    features_to_remove = get_features_to_remove(dataset_subtypes)
    total_sequences = encoded_sequences.drop(columns=features_to_remove,
                                             errors="ignore")

    # Binary treatment label: treated -> 1, naive -> 0.
    total_sequences["encoded_label"] = total_sequences["label"].apply({
        "treated": 1,
        "naive": 0
    }.get)
    # A sequence "has a DRM" if any known drug-resistance-mutation column fires.
    drms = get_all_DRMs()
    total_sequences["hasDRM"] = (total_sequences.filter(
        drms, axis=1).any(axis=1).astype(int))
    # Resistant = treated OR carries at least one DRM.
    total_sequences["is_resistant"] = (total_sequences[[
        "encoded_label", "hasDRM"
    ]].any(axis=1).astype(int))

    print("getting resistance scores")
    resistance_scores = get_resistance_scores(resistance_files)

    print("saving dataset to disk")
    joined = total_sequences.join(resistance_scores)
    joined.to_csv(outfile, sep="\t", index=True, header=True)
def one_hot_encoded_result(df_orig):
    """Append one-hot columns for 'ordinal_result' to a copy of *df_orig*.

    The encoder's generated column names have their trailing two characters
    stripped, and the new columns are appended in sorted-name order. The
    input frame itself is never modified.
    """
    base = df_orig.copy(deep=True)
    encoder = OneHotEncoder(cols=['ordinal_result'], use_cat_names=True)
    encoded = encoder.fit_transform(base[['ordinal_result']])
    # Drop the last two characters from every generated column name.
    rename_map = {name: name[:-2] for name in encoded.columns}
    encoded = encoded.rename(columns=rename_map)
    encoded = encoded.reindex(columns=sorted(encoded.columns))
    return pd.concat([base, encoded], axis=1)
def one_hot_encode(self, data):
    """One-hot encode the configured categorical columns.

    Vectorises ``self.columns`` in *data*, keeps the fitted encoder in
    ``self.model`` so the same transformation can be re-applied later,
    and returns the encoded dataset.

    :param data: dataset used for fitting (the ``data`` attribute of a
        ``Dataset`` instance).
    """
    # Fix: "inpute" was a typo — the option category_encoders understands
    # is "impute" (unknown categories get imputed instead of raising).
    oe = OneHotEncoder(cols=self.columns, handle_unknown="impute")
    oe_data = oe.fit_transform(data)
    self.model = oe
    return oe_data
def _encode_categories(self):
    """
    This private method stands for encoding categorical variables.
    Label encoding used for ordinal categories and one-hot encoding
    used for nominal categories.

    Mutates ``self.X`` and ``self.X_test`` in place: numeric columns are
    kept as-is, ordinal quality columns are mapped to integers, and the
    remaining (nominal) object columns are one-hot encoded. Encoders are
    fit on the training set only and reused on the test set.
    """
    logging.info(f'#{self._index()} - Encoding categorical columns...')
    # get column names for categorical and numerical features
    categorical_vars = self.X.select_dtypes(include='object').columns
    numerical_vars = self.X.columns.difference(categorical_vars)
    # Quality-style columns all share the NA/Po/Fa/TA/Gd/Ex scale, so they
    # are treated as ordinal rather than nominal.
    ordinal = pd.Index([
        'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
        'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
    ])
    nominal = categorical_vars.difference(ordinal)
    # NA (worst) .. Ex (best) mapped to 0..5.
    standard_mapping = {
        'NA': 0,
        'Po': 1,
        'Fa': 2,
        'TA': 3,
        'Gd': 4,
        'Ex': 5
    }
    mapping_for_ordinals = [{
        'col': column,
        'mapping': standard_mapping
    } for column in ordinal]
    x_num = self.X[numerical_vars]
    x_test_num = self.X_test[numerical_vars]
    # one hot encode categorical columns
    one_hot_encoder = OneHotEncoder(use_cat_names=True)
    label_encoder = OrdinalEncoder(drop_invariant=True,
                                   mapping=mapping_for_ordinals,
                                   handle_unknown='error')
    # Fit on train, reuse on test: guarantees identical output columns.
    x_cat_nom = one_hot_encoder.fit_transform(self.X[nominal])
    x_cat_ord = label_encoder.fit_transform(self.X[ordinal])
    x_test_cat_nom = one_hot_encoder.transform(self.X_test[nominal])
    x_test_cat_ord = label_encoder.transform(self.X_test[ordinal])
    self.X = x_num.join(x_cat_ord).join(x_cat_nom)
    self.X_test = x_test_num.join(x_test_cat_ord).join(x_test_cat_nom)
    # NOTE(review): `_index()` presumably increments the step counter and
    # `_step_index` re-reads it so both log lines share a number — confirm.
    logging.info(f'#{self._step_index} - DONE!')
def preproc_data(data, features):
    '''Simple preprocessing of *data*:

    * label-encode the target column
    * one-hot encode multi-valued categorical features
    * label-encode binary features
    * add ``<col>isNAN`` indicators and median-fill NaNs
    * split into X, y

    data: pd.DataFrame()
    features: dict mapping column name -> role (e.g. 'target', 'categorical')
    Returns: (X, y)
    '''
    # Find the target column (the item whose role is 'target').
    for item in features.items():
        if 'target' in item:
            target_col = item[0]
    data[target_col] = data[target_col].astype('category').cat.codes
    y = data[target_col]
    X = data.drop([target_col], axis=1)

    # Categorical features with more than 2 distinct values (NaN counted)
    # are candidates for one-hot encoding.
    cat_features = []
    for feature in features.items():
        if ('categorical' in feature) and (X[feature[0]].nunique(dropna=False) > 2):
            cat_features.append(feature[0])

    # Label-encode binary features (< 3 distinct values incl. NaN); a
    # binary column no longer needs one-hot encoding.
    for feature in X.columns:
        if (X[feature].nunique(dropna=False) < 3):
            X[feature] = X[feature].astype('category').cat.codes
            if len(cat_features) > 0:
                if feature in cat_features:
                    cat_features.remove(feature)

    # One Hot Encoding of the remaining categorical features.
    if len(cat_features) > 0:
        encoder = OneHotEncoder(cols=cat_features, drop_invariant=True)
        X = encoder.fit_transform(X)

    # NaNs: flag them with an indicator column, then fill with the median.
    nan_columns = list(X.columns[X.isnull().sum() > 0])
    if nan_columns:
        for nan_column in nan_columns:
            X[nan_column + 'isNAN'] = pd.isna(X[nan_column]).astype('uint8')
        # Fix: `X[nan_columns].fillna(..., inplace=True)` mutated a
        # temporary copy and left the NaNs in X; assign the result back.
        X[nan_columns] = X[nan_columns].fillna(X[nan_columns].median())
    return (X, y)
def main():
    """Load bank.csv, encode features, split, then train/evaluate XGBoost.

    The decision-tree and random-forest experiments below are kept as
    commented-out code by the original author (assignment scaffolding).
    """
    # Preprocess the data
    # start your code here
    # Load data
    data = pd.read_csv("bank.csv")
    # Fix typo in column name
    data.rename(columns={"subcribed": "subscribed"}, inplace=True)
    # Encoding features: yes/no answers become 1/0 across the whole frame.
    data = data.replace({"yes": 1, "no": 0})
    ohe = OneHotEncoder(
        cols=["job", "marital", "education", "contact", "month", "poutcome"],
        use_cat_names=True,
        return_df=True,
    )
    data = ohe.fit_transform(data)
    # print(data.head())
    # Get features and target
    X = data.drop(columns=["subscribed"])
    y = data["subscribed"]
    # Split training and testing data (70/30, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=100)
    # end your code here

    # -----------------------------------------------------------------
    # NOTE(review): the original file contains large commented-out blocks
    # here: a DecisionTreeClassifier(max_depth=4, max_leaf_nodes=4,
    # random_state=100) experiment with tree plotting, and a
    # RandomForestClassifier(criterion="entropy", max_depth=9,
    # max_leaf_nodes=21, min_samples_leaf=5, random_state=100) experiment
    # with feature-importance reporting. Both fit on (X_train, y_train),
    # predict on X_test and call evaluate(y_test, y_pred). Kept disabled.
    # -----------------------------------------------------------------

    print(
        "\n\nXGBoost: -------------------------------------------------------------------------\n\n"
    )
    # start your code here
    xgb_classifier = xgb.XGBClassifier(
        objective="binary:logistic",
        learning_rate=0.1,
        max_depth=3,
        min_child_weight=5,
        use_label_encoder=False,
        colsample_bytree=0.3,
    )
    xgb_classifier.fit(X_train, y_train)
    y_pred_xgb = xgb_classifier.predict(X_test)
    evaluate(y_test, y_pred_xgb)
score = roc_auc_score(y_test, y_pred) return score ######### Creating objects for 2 classification models. logit = LogisticRegression(random_state=SEED) rf = RandomForestClassifier(random_state=SEED) ################################################################################################### ######### Apply One Hot Encoding from category_encoders import OneHotEncoder onehot_enc = OneHotEncoder(cols=X_Columns) onehot_enc.fit(X_train, y_train) print('Original number of features: \n', X_train.shape[1], "\n") data_ohe_train = onehot_enc.fit_transform(X_train) data_ohe_test = onehot_enc.transform(X_test) print('Features after OHE: \n', data_ohe_train.shape[1]) ######### Logistic Regression onehot_logit_score = get_score(logit, data_ohe_train, y_train, data_ohe_test, y_test) print('Logistic Regression score with One hot encoding:', onehot_logit_score) ######### Random Forest onehot_rf_score = get_score(rf, data_ohe_train, y_train, data_ohe_test, y_test) print('Random Forest score with One hot encoding:', onehot_logit_score) ################################################################################################### ######### Apply Hashing Encoding from category_encoders import HashingEncoder
def aggregate_per_time_interval(date_interval):
    """Aggregate the Olist CSV datasets into one row per customer.

    Only orders whose purchase timestamp lies in
    ``[date_interval[0], date_interval[1])`` are kept. Order items,
    payments and reviews are aggregated per order, merged into the orders
    table, then everything is aggregated per ``customer_unique_id``
    (numeric columns by mean, the product category by its mode).

    :param date_interval: pair of date-like bounds (lower inclusive,
        upper exclusive).
    :return: per-customer aggregate DataFrame.
    """
    ### Importing
    customer_data = pd.read_csv('Data/olist_customers_dataset.csv')
    geolocation_data = pd.read_csv('Data/olist_geolocation_dataset.csv')
    order_items_data = pd.read_csv('Data/olist_order_items_dataset.csv')
    order_payments_data = pd.read_csv('Data/olist_order_payments_dataset.csv')
    order_reviews_data = pd.read_csv('Data/olist_order_reviews_dataset.csv')
    olist_order_data = pd.read_csv('Data/olist_orders_dataset.csv')
    olist_products_data = pd.read_csv('Data/olist_products_dataset.csv')
    olist_sellers_data = pd.read_csv('Data/olist_sellers_dataset.csv')
    olist_product_category_data = pd.read_csv(
        'Data/product_category_name_translation.csv')

    ### Converts column of interest to datetime format
    olist_order_data['order_purchase_timestamp'] = pd.to_datetime(
        olist_order_data['order_purchase_timestamp'])

    ### Keeps dates that are between the given date limits
    mask = (olist_order_data['order_purchase_timestamp'] >=
            date_interval[0]) & (olist_order_data['order_purchase_timestamp'] <
                                 date_interval[1])
    olist_order_data = olist_order_data[mask]

    ### Rest of function is the same as in first notebook of the project

    ### Olist_products_dataset merge to get product category name in english
    olist_products_data = olist_products_data.merge(
        olist_product_category_data, how='left', on='product_category_name')

    ### Merge order items dataset with products dataset
    order_items_data = order_items_data.merge(olist_products_data,
                                              how='left',
                                              on='product_id')

    ### Count number of occurrences for each order ID
    count = order_items_data.groupby('order_id').count().iloc[:, 0].rename(
        'n_items per order')

    ### Numeric data will be aggregated by mean
    num_order_items_data = pd.concat([
        order_items_data['order_id'],
        order_items_data.select_dtypes('float64')
    ],
                                     axis=1)
    num_order_items_data = num_order_items_data.groupby('order_id').mean()

    ### Aggregate each order's products category names by its most frequent value
    cat_order_items_data = order_items_data[[
        'order_id', 'product_category_name_english'
    ]].groupby('order_id').agg(lambda g: g.value_counts().index[0]
                               if np.any(g.notnull()) else np.nan)

    order_items_data = pd.concat(
        [count, num_order_items_data, cat_order_items_data], axis=1)
    olist_order_data = olist_order_data.merge(order_items_data,
                                              how='left',
                                              on='order_id')

    ### Count number of payments per order
    count = order_payments_data.groupby('order_id').count().iloc[:, 0].rename(
        'n_payments per order')

    ### One hot encode payment type feature
    enc = OneHotEncoder(cols=['payment_type'], use_cat_names=True)
    order_payments_data = enc.fit_transform(order_payments_data)
    # 'not_defined' payment rows carry no information, so the column is dropped.
    order_payments_data = order_payments_data.drop('payment_type_not_defined',
                                                   axis=1)
    order_payments_data = order_payments_data.groupby('order_id').mean()
    order_payments_data = pd.concat([order_payments_data, count], axis=1)
    olist_order_data = olist_order_data.merge(order_payments_data,
                                              how='left',
                                              on='order_id')

    ### Number of reviews per order
    count = order_reviews_data.groupby('order_id').count().iloc[:, 0].rename(
        'n_reviews per order').astype('float64')
    order_reviews_data = order_reviews_data[['order_id', 'review_score'
                                             ]].groupby('order_id').mean()
    order_reviews_data = pd.concat([count, order_reviews_data], axis=1)
    olist_order_data = olist_order_data.merge(order_reviews_data,
                                              how='left',
                                              on='order_id')

    ### Merging customer table with order tables
    customer_data = customer_data.merge(olist_order_data,
                                        how='inner',
                                        on='customer_id')

    ### Customer data aggregation
    count = customer_data.groupby(
        'customer_unique_id').count().iloc[:, 0].rename('n_orders per customer')

    ### Numeric features aggregated by mean
    numeric_customer_data = pd.concat([
        customer_data.select_dtypes('float64'),
        customer_data['customer_unique_id']
    ],
                                      axis=1)
    numeric_customer_data = numeric_customer_data.groupby(
        'customer_unique_id').mean()

    ### Categorical features aggregated by most frequent value
    cat_customer_data = customer_data[[
        'customer_unique_id', 'product_category_name_english'
    ]].groupby('customer_unique_id').agg(lambda g: g.value_counts().index[0]
                                         if np.any(g.notnull()) else np.nan)

    customer_data = pd.concat(
        [count, numeric_customer_data, cat_customer_data], axis=1)
    return customer_data
def doCleanupEncode(X, y=None, cat=None, oh=None, binary=None, loo=None,
                    woe=None, lp_cols=None, NoData=True):
    """Clean and encode the feature matrix *X*.

    Depending on which column lists are supplied, applies one-hot
    (``oh``), binary (``binary``), weight-of-evidence (``woe``) and
    leave-one-out (``loo``) encoding; ``lp_cols`` are dropped at the end.
    ``NoData=True`` keeps missing/unknown values as indicator categories;
    ``NoData=False`` first maps null/NaN/blank/9 values to zero.

    :param y: target series, required for WOE and leave-one-out encoding.
    :return: the transformed DataFrame (index reset).
    """
    from enrich import replaceCVs
    from enrich import one_hot_encode
    from category_encoders import BinaryEncoder
    from category_encoders import OneHotEncoder
    from category_encoders import WOEEncoder
    from category_encoders import LeaveOneOutEncoder

    if NoData is False:
        # Fix: the original condition `cat is not None | oh is not None`
        # parses as `cat is not (None | oh) ...` because `|` binds tighter
        # than `is not`, raising TypeError at runtime; `or` is intended.
        if cat is not None or oh is not None:
            # translate associated columns' null, NaN, blank and 9 values to zero
            X = replaceCVs(X, cat + oh, [np.nan, 9, "", " "], 0)

    if oh is not None:
        if NoData:
            ec = OneHotEncoder(cols=oh,
                               use_cat_names=True,
                               return_df=True,
                               handle_unknown='indicator',
                               handle_missing='indicator')
            # fit_transform both fits and transforms; the extra .fit(X)
            # call in the original was redundant.
            X = ec.fit_transform(X)
        else:
            # one-hot encode, then drop 0 column if created
            for oh_c in oh:
                X = one_hot_encode(X, oh_c, False)
                X.drop(0, axis=1, errors='ignore', inplace=True)

    if binary is not None:
        # binary encode binary columns
        if NoData:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True,
                                handle_unknown='indicator').fit(X)
        else:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True).fit(X)
        X = enc.transform(X)

    if woe is not None:
        # use weight of evidence on woe columns
        for col in woe:
            X[col] = X[col].fillna('NoData')
        wenc = WOEEncoder(cols=woe).fit(X, y)
        X = wenc.transform(X).round(2)

    if loo is not None:
        # use leave one out on loo columns
        for col in loo:
            X[col] = X[col].fillna('NoData')
        lenc = LeaveOneOutEncoder(cols=loo, return_df=True).fit(X, y)
        X = lenc.transform(X).round(2)

    if lp_cols is not None:
        # drop least predictive columns
        X.drop(lp_cols, axis=1, errors="ignore", inplace=True)

    X.reset_index(drop=True, inplace=True)
    return X
# Exploratory preprocessing script: load, inspect, encode, visualize.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
train.head()
train.info()

test_id = test['id']  # save for submission
del train['id']
del test['id']

train['type'].unique(), train['color'].unique()

sns.violinplot(x='bone_length', y='type', data=train)
sns.boxplot(x='hair_length', y='type', data=train)
sns.pairplot(train)

from category_encoders import OneHotEncoder
encoder = OneHotEncoder(cols=['color'], use_cat_names=True)
train = encoder.fit_transform(train)
# Fix: the encoder must be fit on the training data only. Calling
# fit_transform on the test set refit it, which can produce columns that
# differ from / are ordered differently than the training columns.
test = encoder.transform(test)
train.head()

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(train['type'])
print(encoder.classes_)
train['type_no'] = encoder.transform(train['type'])
train.head()

sns.heatmap(train.corr(), xticklabels=list(train), yticklabels=list(train))

target = train['type_no']  # for visualizations
target_string = train['type']  # for final predictions
def oneHotEncoding(df, column):
    """Return *df* with *column* one-hot encoded via category_encoders."""
    from category_encoders import OneHotEncoder
    one_hot = OneHotEncoder(cols=[column])
    return one_hot.fit_transform(df)
def _encode_categories(self):
    """
    This private method stands for encoding categorical variables.
    Label encoding used for ordinal categories and one-hot encoding used
    for nominal categories.

    Mutates ``self.X`` and ``self.X_test`` in place: engineers Titanic
    features (Title, Family size, Alone, binned Fare/Age), logs survival
    ratios per column, then one-hot encodes Title and Embarked with an
    encoder fit on the training set only.
    """
    logging.info(f'#{self._index()} - Encoding categorical columns...')

    def encode(data):
        # Per-frame feature engineering, applied identically to train/test.
        # encode Sex column (True == male)
        data['Sex'] = data['Sex'] == 'male'
        # encode Name column: "Family name, Title. Given names"
        name_cols = data['Name'].apply(lambda x: pd.Series(
            [str(x).split(",")[0], str(x).split(", ")[1].split(".")[0]],
            index=['Family name', 'Title']))
        data = data.join(name_cols)
        # identify Titles with same meaning
        data['Title'].replace({
            'Mlle': 'Miss',
            'Ms': 'Miss',
            'Mme': 'Mrs'
        },
                              inplace=True)
        # group rare Titles (fewer than 10 occurrences) under 'Misc'
        title_names = (data['Title'].value_counts() < 10)
        data['Title'] = data['Title'].apply(lambda x: 'Misc'
                                            if title_names.loc[x] else x)
        # create Family size and Alone column from SibSp, Parch cols
        data['Family size'] = data['SibSp'] + data['Parch'] + 1
        data['Alone'] = data['Family size'] == 1
        # make 5 equal size groups from Fares (quantile bins)
        data['Fare'] = pd.qcut(data['Fare'], 5, labels=False)
        # make 5 groups from Ages (equal-width bins)
        data['Age'] = pd.cut(data['Age'], 5, labels=False)
        # rename columns and delete unnecessary features
        data = data.rename(columns={
            'Sex': 'Male',
            'Fare': 'FareBins',
            'Age': 'AgeBins'
        })
        data.drop(['Name', 'SibSp', 'Parch'], axis=1, inplace=True)
        return data

    self.X = encode(self.X)
    self.X_test = encode(self.X_test)

    # Log the survival percentage grouped by every non-float column.
    for col in self.X.columns:
        if self.X[col].dtype != 'float64':
            table = self.X.join(self.y)[[col, 'Survived'
                                         ]].groupby(col,
                                                    as_index=False).mean()
            table['Survived'] = (table['Survived'] * 100).map(
                '{:.2f} %'.format)
            logging.info(
                f'Survival ratio by: {col}\n{table}\n{"-" * 10}\n')

    # One-hot encode Title and Embarked; fit on train, reuse on test.
    one_hot_encoder = OneHotEncoder(use_cat_names=True)
    one_hot_columns = one_hot_encoder.fit_transform(
        self.X[['Title', 'Embarked']])
    one_hot_columns_test = one_hot_encoder.transform(
        self.X_test[['Title', 'Embarked']])
    self.X = self.X.join(one_hot_columns)
    self.X_test = self.X_test.join(one_hot_columns_test)
    self.X.drop(['Family name', 'Title', 'Embarked'], axis=1, inplace=True)
    self.X_test.drop(['Family name', 'Title', 'Embarked'],
                     axis=1,
                     inplace=True)
    # NOTE(review): `_index()` presumably increments the step counter and
    # `_step_index` re-reads it so both log lines share a number — confirm.
    logging.info(f'#{self._step_index} - DONE!')
# x: data will receive all columns except the classe (income) as inplace=Flase will not drop the column, y: classes x = df.drop('income', axis=1, inplace=False) y = df.income # TRAIN and TEST DATA # split train and test 70x30 x_train, x_test, y_train, y_test = train_test_split(x, y) # select non numeric columns - decision tree #var = df.select_dtypes(include='object') # print(var.head()) df.select_dtypes(include='object') # OneHotEncoder: Categorical column to integer ohe = OneHotEncoder(use_cat_names=True) x_train = ohe.fit_transform(x_train) x_train.head() # StandardScaler - pre-processor to put numerical column in the same scale scaler = StandardScaler().fit(x_train) scaler values_scale = scaler.transform(x_train) values_scale[:10] x_train = scaler.transform(x_train) # generate the model - could be any model # instance of the classifier decision tree and train the model clf_tree = tree.DecisionTreeClassifier() clf_tree = clf_tree.fit(x_train, y_train)
class Encoder():
    """Fit and apply ordinal / one-hot / count / target encoders.

    Feature-to-encoder assignment is supplied at fit time via
    ``method_mapper``. Ordinal and one-hot encoders are fit on train+val
    concatenated; count and target encoders are fit on train only (they
    must not see validation targets). Encoders and their feature lists can
    be saved to / loaded from ``logdir``.
    """

    encode_methods = {
        'OrdinalEncoder': OrdinalEncoder,
        'OneHotEncoder': OneHotEncoder,
        'CountEncoder': CountEncoder,
        'TargetEncoder': TargetEncoder,
    }

    # spark_encode_methods = {
    #     'mean_encoder':,
    #     'target_encoder':,
    #     'label_encoder':,
    #     'onehot_encoder'
    # }

    # NOTE (translated): target/mean encoders must NOT be fit on the
    # concatenation of train and validation sets; label/one-hot encoders may.

    def __init__(self,
                 sparksess=None,
                 logdir='/encoder',
                 handle_unknown='-99999',
                 save_encoder=False):
        self.spark = sparksess
        self.logdir = logdir
        # Fix: the original line was the bare expression `self.save_encoder`
        # (a no-op attribute access), so the flag was silently dropped. It
        # is stored under a distinct name so it does not shadow the
        # save_encoder() method.
        self.save_encoder_flag = save_encoder

        self.ordinal_encoder_features = []
        self.onehot_encoder_features = []
        self.count_encoder_features = []
        self.target_encoder_features = []

        # The encoders keep references to the feature lists above, which
        # are populated later in fit().
        self.ordinal_encoder = OrdinalEncoder(
            cols=self.ordinal_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)
        self.onehot_encoder = OneHotEncoder(cols=self.onehot_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)
        self.count_encoder = CountEncoder(cols=self.count_encoder_features,
                                          return_df=True,
                                          handle_unknown=handle_unknown)
        self.target_encoder = TargetEncoder(cols=self.target_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)

    def fit(self, x_train, x_val=None, y_train=None, y_val=None,
            method_mapper=None):
        """Fit all configured encoders and return the encoded splits.

        Parameters
        ----------
        x_train: pd.DataFrame
        x_val: pd.DataFrame
        y_train: pd.DataFrame
        y_val: pd.DataFrame
        method_mapper: dict
            mapping of feature name -> encoder name, e.g.
            {'feature1': 'OrdinalEncoder', 'feature2': 'OneHotEncoder',
             'feature3': 'CountEncoder', 'feature4': 'TargetEncoder'}

        Returns (x_train, y_train, x_val, y_val).
        """
        for feat in method_mapper:
            if method_mapper[feat] == 'OrdinalEncoder':
                self.ordinal_encoder_features.append(feat)
            elif method_mapper[feat] == 'OneHotEncoder':
                self.onehot_encoder_features.append(feat)
            elif method_mapper[feat] == 'CountEncoder':
                self.count_encoder_features.append(feat)
            elif method_mapper[feat] == 'TargetEncoder':
                self.target_encoder_features.append(feat)
            else:
                raise ValueError(
                    '编码方式只支持[OrdinalEncoder, OneHotEncoder, CountEncoder, TargetEncoder], 接收到%s' %
                    feat)

        if self.spark is None:
            if len(self.ordinal_encoder_features) != 0 or len(
                    self.onehot_encoder_features) != 0:
                # Ordinal/one-hot may be fit on train+val combined.
                x_whole = x_train.append(x_val)
                y_whole = None
                if y_train is not None and y_val is not None:
                    y_whole = y_train.append(y_val)
                x_whole = self.ordinal_encoder.fit_transform(x_whole, y_whole)
                x_whole = self.onehot_encoder.fit_transform(x_whole, y_whole)
                x_train = x_whole[:len(x_train)]
                x_val = x_whole[len(x_train):]

            # Count/target encoders are fit on the training split only.
            x_train = self.count_encoder.fit_transform(x_train, y_train)
            x_val = self.count_encoder.transform(x_val, y_val)
            x_train = self.target_encoder.fit_transform(x_train, y_train)
            x_val = self.target_encoder.transform(x_val, y_val)

        # Fix: the original tested `if self.save_encoder:` — a bound method,
        # which is always truthy — so encoders were saved unconditionally.
        if self.save_encoder_flag:
            self.save_encoder()
        return x_train, y_train, x_val, y_val

    def transform(self, x, y=None):
        """Apply all fitted encoders to *x*; returns (x, y)."""
        x = self.ordinal_encoder.transform(x, y)
        x = self.onehot_encoder.transform(x, y)
        x = self.count_encoder.transform(x, y)
        x = self.target_encoder.transform(x, y)
        return x, y

    def fit_transform(self, x_train, x_val=None, y_train=None, y_val=None,
                      method_mapper=None):
        """Fit on the given splits, then transform both; see fit().

        Parameters
        ----------
        x_train: pd.DataFrame
        x_val: pd.DataFrame
        y_train: pd.DataFrame
        y_val: pd.DataFrame
        method_mapper: dict
            mapping of feature name -> encoder name (see fit()).
        """
        self.fit(x_train, x_val, y_train, y_val, method_mapper)
        x_train, y_train = self.transform(x_train, y_train)
        if x_val is not None:
            x_val, y_val = self.transform(x_val, y_val)
        return x_train, y_train, x_val, y_val

    def save_encoder(self):
        """Pickle the four encoders and dump their feature lists as JSON
        into a timestamped subdirectory of ``self.logdir``."""
        now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
        os.makedirs(os.path.join(self.logdir, now))
        with open(os.path.join(self.logdir, now, 'OrdinalEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.ordinal_encoder, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.onehot_encoder, f)
        with open(os.path.join(self.logdir, now, 'CountEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.count_encoder, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.target_encoder, f)
        with open(
                os.path.join(self.logdir, now, 'OrdinalEncoderFeatures.json'),
                'w') as f:
            json.dump(self.ordinal_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.onehot_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'CountEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.count_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.target_encoder_features, f)

    def load_encoder(self, logdir=None):
        """Restore encoders and feature lists saved by save_encoder().

        Fix: the original opened every file in write mode and *dumped*,
        overwriting the saved state instead of loading it, and ignored the
        ``logdir`` argument. Note: only unpickle files you trust.
        """
        logdir = self.logdir if logdir is None else logdir
        with open(os.path.join(logdir, 'OrdinalEncoder.pkl'), 'rb') as f:
            self.ordinal_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'OneHotEncoder.pkl'), 'rb') as f:
            self.onehot_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'CountEncoder.pkl'), 'rb') as f:
            self.count_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'TargetEncoder.pkl'), 'rb') as f:
            self.target_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'OrdinalEncoderFeatures.json'),
                  'r') as f:
            self.ordinal_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'OneHotEncoderFeatures.json'),
                  'r') as f:
            self.onehot_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'CountEncoderFeatures.json'),
                  'r') as f:
            self.count_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'TargetEncoderFeatures.json'),
                  'r') as f:
            self.target_encoder_features = json.load(f)
class CombineUmap(object):
    """Fit separate UMAP embeddings for numeric and categorical columns.

    Numeric columns are scaled and embedded with one mapper, categorical
    columns are encoded and embedded with another; the fitted mappers can
    then be combined via UMAP's mapper algebra (intersection / union /
    contrast) with make_new_mapper().
    """

    # filled by encode_cat(): names of the encoded categorical columns
    trans_cat_names = None
    # fitted UMAP mappers (None until fit()/make_new_mapper() run)
    numeric_mapper = None
    cat_mapper = None
    intersection_mapper = None
    union_mapper = None
    contrast_mapper = None

    def __init__(self,
                 num_info: Dict[str, str],
                 cat_info: Dict[str, str],
                 target: str,
                 **kwargs) -> None:
        # num_info / cat_info look like {"method": <name>, "cols": [...]}
        super().__init__()
        self.num_info = num_info
        self.cat_info = cat_info
        self.target = target
        self.assign_num_scaler()
        self.assign_cat_scaler()

    def assign_num_scaler(self, ):
        # Pick the numeric scaler from num_info; Empty() acts as a no-op.
        self.num_method = self.num_info.get("method", None)
        self.num_cols = self.num_info.get("cols", [])
        if self.num_method is None:
            self.num_scaler = Empty()
        elif self.num_method == "RobustScaler":
            self.num_scaler = RobustScaler()
        else:
            # (Korean: "the rest is not implemented yet")
            raise NotImplementedError("아직 나머지 구현 안함")

    def assign_cat_scaler(self, ):
        # Pick the categorical encoder from cat_info; Empty() is a no-op.
        self.cat_method = self.cat_info.get("method", None)
        self.cat_cols = self.cat_info.get("cols", [])
        if self.cat_method is None:
            self.cat_encoder = Empty()
        elif self.cat_method == "OrdinalEncoder":
            self.cat_encoder = OrdinalEncoder(cols=self.cat_cols)
        elif self.cat_method == "OneHotEncoder":
            self.cat_encoder = OneHotEncoder(cols=self.cat_cols)
        else:
            raise NotImplementedError("아직 나머지 구현 안함")

    def fit(self,
            df: pd.DataFrame,
            num_kwargs={"n_neighbors": 15, "random_state": 42, "n_jobs": 20},
            cat_kwargs={"metric": "dice", "n_neighbors": 150,
                        "random_state": 42, "n_jobs": 20}):
        # NOTE(review): mutable default arguments — shared between calls,
        # but only read here, never mutated.
        if self.num_cols != []:
            df = self.scale_num(df)
            self.numeric_mapper = umap.UMAP(**num_kwargs).fit(
                df[self.num_cols])
        if self.cat_cols != []:
            df = self.encode_cat(df)
            self.cat_mapper = umap.UMAP(**cat_kwargs).fit(
                df[self.trans_cat_names])
        return self

    def transform(self, df: pd.DataFrame):
        # Returns [numeric_embedding | None, categorical_embedding | None].
        result = [None, None]
        if self.num_cols != []:
            result[0] = self.num_transform(df)
        if self.cat_cols != []:
            result[1] = self.cat_transform(df)
        return result

    def make_new_mapper(self, ):
        # Combine the two fitted mappers using UMAP's mapper algebra.
        if (self.numeric_mapper is not None) & (self.cat_mapper is not None):
            self.intersection_mapper = self.numeric_mapper * self.cat_mapper
            self.union_mapper = self.numeric_mapper + self.cat_mapper
            self.contrast_mapper = self.numeric_mapper - self.cat_mapper
            print("make new mapper 1.intersection_mapper 2.union_mapper 3.contrast_mapper")
        return self

    def num_transform(self, df: pd.DataFrame):
        df = self.scale_num(df)
        return self.numeric_mapper.transform(df[self.num_cols])

    def cat_transform(self, df: pd.DataFrame):
        df = self.encode_cat(df)
        return self.cat_mapper.transform(df[self.trans_cat_names])

    def vis_basic(self, embedding: np.array, target=None, classes=None):
        # Scatter a 2-D embedding, optionally colored by target with a
        # discrete colorbar labelled by *classes*.
        _, ax = plt.subplots(1, figsize=(14, 10))
        plt.scatter(*embedding.T, s=0.3, c=target, cmap='Spectral', alpha=1.0)
        plt.setp(ax, xticks=[], yticks=[])
        if classes is not None:
            num_class = len(classes)
            cbar = plt.colorbar(boundaries=np.arange(num_class + 1) - 0.5)
            cbar.set_ticks(np.arange(num_class))
            cbar.set_ticklabels(classes)
        plt.title('Embedded via UMAP');
        return self

    def vis_diagnostic(self, mapper, diagnostic_type="vq"):
        """Plot a UMAP diagnostic for *mapper*.

        diagnostic_type: one of "vq", "local_dim", "neighborhood", "pca".
        """
        return umap.plot.diagnostic(mapper, diagnostic_type=diagnostic_type)

    def vis_interactive(self, mapper, **kwargs):
        # Interactive (notebook) plot of the mapper's embedding.
        umap.plot.output_notebook()
        p = umap.plot.interactive(mapper, **kwargs)
        umap.plot.show(p)
        return self

    def vis_connectivity(self, mapper, **kwargs):
        # e.g. edge_bundling='hammer'
        return umap.plot.connectivity(mapper, show_points=True, **kwargs)

    def vis_points(self, mapper, values=None, **kwargs):
        """Scatter the mapper's embedding, optionally colored by *values*.

        kwargs example: {"theme": "fire", "background": "black"}
        """
        return umap.plot.points(mapper, values=values, **kwargs)

    def scale_num(self, df: pd.DataFrame):
        # NOTE(review): fit_transform refits the scaler on every call —
        # including at transform time — confirm this is intended.
        df[self.num_cols] = self.num_scaler.fit_transform(df[self.num_cols])
        return df

    def encode_cat(self, df: pd.DataFrame):
        # NOTE(review): also refits the encoder on every call; records the
        # resulting encoded column names in trans_cat_names.
        df = self.cat_encoder.fit_transform(df)
        if self.cat_method == "OneHotEncoder":
            self.trans_cat_names = self.get_onehot_names()
        elif self.cat_method == "OrdinalEncoder":
            self.trans_cat_names = self.get_label_names()
        return df

    def get_label_names(self, ):
        return self.cat_encoder.cols

    def get_onehot_names(self, ):
        # One-hot output columns are named "<original>_<category>", so
        # keep every feature name prefixed by one of the encoded columns.
        onehot_names = [
            feature_name for feature_name in self.cat_encoder.feature_names
            if any([
                re.search(f"^{i}_", feature_name)
                for i in self.cat_encoder.cols
            ])
        ]
        return onehot_names
# Convert price from object to float nyc["price"] = nyc["price"].replace('[\$,]', '', regex=True).astype(float) # Convert bathrooms, bedrooms, and beds from float to int nyc = nyc.astype({'bathrooms': int, 'bedrooms': int, 'beds': int}) # Create target vector y = nyc[target] # Create feature matrix X = nyc[features] # Instantiate and apply One Hot Encoder to room_type ohe = OneHotEncoder(use_cat_names=True) X_transform = ohe.fit_transform(X) # Instantiate model model = LinearRegression() # Fit model model_lr = model.fit(X_transform, y) def predict_price(user_input): # Store prediction user_input_ = np.array(user_input) user_input = user_input_.reshape(1, -1) prediction = model_lr.predict(user_input) return prediction
#X_train_encoded.head() X = dataset[:,0:5] y = dataset[:,5] y=np.reshape(y, (-1,1)) scaler_x = MinMaxScaler() scaler_y = MinMaxScaler() #scaler_y.fit(y) #yscale=scaler_y.transform(y) #X = StandardScaler().fit_transform(X) #y = StandardScaler().fit_transform(y.reshape(len(y),1))[:,0] X_train, X_test, y_train, y_test = train_test_split(X, y) X_train_encoded = encoder.fit_transform(X_train) scaler_x.fit(X_train_encoded) xscale=scaler_x.transform(X_train_encoded) '''model = Sequential() model.add(Dense(12, input_dim=5, kernel_initializer='normal', activation='relu')) model.add(Dense(8, activation='relu')) model.add(Dense(1, activation='linear')) model.compile(loss='mse', optimizer='adam', metrics=['mse','mae']) history = model.fit(X_train, y_train, epochs=150, batch_size=50, verbose=1, validation_split=0.2)''' class Item(BaseModel): """Use this data model to parse the request body JSON.""" Strain:str = Field(..., example="13-Dawgs") Type: str = Field(..., example="sativa")