def sector_filtering(portfolio):
    """Recommend market companies that share sector / revenue clusters with a portfolio.

    NOTE(review): depends on module-level globals `df_reduced`, `df_dummy`,
    `df_id` and `num_components` — confirm they are defined before calling.

    Parameters
    ----------
    portfolio : DataFrame with an 'id' column identifying current customers.

    Returns
    -------
    market : list of candidate company ids, best similarity score first.
    test : list of portfolio ids left out of the 70% sample (evaluation set).
    """
    # Integer cluster labels from the sector and estimated-revenue-bracket categories.
    df_reduced['labels'] = OrdinalEncoder(cols=['setor']).fit_transform(df_dummy['setor'])
    df_reduced['labels2'] = OrdinalEncoder(cols=['de_faixa_faturamento_estimado_grupo']).fit_transform(df_dummy['de_faixa_faturamento_estimado_grupo'])
    X = pd.concat([df_id,df_reduced], axis='columns')
    # portfolio information
    pf_filled = X.loc[X['id'].isin(portfolio['id'].values)]
    # part of the market that shares the same clusters
    pf_out = X.loc[X['labels'].isin(list(pf_filled['labels'].unique()))]
    pf_out = pf_out.loc[X['labels2'].isin(list(pf_filled['labels2'].unique()))]
    # customer that are not yet on the company's portfolio
    # num_comp-1 for it not to account for the labels in the dot product
    sample = pf_filled.iloc[:,:num_components-1].sample(frac=0.7, random_state=42)
    pf_rec = pf_out.loc[~pf_out['id'].isin(sample['id'])]
    # num_comp-1 for it not to account for the labels in the dot product
    pf_rec = pf_rec.iloc[:,:num_components-1]
    # Similarity of every candidate row against every sampled portfolio row.
    cosine_sim = cosine_similarity(pf_rec.drop(['id'],axis='columns'),sample.drop(['id'],axis='columns'))
    cosine_sim = np.sum(cosine_sim, axis=1)  # best results with sum. amax and mean already tested
    pf_rec['score'] = list(cosine_sim)
    # list new leads to recommend
    market = list(pf_rec.sort_values('score', ascending=False)['id'])
    test = list(pf_filled.loc[~pf_filled['id'].isin(sample['id'])]['id'])
    return market, test
def models_to_compare(self) -> Dict[ModelName, Dict]:
    """Build the zoo of model pipelines compared in the benchmark.

    Returns
    -------
    Mapping ModelName -> {TaskName -> sklearn Pipeline}, optionally carrying a
    FIT_PARAMS entry of extra keyword arguments forwarded to ``fit``.

    Fix: sklearn pipeline step names must be strings, so the ordinal-encoder
    steps now use ``ModelName.ORDINAL_ENCODER.value`` — the bare enum member
    was being passed before, inconsistent with every other step in this dict.
    """
    lightgbm_step_categorical_features_params = f"{ModelName.LIGHTGBM.value}__{CATEGORICAL_FEATURE}"
    return {
        ModelName.CATBOOST: {
            # CatBoost consumes raw categoricals directly via cat_features.
            TaskName.CLASSIFICATION: Pipeline([(ModelName.CATBOOST.value, CatBoostClassifier(
                cat_features=self.categorical_features_indices, verbose=0))]),
            TaskName.REGRESSION: Pipeline([(ModelName.CATBOOST.value, CatBoostRegressor(
                cat_features=self.categorical_features_indices, verbose=0))])
        },
        ModelName.LIGHTGBM: {
            TaskName.CLASSIFICATION: Pipeline([(ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                                               (ModelName.LIGHTGBM.value, LGBMClassifier())]),
            TaskName.REGRESSION: Pipeline([(ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                                           (ModelName.LIGHTGBM.value, LGBMRegressor())]),
            # LightGBM additionally needs the categorical feature names at fit time.
            FIT_PARAMS: {
                lightgbm_step_categorical_features_params: self.categorical_features
            }
        },
        ModelName.LIGHTGBM_WITH_CATBOOST_ENCODER: {
            TaskName.CLASSIFICATION: Pipeline([(ModelName.CATBOOST_ENCODER.value, CatBoostEncoder()),
                                               (ModelName.LIGHTGBM.value, LGBMClassifier())]),
            TaskName.REGRESSION: Pipeline([(ModelName.CATBOOST_ENCODER.value, CatBoostEncoder()),
                                           (ModelName.LIGHTGBM.value, LGBMRegressor())])
        },
        ModelName.XGBOOST_WITH_CATBOOST_ENCODER: {
            TaskName.CLASSIFICATION: Pipeline([(ModelName.CATBOOST_ENCODER.value, CatBoostEncoder()),
                                               (ModelName.XGBOOST.value, XGBClassifier())]),
            TaskName.REGRESSION: Pipeline([(ModelName.CATBOOST_ENCODER.value, CatBoostEncoder()),
                                           (ModelName.XGBOOST.value, XGBRegressor())])
        },
        ModelName.XGBOOST: {
            TaskName.CLASSIFICATION: Pipeline([(ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                                               (ModelName.XGBOOST.value, XGBClassifier())]),
            TaskName.REGRESSION: Pipeline([(ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                                           (ModelName.XGBOOST.value, XGBRegressor())])
        }
    }
def __init__(self, sparksess=None, logdir='/encoder', handle_unknown='-99999', save_encoder=False):
    """Initialise the multi-encoder container.

    Parameters
    ----------
    sparksess : optional Spark session used elsewhere in the class.
    logdir : directory where fitted encoders are persisted.
    handle_unknown : value substituted for categories unseen at fit time.
    save_encoder : whether fitted encoders should be saved to ``logdir``.

    Bug fix: ``self.save_encoder`` was a bare attribute access (a no-op
    statement) — the ``save_encoder`` flag was never stored on the instance.
    """
    self.spark = sparksess
    self.logdir = logdir
    self.save_encoder = save_encoder  # was `self.save_encoder` (no-op expression)
    self.handle_unknown = handle_unknown  # kept for introspection / re-use
    # Column lists are populated later; encoders share them by reference.
    self.ordinal_encoder_features = []
    self.onehot_encoder_features = []
    self.count_encoder_features = []
    self.target_encoder_features = []
    self.ordinal_encoder = OrdinalEncoder(
        cols=self.ordinal_encoder_features, return_df=True, handle_unknown=handle_unknown)
    self.onehot_encoder = OneHotEncoder(
        cols=self.onehot_encoder_features, return_df=True, handle_unknown=handle_unknown)
    self.count_encoder = CountEncoder(
        cols=self.count_encoder_features, return_df=True, handle_unknown=handle_unknown)
    self.target_encoder = TargetEncoder(
        cols=self.target_encoder_features, return_df=True, handle_unknown=handle_unknown)
def xgb_reg(X_train, y_train, X_test, y_test):
    """
    Simple pipeline Baseline model for using XGB Regressor including a ordinal
    encoder, standard scaler, simple imputer. This function prints Mean
    baseline, R^2, and RMSE. If R^2 is negative this means mean baseline is a
    more effective model.

    Fix: ``pd.Series.append`` was removed in pandas 2.0; the target series are
    now combined with ``pd.concat`` (identical mean).
    """
    # Mean of ALL target values (train + test) as the naive baseline.
    s3 = pd.concat([pd.Series(y_train), pd.Series(y_test)], ignore_index=True)
    mean = np.mean(s3)
    model = make_pipeline(
        OrdinalEncoder(),
        StandardScaler(),
        SimpleImputer(strategy='median'),
        XGBRegressor(n_estimators=100, n_jobs=-1, max_depth=10))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'Mean baseline of target = {mean}')
    print(f'Gradient Boosting R^2 = {r2}')
    print(f'Gradient Boosting RMSE = {rmse}')
    return
def categoryEncode(df, cols=None, mode="binary"):
    """Encode categorical columns of *df* and return the transformed frame.

    Parameters
    ----------
    df : DataFrame to encode.
    cols : columns to encode (None lets the encoder pick object columns).
    mode : "ordinal" or "binary".

    Raises
    ------
    ValueError : for any other ``mode`` — previously an unknown mode crashed
    with a NameError on the never-assigned ``encoder`` variable.
    """
    if mode == "ordinal":
        encoder = OrdinalEncoder(cols=cols, handle_missing="return_nan",
                                 handle_unknown="return_nan")
    elif mode == "binary":
        encoder = BinaryEncoder(cols=cols)
    else:
        raise ValueError(f"unknown mode {mode!r}; expected 'ordinal' or 'binary'")
    return encoder.fit_transform(df)
def test_display_dataset_analysis_3(self, mock_correlation_matrix):
    """Test we don't have a problem when only categorical features"""
    data = self.df.copy()
    data['x1'] = 'a'
    data['x2'] = data['x2'].astype(str)
    # Ordinal-encode both columns so the model only ever sees numeric input.
    fitted_encoder = OrdinalEncoder(
        cols=['x1', 'x2'], handle_unknown='ignore', return_df=True).fit(data)
    data = fitted_encoder.transform(data)
    clf = cb.CatBoostClassifier(n_estimators=1).fit(data[['x1', 'x2']], data['y'])
    xpl = SmartExplainer()
    xpl.compile(model=clf, x=data[['x1', 'x2']])
    report = ProjectReport(
        explainer=xpl,
        project_info_file=os.path.join(current_path, '../../data/metadata.yaml'),
        x_train=data[['x1', 'x2']],
    )
    report.display_dataset_analysis()
    # With no numeric columns the correlation matrix must never be drawn.
    self.assertEqual(mock_correlation_matrix.call_count, 0)
def _fit_learn_categorical_variables(train_df: DataFrame):
    """
    Fit an encoder that maps the wine-review categorical columns to integers.

    Parameters
    ----------
    train_df: DataFrame
        dataframe with categorical variables

    Returns
    -------
    encoder: OrdinalEncoder
        fitted encoder, ready to transform train or unseen data
    """
    categorical_columns = [
        'country', 'province', 'region_1', 'region_2',
        'taster_name', 'taster_twitter_handle', 'variety',
    ]
    return OrdinalEncoder(cols=categorical_columns, return_df=True).fit(train_df)
def encode_cat_col(self):
    """Ordinal-encode self.categ_col in place (array output, return_df=False)."""
    encoder = OrdinalEncoder(return_df=False)
    encoder.fit(self.categ_col)
    self.categ_col = encoder.transform(self.categ_col)
    # DEBUG
    print(self.DS)
    print(self.categ_col)
def make_gridsearch(clf, param_grid, params):
    """Wrap *clf* in an encode -> scale pipeline and return a 3-fold GridSearchCV.

    `params` is applied to the pipeline up front; `param_grid` is what the
    grid search explores.
    """
    steps = [
        ('encoder', OrdinalEncoder()),
        ('scaler', StandardScaler()),
        ('clf', clf),
    ]
    pipe = Pipeline(steps)
    pipe.set_params(**params)
    return GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3)
def fit_label(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Creates the label encoder by fitting it through the given DataFrame

    NaN values and Special value specified under `na_value` in the DataFrame
    will be encoded as unseen value.

    Args:
        input_df: DataFrame used to fit the encoder
        cols: List of categorical columns to be encoded
        na_value: Default null value for DataFrame

    Returns:
        result_df: encoded input_df DataFrame
        model : encoder model to be passed to `transform_label` method
    """
    df = input_df.copy()
    # Normalise the caller's sentinel null to a real NaN before fitting.
    if na_value is not None:
        for col in cols:
            df[col] = df[col].replace({na_value: np.nan})
    encoder = OrdinalEncoder(cols=cols).fit(df)
    # Route NaN to the code -2 so it can be collapsed to 0 together with the
    # unseen-value code -1 below.
    for col_map in encoder.mapping:
        col_map["mapping"].loc[np.nan] = -2
    result_df = encoder.transform(df)
    for col in cols:
        result_df[col] = result_df[col].replace({-1: 0, -2: 0}).astype(int)
    model = {"encoder": encoder, "cols": cols, "na_value": na_value}
    return result_df, model
def encode_cat_col(self):
    # TODO pandas to numpy
    from category_encoders import OrdinalEncoder

    # Fit and apply an ordinal encoding to the categorical columns in place.
    encoder = OrdinalEncoder(return_df=False)
    encoder.fit(self.categ_col)
    self.categ_col = encoder.transform(self.categ_col)
    # DEBUG
    print(self.DS)
    print(self.categ_col)
def predict(submit, input1, input2, input3, input4, input5, input6, input7, input8, input9, input10,
            input11, input12, input13, input14, input15, input16, input17, input18, input19, input20):
    """Predict the tournament outcome class for one team built from the 20 form inputs.

    Fixes:
    - objective was misspelled 'multi:sotmax' -> 'multi:softmax'.
    - removed 'critereon' (a typo; not an XGBoost parameter — it was silently
      ignored/warned about).
    - the single team row is wrapped in a DataFrame so the encoder sees named
      columns rather than a dict inside a list.

    NOTE(review): trains on module-level X_train / y_train on every call —
    confirm they are defined; `submit` is unused (kept for the callback API).
    """
    import pandas as pd  # local import: module-level pandas not guaranteed here

    team = {'CONF': input1, 'G': float(input2), 'ADJOE': float(input3), 'ADJDE': float(input4),
            'BARTHAG': float(input5), 'EFG_O': float(input6), 'EFG_D': float(input7),
            'TOR': float(input8), 'TORD': float(input9), 'ORB': float(input10),
            'DRB': float(input11), 'FTR': float(input12), 'FTRD': float(input13),
            '2P_O': float(input14), '2P_D': float(input15), '3P_O': float(input16),
            '3P_D': float(input17), 'ADJ_T': float(input18), 'WAB': float(input19),
            'SEED': float(input20)}
    model = make_pipeline(
        OrdinalEncoder(),
        XGBClassifier(max_depth=5, learning_rate=0.001, n_estimators=500,
                      n_jobs=-1, objective='multi:softmax',
                      eval_metric='merror', num_class=8))
    model.fit(X_train, y_train)
    return model.predict(pd.DataFrame([team]))
def get_train_simple_pre_pipeline():
    """Pre-processing pipeline: ordinal-encode the property-location columns, then run the column pipe."""
    encoder_step = (
        'ordinal_encoder',
        OrdinalEncoder(cols=['tipodepropiedad', 'provincia', 'ciudad']),
    )
    return Pipeline(steps=[encoder_step, ('columns_pipe', get_columns_pipeline())])
def get_categorical_pipeline():
    """Build the ColumnTransformer-based pipeline for the categorical features."""
    # One transformer per feature group; names kept stable for set_params addressing.
    transformers = [
        ('binary', OrdinalEncoder(), 'ZoomInfo_Global_HQ_Country'),
        ('catboost',
         OrdinalEncoder(handle_unknown='value', handle_missing='value'),
         'Adjusted_Industry'),
        ('ordinal',
         OrdinalEncoder(mapping=MAP_ORDINAL, handle_unknown='value'),
         ORDINAL_FEATURES),
    ]
    return Pipeline([('cat_ct', ColumnTransformer(transformers))])
def process_data(X_train, X_test, X_val, X):
    """pre-process training data and transformation"""
    processor = make_pipeline(OrdinalEncoder(), SimpleImputer())
    X_train = processor.fit_transform(X_train)  # fit on the training split only
    X_val = processor.transform(X_val)
    X_test = processor.transform(X_test)
    column_names = list(X.columns)
    # Map positional (encoded) column index -> original column name.
    features = {idx: name for idx, name in enumerate(column_names)}
    return X_train, X_test, X_val, features, column_names, processor
def _cat_encoder(self, df):
    """Ordinal-encode every column of *df*.

    On the first (untrained) pass an encoder is fitted on *df* and cached in
    ``self.catEncoder``; every pass transforms with the first cached encoder,
    so later inputs are encoded consistently with the first one seen.

    Fixes: `self.is_trained == False` replaced with the idiomatic
    `not self.is_trained`; dead commented-out timing code removed.
    """
    df = df.fillna(0)  # encoder cannot see NaN; 0 acts as the missing token
    from category_encoders import OrdinalEncoder
    if not self.is_trained:
        self.catEncoder.append(OrdinalEncoder().fit(df))
    cat = self.catEncoder[0].transform(df)
    return cat
def assign_cat_scaler(self,):
    """Pick the categorical encoder according to self.cat_info ('method', 'cols')."""
    self.cat_method = self.cat_info.get("method", None)
    self.cat_cols = self.cat_info.get("cols", [])
    # No method configured -> placeholder encoder, nothing else to do.
    if self.cat_method is None:
        self.cat_encoder = Empty()
        return
    if self.cat_method == "OrdinalEncoder":
        self.cat_encoder = OrdinalEncoder(cols=self.cat_cols)
    elif self.cat_method == "OneHotEncoder":
        self.cat_encoder = OneHotEncoder(cols=self.cat_cols)
    else:
        # Message below: "the rest is not implemented yet" (Korean).
        raise NotImplementedError("아직 나머지 구현 안함")
def encode_result(df_orig):
    """Return a copy of *df_orig* with an 'ordinal_result' column: hwin->1, draw->2, awin->3."""
    df = df_orig.copy(deep=True)
    result_mapping = [{
        'col': 'result',
        'mapping': {'hwin': 1, 'draw': 2, 'awin': 3},
    }]
    encoder = OrdinalEncoder(mapping=result_mapping)
    df['ordinal_result'] = encoder.fit_transform(df[['result']])
    return df
def xgb_class(X_train, y_train, X_test, y_test):
    """
    Baseline XGB Classifier that prints out ROC score for Train and Test sets provided.

    Fixes:
    - `y_pred` referenced the undefined `X_test_processed` (NameError at
      runtime); the unused prediction line is removed.
    - encoder and imputer are now fitted on the training split only and
      re-used on the test split; re-fitting on test data produced encodings
      inconsistent with the ones the model was trained on.
    - `pd.Series.append` (removed in pandas 2.0) replaced with `pd.concat`.
    """
    class_index = 1  # probability column of the positive class
    encoder = OrdinalEncoder()
    imputer = SimpleImputer()
    X_train_imputed = imputer.fit_transform(encoder.fit_transform(X_train))
    # Transform only — never re-fit preprocessing on the test split.
    X_test_imputed = imputer.transform(encoder.transform(X_test))
    model = XGBClassifier(n_estimators=100, n_jobs=-1, max_depth=10)
    model.fit(X_train_imputed, y_train, eval_metric='auc')
    # Predicted probabilities for the positive class.
    y_pred_proba_train = model.predict_proba(X_train_imputed)[:, class_index]
    y_pred_proba_test = model.predict_proba(X_test_imputed)[:, class_index]
    train_roc = roc_auc_score(y_train, y_pred_proba_train)
    test_roc = roc_auc_score(y_test, y_pred_proba_test)
    # Class balance over all targets = the mean baseline.
    s3 = pd.concat([pd.Series(y_train), pd.Series(y_test)], ignore_index=True)
    print('Mean Baseline of Target')
    print(s3.value_counts(normalize=True))
    print()
    print(f'Train ROC AUC for class: {train_roc} \n')
    print(f'Test ROC AUC for class: {test_roc}')
    return
def __init__(self, file_):
    """Load the SNP genotype dataset from *file_* and fit a LogisticRegression obesity classifier.

    The CSV must contain 'IID', 'log_BMI', 'is_obesity' plus the rsID genotype
    columns listed below; 'is_obesity' is the binary target.
    NOTE(review): the printed score is training accuracy (fit and scored on the
    same data), not a held-out estimate — confirm that is intended.
    """
    data_set = pd.read_csv(file_, index_col=0)
    # Binary target, flattened to 1-D for sklearn.
    target = np.array(data_set["is_obesity"]).reshape(-1, 1).ravel()
    # Drop identifier and label columns from the feature matrix.
    del data_set['IID']
    del data_set['log_BMI']
    del data_set['is_obesity']
    # Comma-separated list of the SNP (rsID) columns to ordinal-encode.
    columns = "rs12620338,rs7559271,rs2234675,rs6436302,rs12053273,rs1430657,rs16863576,rs7589708,rs4674639,rs10932949,rs12995399,rs9768991,rs7809325,rs17879130,rs6964358,rs4724821,rs2410612,rs3816246,rs61734430,rs2651364,rs7963401,rs2733682,rs2651374,rs7132461,rs10771951,rs4931631,rs7299495,rs10844219,rs7311935,rs7963397,rs7295095,rs10844227,rs7977101,rs7966856,rs7967302,rs2088656,rs4931635,rs904582,rs10771966,rs6488068,rs7962152,rs4135048,rs4135060,rs3751209,rs140436257,rs4135113,rs4135126,rs2888805,rs2041794,rs2908792,rs12930428,rs2160290,rs4784311,rs13332406,rs76818213,rs1131220,rs3809634,rs3095631,rs17194040,rs1861556,rs16952304,rs7193898,rs1362572,rs12599436,rs1946155,rs4784320,rs12443767,rs3213758,rs17214955,rs8050354,rs139974543,rs2111119,rs2302677,rs9934800,rs5005161,rs7205986,rs1421084,rs7203521,rs6499640,rs4396532,rs1861868,rs1075440,rs13334933,rs9930333,rs9939973,rs9940128,rs1421085,rs16952520,rs1558902,rs10852521,rs1121980,rs7193144,rs17817449,rs11075987,rs8050136,rs9935401,rs9936385,rs9926289,rs76804286,rs9939609,rs9941349,rs7190492,rs9930506,rs9922708,rs9922619,rs8044769,rs12149832,rs10852523,rs3826169,rs10521307,rs17819033,rs7205009,rs2160481,rs4784329,rs7191718,rs9934504,rs9929152,rs12232391,rs9924072,rs12933996,rs17224310,rs17823199,rs7194907,rs6499662,rs12596210,rs8046658,rs7200972,rs9925908,rs12931859,rs7194243,rs4784351,rs2540781,rs856973,rs2003583,rs16953002,rs708258,rs1008400,rs11646512,rs11863548,rs2665271,rs2689264,rs8053279,rs8063722,rs879679,rs1610237,rs8054310,rs2542674,rs2689258,rs1033046,rs2010410,rs17835974,rs4783830,rs8060235,rs16953241,rs16953243,rs7200222,rs8049962,rs10521300,rs16953283,rs1126960,rs1868689,rs17176417,rs1079368,rs1004299,rs1004930,rs12930159,rs729633,rs8056104,rs2388632,rs7193399,rs11076030,rs12932839,rs7191827,rs8050506,rs11639567,rs17257349,rs7203944,rs1420303,rs1530793,rs4784379,rs7189231,rs9972796,rs1420285,rs4784390,rs12931301,rs12447674,rs9921518,rs4783845,rs17200070,rs11640012,rs12929998,rs733017,rs716083,rs751214,rs1362437,rs749622,rs8059628,rs1211435,rs1201336,rs1186817,rs1874025,rs8045161,rs8051442,rs1882591,rs1151277,rs11861365,rs2388773,rs1493897,rs8044756,rs1861532,rs11639521,rs17205999,rs16953856,rs1420562,rs2388807,rs1420553,rs1861538,rs4784415,rs12444481,rs1548912,rs7499390,rs4622506,rs4257585,rs4440156,rs7198507,rs9924618,rs11076057,rs4591143,rs6499720,rs4435250,rs4383140,rs4784429,rs4555155,rs9932117,rs11076060,rs12447300,rs13336114,rs1133611,rs11076063,rs11076064,rs8060082,rs4238773,rs12927600,rs4238775,rs13331158,rs4783863,rs8055853,rs4784467,rs6499743,rs16954195,rs4784474,rs1352191,rs7197624,rs11076070,rs8050248,rs1825730,rs16954308,rs11076076,rs4270172,rs8060698,rs12917822,rs8064192,rs1486735,rs1552426,rs7187108,rs8054239,rs11076081,rs2200537,rs9922031,rs1486733,rs12934198,rs2588996,rs2171262,rs17291845,rs7204268,rs2397376,rs9928598,rs12050985,rs4784510,rs1437449,rs16954658,rs991057,rs30922,rs30923,rs11860394,rs31045,rs31046,rs6499755,rs893263,rs31064,rs4784523,rs31103,rs31104,rs360774,rs30905,rs12918370,rs7199709,rs1370385,rs9926841,rs1610101,rs1420227,rs8045690,rs2540707,rs2576542,rs11643666,rs7184310,rs9936365,rs837537,rs7187242,rs7187258,rs11859163,rs17301608,rs2287074,rs7201,rs837550,rs2287072,rs112426189,rs3744374,rs12602590,rs11654604,rs200805689,rs117651561,rs79742527,rs143040759"
    # Ordinal-encode every SNP column, then fit logistic regression on top.
    preprocessor = ColumnTransformer(transformers=[
        ('encoder', OrdinalEncoder(), columns.split(',')),
    ])
    pipe = Pipeline([('preprocessor', preprocessor), ('model', LogisticRegression())])
    self.classifier = pipe.fit(data_set, target)
    print('LogisticRegression score(accuracy) for ' + file_ + ' : ' + str(self.classifier.score(data_set, target)))
def _encode_categories(self):
    """
    This private method stands for encoding categorical variables.
    Label encoding used for ordinal categories and one-hot encoding used
    for nominal categories.
    """
    logging.info(f'#{self._index()} - Encoding categorical columns...')
    # Split feature names into categorical vs numerical groups.
    cat_cols = self.X.select_dtypes(include='object').columns
    num_cols = self.X.columns.difference(cat_cols)
    # Quality-style columns all share the NA..Ex ordered scale.
    quality_cols = pd.Index([
        'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
        'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
    ])
    nominal_cols = cat_cols.difference(quality_cols)
    quality_scale = {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    ordinal_mappings = [
        {'col': name, 'mapping': quality_scale} for name in quality_cols
    ]
    train_numeric = self.X[num_cols]
    test_numeric = self.X_test[num_cols]
    nominal_encoder = OneHotEncoder(use_cat_names=True)
    ordinal_encoder = OrdinalEncoder(drop_invariant=True,
                                     mapping=ordinal_mappings,
                                     handle_unknown='error')
    # Fit on the training frame, then apply the SAME fitted encoders to test.
    train_nominal = nominal_encoder.fit_transform(self.X[nominal_cols])
    train_ordinal = ordinal_encoder.fit_transform(self.X[quality_cols])
    test_nominal = nominal_encoder.transform(self.X_test[nominal_cols])
    test_ordinal = ordinal_encoder.transform(self.X_test[quality_cols])
    self.X = train_numeric.join(train_ordinal).join(train_nominal)
    self.X_test = test_numeric.join(test_ordinal).join(test_nominal)
    # NOTE(review): opener calls self._index() but this reads _step_index
    # directly — confirm the two stay in sync.
    logging.info(f'#{self._step_index} - DONE!')
def _create_feature(cls, conf) -> pd.DataFrame:
    """LDA latent vectors of col1 topics described by col2, joined back onto the base frame."""
    base = Application.get_df(conf)[[
        'SK_ID_CURR', 'TARGET', cls._col1, cls._col2
    ]]
    base = OrdinalEncoder(cols=[cls._col1, cls._col2]).fit_transform(base)
    latent_vectors = lda(cls._n_components, base, cls._col1, cls._col2)
    # One output column per latent dimension.
    dims = defaultdict(list)
    for vector in latent_vectors:
        for dim, value in enumerate(vector):
            dims[f"{cls._col1}_LDA_{cls._col2}_dim{dim}"].append(value)
    latent_df = pd.DataFrame(dims)
    merged = base.merge(latent_df, how="left", left_on=cls._col1, right_index=True)
    # Keep only the id and the latent features.
    return merged.drop(['TARGET', cls._col1, cls._col2], axis=1)
def __init__(self, df_train: pd.DataFrame, df_valid: pd.DataFrame, df_test: pd.DataFrame, use_columns, label_column):
    """Encode all three splits with a train-fitted OrdinalEncoder and expose them as long tensors."""
    encoder = OrdinalEncoder(cols=use_columns, handle_unknown='impute').fit(df_train)
    encoded_train = encoder.transform(df_train).astype('int64')
    encoded_valid = encoder.transform(df_valid).astype('int64')
    encoded_test = encoder.transform(df_test).astype('int64')
    self.train_X = torch.from_numpy(encoded_train[use_columns].values).long()
    self.train_y = df_train[label_column].values
    self.valid_X = torch.from_numpy(encoded_valid[use_columns].values).long()
    self.valid_y = df_valid[label_column].values
    self.test_X = torch.from_numpy(encoded_test[use_columns].values).long()
    self.test_y = df_test[label_column].values
    # Embedding table sizes: per-field max code + 1, since codes are 0-based —
    # without the +1 the embedding would be short by one slot.
    self.field_dims = [add_one(dim) for dim in list(encoded_train[use_columns].max())]
    self.data_num = self.train_X.size()[0]
def main():
    print('started experimnent')
    with neptune.create_experiment(
            name='feature engineering',
            tags=['feature-extraction', FEATURE_NAME],
            upload_source_files=get_filepaths(),
            properties={'feature_version': FEATURE_NAME}):
        print('loading data')
        train = load_and_merge(RAW_DATA_PATH, 'train', NROWS)[ID_COLS + V1_COLS + ['isFraud']]
        test = load_and_merge(RAW_DATA_PATH, 'test', NROWS)[ID_COLS + V1_COLS]
        cat_cols = set(V1_CAT_COLS)

        print('cleaning data')
        email_cols = ['P_emaildomain', 'R_emaildomain']
        train, new_email_cols = clean_email(train, email_cols)
        test, _ = clean_email(test, email_cols)
        # Cleaned email columns replace the raw ones in the categorical set.
        cat_cols.update(new_email_cols)
        for col in email_cols:
            cat_cols.remove(col)
        cat_cols = list(cat_cols)
        neptune.set_property('categorical_columns', str(cat_cols))

        print('encoding categoricals')
        # Fit on train; apply the same mapping to train and test.
        encoder = OrdinalEncoder(cols=cat_cols).fit(train[ID_COLS + cat_cols])
        train[ID_COLS + cat_cols] = encoder.transform(train[ID_COLS + cat_cols])
        test[ID_COLS + cat_cols] = encoder.transform(test[ID_COLS + cat_cols])

        train_path = os.path.join(
            FEATURES_DATA_PATH, 'train_features_{}.csv'.format(FEATURE_NAME))
        print('saving train to {}'.format(train_path))
        train.to_csv(train_path, index=None)
        log_data_version(train_path, prefix='train_features_')

        test_path = os.path.join(
            FEATURES_DATA_PATH, 'test_features_{}.csv'.format(FEATURE_NAME))
        print('saving test to {}'.format(test_path))
        test.to_csv(test_path, index=None)
        log_data_version(test_path, prefix='test_features_')
def __init__(self, categorical_features, numerical_features, data, **kwargs):
    """Store feature lists, create the encoders/scalers, then assemble the network."""
    self.categorical_features = categorical_features
    self.numerical_features = numerical_features
    # Categories unseen at fit time are mapped to a dedicated "value" code.
    self.encoder = OrdinalEncoder(cols=self.categorical_features,
                                  return_df=True,
                                  handle_unknown="value")
    self.scaler = StandardScaler()
    self.target_scaler = StandardScaler()
    self.model = None
    self.data = data
    self.inputs = []
    self.embeddings = []
    self.build_full_network(data, **kwargs)
def get_pipeline(est, is_tree, is_regressor, params):
    """Build (name, pipeline, param grid) for estimator *est*.

    Three shapes: a pass-through for Dummy models, an encode+select pipeline
    for tree models, and a numeric/categorical feature-union pipeline for
    everything else. *params* is mutated in place with the grid entries.

    Fix: the tree branch's ``se__k`` grid listed 1000 twice, making
    GridSearchCV fit the exact same candidate twice; the duplicate is removed.
    """
    name = model_name(est)
    if name.startswith('Dummy'):
        # Dummy baselines only need the numeric columns passed through.
        ppl = Pipeline([
            ('ft', FunctionTransformer()),
            ('mo', est)
        ])
        params['ft__func'] = [lambda x: x[numeric_cols(x)]]
        params['ft__validate'] = [False]
    elif is_tree:
        ppl = Pipeline([
            ('da', DateEncoder()),
            ('du', OrdinalEncoder()),
            ('ft', FunctionTransformer()),
            ('se', SelectKBest2()),
            ('mo', est)
        ])
        params['da__ascategory'] = [False]
        params['du__drop_invariant'] = [True]
        # Trees tolerate a sentinel for missing values.
        params['ft__func'] = [lambda x: x.fillna(-999)]
        params['ft__validate'] = [False]
        params['se__score_func'] = get_selector(is_regressor, is_tree)
        params['se__k'] = [0.2, 0.5, 0.8, 1000]  # was [..., 1000, 1000]
    else:
        # Linear-style models: impute+scale numerics, sparse-encode categoricals.
        ppl = Pipeline([
            ('da', DateEncoder()),
            ('en', FeatureUnion([
                ('nu', Pipeline([('ft', FunctionTransformer()),
                                 ('in', Imputer()),
                                 ('sc', TransformerWrap(StandardScaler()))])),
                ('ca', Pipeline([('ft', FunctionTransformer()),
                                 ('sc', SparseCatEncoder())]))
            ])),
            ('fu', FeatureUnion([('se', SelectKBest2()), ('dr', TruncatedSVD2())])),
            ('mo', est)
        ])
        params['en__nu__ft__func'] = [lambda x: x[numeric_cols(x)]]
        params['en__nu__ft__validate'] = [False]
        params['en__ca__ft__func'] = [lambda x: x[object_cols(x)]]
        params['en__ca__ft__validate'] = [False]
        params['fu__se__score_func'] = get_selector(is_regressor, is_tree)
        params['fu__se__k'] = [0.2, 0.5, 0.8, 1000]
        params['fu__dr__k'] = [0.2, 0.5, 0.8, 1000]
    return name, ppl, params
def out_of_folds_predict(X, y):
    """Return out-of-fold predictions for X using a 4-fold (stratified if possible) split.

    A fresh pipe and encoder are fitted per fold; each fold's test rows get
    their prediction written into `preds`.

    Fixes:
    - ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is the
      documented replacement.
    - removed the dead ``pipe = build_pipe()`` created after the loop and
      never used.
    """
    callbacks = [
        EarlyStopping(
            # Stop training when loss is no longer improving,
            monitor="loss",
            # "no longer improving" being "no better than min_delta less",
            min_delta=1e-5,
            # for at least `patience` epochs.
            patience=2,
            verbose=0,
        )
    ]
    preds = np.zeros(X.shape[0])
    n_splits = 4
    # Stratification needs at least 2 positives; otherwise fall back to KFold.
    if y.sum() < 2:
        kfold = KFold(n_splits=n_splits)
    else:
        kfold = StratifiedKFold(n_splits=n_splits)
    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        print(f'Split {i+1} of {n_splits}...')
        pipe = build_pipe()
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Encoder is fitted on this fold's training rows only.
        encoder = OrdinalEncoder()
        X_train = encoder.fit_transform(X_train, y_train).astype(float)
        pipe.fit(X_train, y_train, epochs=20, callbacks=callbacks, verbose=0)
        X_test = encoder.transform(X_test).astype(float)
        pipe.evaluate(X_test, y_test, verbose=1)
        preds[test_index] = pipe.predict(X_test).flatten()
    return preds
def create_label(df, test_df, topic):
    """Build TF-IDF + truncated-SVD title features and per-column ordinal labels for *topic*."""
    result_dict = {}
    titles = df[['title']].copy()
    labels = df.drop(columns=['itemid', 'title', 'image_path']).copy()
    titles['title'] = titles['title'].apply(lambda x: text_process(x))
    train_texts = titles['title'].values.tolist()
    # TF-IDF then 500-dim SVD, both fitted on the training titles only.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(train_texts)
    train_matrix = vectorizer.transform(train_texts)
    decomposer = TruncatedSVD(500)
    decomposer.fit(train_matrix)
    train_matrix = decomposer.transform(train_matrix)
    test_df['title'] = test_df['title'].apply(lambda x: text_process(x))
    test_texts = test_df['title'].values.tolist()
    test_matrix = decomposer.transform(vectorizer.transform(test_texts))
    result_dict['itemid_train_{}'.format(topic)] = df['itemid']
    result_dict['itemid_test_{}'.format(topic)] = test_df['itemid']
    result_dict['X_train_{}'.format(topic)] = train_matrix
    result_dict['X_encoder_{}'.format(topic)] = vectorizer
    result_dict['X_decomposer_{}'.format(topic)] = decomposer
    result_dict['X_test_{}'.format(topic)] = test_matrix
    # One ordinal encoder per label column, all artifacts stored by key.
    for column in labels.columns:
        col_encoder = OrdinalEncoder(cols=[column], handle_unknown='impute')
        col_encoder.fit(labels[[column]])
        encoded = col_encoder.transform(labels[[column]])
        result_dict['Y_train_{}_{}'.format(topic, column)] = encoded
        result_dict['Y_encoder_{}_{}'.format(topic, column)] = col_encoder
        result_dict['Y_colname_{}_{}'.format(topic, column)] = encoded.columns
    return result_dict
def make_gridsearch(param_grid):
    """3-fold neg-log-loss GridSearchCV over an encode -> scale -> XGB pipeline."""
    # Classifier hyper-parameters fixed from a previous tuning run.
    clf_params = dict(
        colsample_bytree=0.6522,
        gamma=3.6975,
        learning_rate=0.05,
        max_delta_step=2.0706,
        max_depth=10,
        min_child_weight=31.5800,
        n_estimators=166,
        subsample=0.8639,
    )
    pipe = Pipeline([
        ('encoder', OrdinalEncoder()),
        ('scaler', StandardScaler()),
        ('clf', XGBClassifier(**clf_params)),
    ])
    return GridSearchCV(pipe, param_grid=param_grid, scoring='neg_log_loss', cv=3)
def convert_meta_to_dict(self):
    """Encode product metadata to integers; return (meta_dict, ordered embedding sizes)."""
    meta = self.meta[['productid'] + self.META_COLS].copy()
    # Integer-encode every metadata column and persist the fitted encoder.
    encoder = OrdinalEncoder(cols=self.META_COLS)
    meta = encoder.fit_transform(meta)
    save_model(encoder, '{}/encoder'.format(MODEL_PATH))
    meta['values'] = meta.apply(get_dict_values, args=(self.META_COLS, ), axis=1)
    meta_dict = meta.set_index('productid')['values'].to_dict()
    # Re-key by internal word id instead of raw product id.
    meta_dict = {self.word2id[key]: val for key, val in meta_dict.items()}
    # +1 because codes start at zero; without it the embedding size would be
    # insufficient by one slot.
    meta_counts_dict = (meta[self.META_COLS].max() + 1).to_dict()
    ordered_meta_counts_dict = OrderedDict()
    for col in ['product'] + self.META_COLS:
        ordered_meta_counts_dict[col] = meta_counts_dict.get(col, 0)
    return meta_dict, ordered_meta_counts_dict