def test_update_unique_vals(self):
    one_hot_encoder = OneHotEncoder(
        categorical_columns=["Pclass", "Sex", "Embarked"])
    one_hot_encoder._update_unique_vals(self.data)
    self.assertEqual(one_hot_encoder.unique_vals["Embarked"],
                     set(['Q', np.nan, 'S', 'C']))
    self.assertEqual(one_hot_encoder.unique_vals["Sex"],
                     set(['male', 'female']))
    self.assertEqual(one_hot_encoder.unique_vals["Pclass"], set([1, 2, 3]))
def test_transform(self):
    one_hot_encoder = OneHotEncoder(
        categorical_columns=["Pclass", "Sex", "Embarked"])
    one_hot_encoder.fit(self.data)
    transformed_data = np.array(
        [[0.0, 0.0, 1.0, 0.0, 1.0, 22.0, 7.25, 0.0, 0.0, 0.0, 1.0],
         [1.0, 0.0, 0.0, 1.0, 0.0, 38.0, 71.2833, 0.0, 1.0, 0.0, 0.0],
         [0.0, 0.0, 1.0, 1.0, 0.0, 26.0, 7.925, 0.0, 0.0, 0.0, 1.0],
         [1.0, 0.0, 0.0, 1.0, 0.0, 35.0, 53.1, 0.0, 0.0, 0.0, 1.0],
         [0.0, 0.0, 1.0, 0.0, 1.0, 35.0, 8.05, 0.0, 0.0, 0.0, 1.0]])
    np_test.assert_array_equal(one_hot_encoder.transform(self.data.head()),
                               transformed_data)
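# Reading the expected matrix above: the first three columns appear to be the
# Pclass dummies (1/2/3), the next two the Sex dummies (female/male), then the
# numeric passthrough columns (Age, Fare for row 1: 22.0, 7.25), and finally
# the four Embarked dummies, with the NaN level occupying the first Embarked
# slot (row 2, Embarked='C', lights up the second slot). This ordering is
# inferred from the fixture values, not stated in the source.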
def test_fit(self):
    one_hot_encoder1 = OneHotEncoder(
        categorical_columns=["Pclass", "Sex", "Embarked"])
    one_hot_encoder2 = OneHotEncoder(
        categorical_columns=["Pclass", "Sex", "Embarked"])
    one_hot_encoder1.fit(self.data)
    one_hot_encoder2._update_unique_vals(self.data)
    one_hot_encoder2._fit_encoders()
    self.assertEqual(one_hot_encoder1.categorical_columns,
                     one_hot_encoder2.categorical_columns)
    self.assertEqual(one_hot_encoder1.unique_vals,
                     one_hot_encoder2.unique_vals)
    self.assertEqual(one_hot_encoder1.encoders, one_hot_encoder2.encoders)
def test_fit_transform(self):
    one_hot_encoder1 = OneHotEncoder(
        categorical_columns=["Pclass", "Sex", "Embarked"])
    one_hot_encoder2 = OneHotEncoder(
        categorical_columns=["Pclass", "Sex", "Embarked"])
    one_hot_encoder2.fit(self.data.head())
    np_test.assert_array_equal(
        one_hot_encoder1.fit_transform(self.data.head()),
        one_hot_encoder2.transform(self.data.head()))
def test_class_init(self):
    one_hot_encoder = OneHotEncoder(
        categorical_columns=["Pclass", "Sex", "Embarked"])
    self.assertEqual(one_hot_encoder.categorical_columns,
                     ["Pclass", "Sex", "Embarked"])
    self.assertEqual(one_hot_encoder.unique_vals, defaultdict(set))
    self.assertEqual(one_hot_encoder.encoders, {
        "Pclass": Encoder(),
        "Sex": Encoder(),
        "Embarked": Encoder()
    })
def data_enc(self):
    fp_lb_enc = self.datapath + 'lb_enc'  # label encoding
    fp_oh_enc = self.datapath + "oh_enc"  # one-hot encoding
    df_train_f = pd.read_csv(self.fp_train_f,
                             index_col=None,
                             chunksize=500000,
                             iterator=True)
    df_test_f = pd.read_csv(self.fp_test_f,
                            index_col=None,
                            chunksize=500000,
                            iterator=True)
    lb_enc = {}
    # for col in self.cols:
    #     self.cols_index[col] = np.append(self.cols_index[col], 'other')  # append a new 'other' bucket
    # for col in self.cols:
    #     lb_enc[col] = LabelEncoder()

    # %% one-hot and label encode
    print('starting one-hot and label encoding...')
    oh_enc = OneHotEncoder(self.cols)
    for chunk in df_train_f:
        oh_enc.fit(chunk)  # dummyPy's one-hot encoder accumulates categories across chunks instead of resetting
        # for col in self.cols:
        #     lb_enc[col].fit(chunk[col])  # sklearn's fit() would reset the encoder on every chunk
    for chunk in df_test_f:
        oh_enc.fit(chunk)
        # for col in self.cols:
        #     lb_enc[col].fit(chunk[col])
    # pickle.dump(lb_enc, open(fp_lb_enc, 'wb'))
    pickle.dump(oh_enc, open(fp_oh_enc, 'wb'))
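# A hedged usage sketch of the encoder persisted above (standalone code; the
# 'train_f.csv' path and the downstream loop are hypothetical stand-ins):
# reload the pickled encoder and transform incoming chunks at training time.
oh_enc = pickle.load(open(fp_oh_enc, 'rb'))
for chunk in pd.read_csv('train_f.csv', chunksize=500000, iterator=True):
    X = oh_enc.transform(chunk)  # categorical columns replaced by dummy columns
    # ...feed X into an incremental model here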
def test_transform_coo(self):
    one_hot_encoder = OneHotEncoder(
        categorical_columns=["Pclass", "Sex", "Embarked"])
    one_hot_encoder.fit(self.data)
    coo_matrix_1 = one_hot_encoder.transform(self.data.head(), dtype="coo")
    coo_matrix_2 = coo_matrix(
        one_hot_encoder.transform(self.data.head(), dtype="np"))
    np_test.assert_array_equal(coo_matrix_1.toarray(),
                               coo_matrix_2.toarray())
def test_fit_encoders(self):
    one_hot_encoder = OneHotEncoder(
        categorical_columns=["Pclass", "Sex", "Embarked"])
    one_hot_encoder._update_unique_vals(self.data)
    one_hot_encoder._fit_encoders()
    embarked_encoder = Encoder()
    embarked_encoder.fit(set(['Q', np.nan, 'S', 'C']))
    self.assertEqual(one_hot_encoder.encoders["Embarked"], embarked_encoder)
    sex_encoder = Encoder()
    sex_encoder.fit(set(['male', 'female']))
    self.assertEqual(one_hot_encoder.encoders["Sex"], sex_encoder)
    pclass_encoder = Encoder()
    pclass_encoder.fit(set([1, 2, 3]))
    self.assertEqual(one_hot_encoder.encoders["Pclass"], pclass_encoder)
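# The tests above read self.data without defining it in this excerpt; a
# minimal setUp sketch, assuming a Titanic train CSV (the file name and the
# unittest.TestCase context are assumptions):
def setUp(self):
    self.data = pd.read_csv("titanic.csv")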
as we do it later in the iteration of model training on chunks
'''
## 1. label encoding
lb_enc = {}
for col in cols:
    col_index[col] = np.append(col_index[col], 'other')
for col in cols:
    lb_enc[col] = LabelEncoder()
    lb_enc[col].fit(col_index[col])
## store the label encoder
pickle.dump(lb_enc, open(fp_lb_enc, 'wb'))

## 2. one-hot encoding
oh_enc = OneHotEncoder(cols)
df_train_f = pd.read_csv(fp_train_f,
                         index_col=None,
                         chunksize=500000,
                         iterator=True)
df_test_f = pd.read_csv(fp_test_f,
                        index_col=None,
                        chunksize=500000,
                        iterator=True)
for chunk in df_train_f:
    oh_enc.fit(chunk)
for chunk in df_test_f:
    oh_enc.fit(chunk)
## store the one-hot encoder
pickle.dump(oh_enc, open(fp_oh_enc, 'wb'))

#----- construction of the original train set (random sub-sampling) -----#
n = sum(1 for line in open(fp_train_f)) - 1  # total size of train data (about 46M rows)
s = 2000000  # desired train set size (2M rows)
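# A hedged sketch of the random sub-sampling these counts set up (the
# skiprows approach is an assumption about the original code): skip a random
# set of n - s data rows while reading, keeping the header row (row 0).
import random
skip = sorted(random.sample(range(1, n + 1), n - s))
df_train = pd.read_csv(fp_train_f, index_col=None, skiprows=skip)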
k = 100
col_index = {}
for col in cols:
    col_index[col] = cols_counts[col][0:k - 1].index  # keep the top k-1 most frequent values
    #print(col, col_index[col])

## label-encode the categorical variables
lb_enc = {}
for col in cols:
    # values beyond the retained top values are mapped to 'other'
    col_index[col] = np.append(col_index[col], 'other')
for col in cols:
    lb_enc[col] = LabelEncoder()
    lb_enc[col].fit(col_index[col])
## store the label encoders
pickle.dump(lb_enc, open(label_encoder_file, 'wb'))
print(label_encoder_file + ' saved')

## one-hot encoding
oh_enc = OneHotEncoder(cols)
for chunk in df_train_org:
    oh_enc.fit(chunk)
for chunk in df_test_org:
    oh_enc.fit(chunk)
## store the one-hot encoder
pickle.dump(oh_enc, open(onehot_encoder_file, 'wb'))
print(onehot_encoder_file + ' saved')
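# A hedged sketch of applying these label encoders later (not shown in this
# excerpt): values outside each column's retained vocabulary must be mapped
# to 'other' before LabelEncoder.transform, since sklearn raises on unseen
# labels. `chunk` is a hypothetical DataFrame of incoming rows.
lb_enc = pickle.load(open(label_encoder_file, 'rb'))
for col in cols:
    known = set(lb_enc[col].classes_)
    chunk[col] = chunk[col].where(chunk[col].isin(known), 'other')
    chunk[col] = lb_enc[col].transform(chunk[col])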
log_loss_gbdt = log_loss(y_valid, y_pred_gbdt)
print('log loss of GBDT on valid set: %.5f' % log_loss_gbdt)

## store the pre-trained gbdt_model
pickle.dump(gbdt_model, open(fp_gbdt_model, 'wb'))

del X_train_gbdt
del y_train_gbdt
gc.collect()

gbdt_model = pickle.load(open(fp_gbdt_model, 'rb'))

#----- data for LR (one-hot encoding with GBDT output) -----#
id_cols = []
for i in range(1, gbdt_model.get_params()['n_estimators'] + 1):
    id_cols.append('tree' + str(i))

oh_enc = OneHotEncoder(id_cols)

def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

## fit oh_enc on the train set
df_train_id = pd.DataFrame(gbdt_model.apply(X_train_org)[:, :, 0],
                           columns=id_cols,
                           dtype=np.int8)
for chunk in chunker(df_train_id, 50000):
    oh_enc.fit(chunk)
del df_train_id
def train_model(white_list, isSortedTopKfeatures=False):
    '''
    Train the ds_model
    '''
    # In[4]:
    # Load Train and Test CSV
    headerNames = [
        "id", "name", "first", "last", "compas_screening_date", "sex",
        "dob", "age", "age_cat", "race", "juv_fel_count", "decile_score",
        "juv_misd_count", "juv_other_count", "priors_count",
        "days_b_screening_arrest", "c_jail_in", "c_jail_out",
        "c_case_number", "c_offense_date", "c_arrest_date",
        "c_days_from_compas", "c_charge_degree", "c_charge_desc",
        "is_recid", "num_r_cases", "r_case_number", "r_charge_degree",
        "r_days_from_arrest", "r_offense_date", "r_charge_desc",
        "r_jail_in", "r_jail_out", "is_violent_recid", "num_vr_cases",
        "vr_case_number", "vr_charge_degree", "vr_offense_date",
        "vr_charge_desc", "v_type_of_assessment", "v_decile_score",
        "v_score_text", "v_screening_date", "type_of_assessment",
        "decile_score.1", "score_text", "screening_date"
    ]  # the file repeats decile_score; naming the second copy decile_score.1 avoids a duplicate-name error and matches the drop below
    prefix = "./data/"
    # ID cannot be used for prediction, hence setting index_col=0 removes
    # the ID field from the dataset.
    datadf = pd.read_csv(prefix + "compas-scores.csv",
                         header=None,
                         delim_whitespace=False,
                         names=headerNames,
                         index_col=0,
                         skiprows=1)

    # In[5]:
    ## Drop columns that are not useful for prediction
    drop_cols = [
        'id', 'name', 'first', 'last', 'c_case_number', 'r_case_number',
        'vr_case_number', 'decile_score.1', 'c_charge_desc',
        'r_charge_desc', 'vr_charge_desc', 'num_r_cases', 'num_vr_cases',
        'v_score_text', 'dob', 'vr_charge_degree', 'c_charge_degree',
        'r_charge_degree', 'v_decile_score', 'c_jail_out', 'r_jail_out',
        'days_b_screening_arrest', 'c_days_from_compas',
        'r_days_from_arrest'
    ]
    for col in drop_cols:
        if col in datadf:
            datadf = datadf.drop(col, axis=1)

    # In[6]:
    print(datadf.shape)

    # In[7]:
    '''
    sns.heatmap(datadf.corr(), annot=True, cmap='RdYlGn', linewidths=0.2)  # data.corr() --> correlation matrix
    fig = plt.gcf()
    fig.set_size_inches(20, 16)
    #plt.show()
    fig.savefig('Correlation_before.png')

    # In[8]:
    fig = plt.gcf()
    datadf.hist(figsize=(18, 16), alpha=0.5, bins=50)
    plt.show()
    fig.savefig('histograms1.png')
    '''

    # In[9]:
    datadf.head(10)

    # In[10]:
    ## fill NaN for categorical
    #datadf['v_score_text'].fillna(datadf['v_score_text'].value_counts().index[0], inplace=True)
    #datadf['vr_charge_degree'].fillna(datadf['vr_charge_degree'].value_counts().index[0], inplace=True)
    #datadf['c_charge_desc'].fillna(datadf['c_charge_desc'].value_counts().index[0], inplace=True)
    #datadf['r_charge_desc'].fillna(datadf['r_charge_desc'].value_counts().index[0], inplace=True)
    #datadf['vr_charge_desc'].fillna(datadf['vr_charge_desc'].value_counts().index[0], inplace=True)

    # In[11]:
    '''
    datadf['vr_charge_degree'] = datadf['vr_charge_degree'].str.replace('[^a-zA-Z]', ' ')
    datadf['v_score_text'] = datadf['v_score_text'].str.replace('[^a-zA-Z]', ' ')
    '''

    # In[12]:
    if 'age' in datadf:
        datadf = datadf.drop('age', axis=1)

    # In[13]:
    encoder = OneHotEncoder([
        "sex", "race", "v_type_of_assessment", "age_cat",
        "type_of_assessment"
    ])  # ,"v_score_text","c_charge_desc","r_charge_desc","vr_charge_desc"
    encoder.fit(datadf)
    encoder.transform(datadf).shape
    encoder.transform(datadf).head(10)

    # In[14]:
    datadf = encoder.transform(datadf)
    print("DF shape >>>>>>>>>>>>>>>> ", datadf.shape)
    print("DF columns >>>>>>>>>>>>>>>> ", datadf.columns)

    # In[16]:
    '''# Set of unique values
    print(traindf['sex'].unique())
    print(traindf['age_cat'].unique())
    print(traindf['race'].unique())
    print(traindf['score_text'].unique())
    print(traindf['r_charge_desc'].unique())
    print(traindf['c_charge_desc'].unique())
    print(traindf['c_charge_degree'].unique())
    print(traindf['r_charge_degree'].unique())
    print(traindf['vr_charge_desc'].unique())
    print(traindf['v_type_of_assessment'].unique())
    print(traindf['v_score_text'].unique())
    traindf.columns
    '''

    # In[17]:
    # Test data stats
    datadf.describe()

    # In[18]:
    # age in days for the different date fields
    #datadf['dob'] = pd.to_datetime(datadf['dob'], dayfirst=True)
    datadf['compas_screening_date'] = pd.to_datetime(
        datadf['compas_screening_date'], dayfirst=True)
    datadf['c_offense_date'] = pd.to_datetime(datadf['c_offense_date'],
                                              dayfirst=True)
    datadf['c_arrest_date'] = pd.to_datetime(datadf['c_arrest_date'],
                                             dayfirst=True)
    datadf['r_offense_date'] = pd.to_datetime(datadf['r_offense_date'],
                                              dayfirst=True)
    datadf['vr_offense_date'] = pd.to_datetime(datadf['vr_offense_date'],
                                               dayfirst=True)
    datadf['v_screening_date'] = pd.to_datetime(datadf['v_screening_date'],
                                                dayfirst=True)
    datadf['screening_date'] = pd.to_datetime(datadf['screening_date'],
                                              dayfirst=True)
    datadf['c_jail_in'] = pd.to_datetime(datadf['c_jail_in'], dayfirst=True)
    #datadf['c_jail_out'] = pd.to_datetime(datadf['c_jail_out'], dayfirst=True)
    datadf['r_jail_in'] = pd.to_datetime(datadf['r_jail_in'], dayfirst=True)
    #datadf['r_jail_out'] = pd.to_datetime(datadf['r_jail_out'], dayfirst=True)

    ## ages
    #datadf['Age_in_days'] = (datadf['compas_screening_date'] - datadf['dob']) / timedelta(days=1)
    datadf['c_offense_age_in_days'] = (
        datadf['compas_screening_date'] -
        datadf['c_offense_date']) / timedelta(days=1)
    datadf['c_arrest_age_in_days'] = (
        datadf['compas_screening_date'] -
        datadf['c_arrest_date']) / timedelta(days=1)
    datadf['r_offense_age_in_days'] = (
        datadf['compas_screening_date'] -
        datadf['r_offense_date']) / timedelta(days=1)
    datadf['vr_offense_age_in_days'] = (
        datadf['compas_screening_date'] -
        datadf['vr_offense_date']) / timedelta(days=1)
    datadf['v_screening_age_in_days'] = (
        datadf['compas_screening_date'] -
        datadf['v_screening_date']) / timedelta(days=1)
    datadf['screening_age_in_days'] = (
        datadf['compas_screening_date'] -
        datadf['screening_date']) / timedelta(days=1)
    datadf['c_jail_in_age_in_days'] = (
        datadf['compas_screening_date'] -
        datadf['c_jail_in']) / timedelta(days=1)
    #datadf['c_jail_out_age_in_days'] = (datadf['compas_screening_date'] - datadf['c_jail_out']) / timedelta(days=1)
    datadf['r_jail_in_age_in_days'] = (
        datadf['compas_screening_date'] -
        datadf['r_jail_in']) / timedelta(days=1)
    #datadf['r_jail_out_age_in_days'] = (datadf['compas_screening_date'] - datadf['r_jail_out']) / timedelta(days=1)

    print("white_list ", white_list)
    if len(white_list) > 0:
        white_list.append('decile_score')
        datadf.drop(datadf.columns.difference(white_list),
                    axis=1,
                    inplace=True)
    print("datadf ", datadf.columns)

    ## drop all date columns
    date_cols = [
        'dob', 'compas_screening_date', 'c_offense_date', 'c_arrest_date',
        'r_offense_date', 'vr_offense_date', 'screening_date',
        'v_screening_date', 'c_jail_in', 'c_jail_out', 'r_jail_in',
        'r_jail_out'
    ]
    for col in date_cols:
        if col in datadf:
            datadf = datadf.drop(col, axis=1)

    # prediction column - textual (decile_score is the numeric equivalent)
    if 'score_text' in datadf:
        datadf = datadf.drop('score_text', axis=1)

    # In[19]:
    # stats of categorical features
    datadf.describe(include=['O'])

    # In[20]:
    print(datadf.shape)
    datadf.head(10)

    # In[21]:
    # For starters, fill every NaN value across the dataset with 0
    # (the mean-fill variant is kept commented out below).
    #datadf['r_jail_in_age_in_days'].fillna(datadf['r_jail_in_age_in_days'].dropna().mean(), inplace=True)
    datadf[:] = datadf[:].fillna(0)
    '''
    datadf['r_jail_in_age_in_days'].fillna(0)
    datadf['r_jail_out_age_in_days'].fillna(0)
    datadf['c_jail_in_age_in_days'].fillna(0)
    datadf['c_jail_out_age_in_days'].fillna(0)
    datadf['vr_offense_age_in_days'].fillna(0)
    datadf['r_offense_age_in_days'].fillna(0)
    datadf['c_arrest_age_in_days'].fillna(0)
    datadf['c_offense_age_in_days'].fillna(0)
    '''

    # In[22]:
    datadf.to_csv('datadf_dt.csv', index=False)

    # In[23]:
    # check if any null values are still present
    print(datadf.columns[datadf.isnull().any()].tolist())

    # In[24]:
    # sample data for a quick run
    ## TODO: remove next line
    traindf, testdf = train_test_split(datadf, random_state=42, test_size=0.3)
    print(traindf.shape)
    print(testdf.shape)

    # In[25]:
    from sklearn import preprocessing
    print(traindf.columns)
    print(traindf.columns[traindf.isnull().any()].tolist())

    # In[26]:
    ## Prediction ds_model - TRAIN DF
    print(traindf.columns)
    train_features = traindf.loc[:, traindf.columns != 'decile_score']
    print(train_features.columns.values)
    # extract label from the training set
    train_label = traindf.loc[:, traindf.columns == 'decile_score']
    print(train_label.columns)

    ## Prediction ds_model - TEST DF
    print(testdf.columns)
    test_features = testdf.loc[:, testdf.columns != 'decile_score']
    print(test_features.columns)
    print(test_features.head(10))
    # extract label from the test set
    test_label = testdf.loc[:, testdf.columns == 'decile_score']
    print(test_label.columns)

    # In[27]:
    # Train the model with the best parameters of RF
    # (best params for RF found using RandomizedSearchCV)
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA

    ds_model = make_pipeline(
        StandardScaler(with_std=True),
        OneVsRestClassifier(
            DecisionTreeClassifier(random_state=42,
                                   min_samples_split=9)))  # n_es = 200, lr 0.001
    '''
    ds_model = make_pipeline(StandardScaler(with_std=True, with_mean=True),
                             MLPClassifier(activation='relu', alpha=10.0, batch_size='auto', beta_1=0.9,
                                           beta_2=0.999, early_stopping=False, epsilon=1e-08,
                                           hidden_layer_sizes=(7, 7), learning_rate='adaptive',
                                           learning_rate_init=0.001, max_iter=500, momentum=0.9,
                                           nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
                                           solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
                                           warm_start=False))

    ds_model = make_pipeline(StandardScaler(with_std=True),
                             OneVsRestClassifier(
                                 ExtraTreesClassifier(n_estimators=98, min_samples_split=10,
                                                      max_leaf_nodes=8, max_features='log2',
                                                      max_depth=3, criterion='entropy')))
    '''

    ds_model.fit(train_features, train_label)
    train_pred = ds_model.predict(train_features)

    train_acc = metrics.accuracy_score(train_label, train_pred)
    print(train_acc)  # training accuracy score
    print(np.sqrt(mean_squared_error(train_label, train_pred)))  # training RMSE
    #print(roc_auc_score(train_label, train_pred))  # AUC-ROC values

    # In[30]:
    #test_pred = ds_model.predict_proba(testdf)  # test features are all in testdata
    test_pred = ds_model.predict(test_features)  # test features are all in testdata
    print(metrics.accuracy_score(train_label, train_pred))  # training accuracy score
    print(np.sqrt(mean_squared_error(train_label, train_pred)))  # training RMSE

    test_pred_prob = ds_model.predict_proba(test_features)
    print("ds_model.classes_ :: ", ds_model.classes_)
    print("****************************************************************************************")
    print("Predicted Output >>>>>>>>> ", test_pred_prob)  # predicted values
    print("****************************************************************************************")
    print("test_pred[:,1] >> ", test_pred_prob[:, 1][0])

    print(metrics.accuracy_score(test_label, test_pred))  # testing accuracy score
    print(np.sqrt(mean_squared_error(test_label, test_pred)))  # testing RMSE

    top_k_features = getSortedTopKfeatures(train_features, train_label)
    #top_k_features.append('decile_score')
    #return {"pred_accu": train_acc}
    all_feat = train_features.columns.tolist()
    all_feat = list(set(all_feat) - set(top_k_features))
    ans = {
        "pred_accu": train_acc,
        "topk": top_k_features,
        "all_feat": all_feat
    }
    return ans
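# getSortedTopKfeatures is called above but not defined in this excerpt; a
# minimal sketch of one plausible implementation (the tree-based importance
# ranking and the k=10 default are assumptions, not the original code):
def getSortedTopKfeatures(train_features, train_label, k=10):
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(random_state=42)
    rf.fit(train_features, train_label.values.ravel())
    ranked = sorted(zip(rf.feature_importances_, train_features.columns),
                    reverse=True)
    return [name for _, name in ranked[:k]]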
log_loss_gbdt = log_loss(y_valid, y_pred_gbdt)
print('log loss of GBDT on valid set: %.5f' % log_loss_gbdt)

## store the pre-trained gbdt_model
pickle.dump(gbdt_model, open(fp_gbdt_model, 'wb'))

del X_train_gbdt
del y_train_gbdt
gc.collect()

gbdt_model = pickle.load(open(fp_gbdt_model, 'rb'))

#----- data for LR (one-hot encoding with GBDT output) -----#
id_cols = []
for i in range(1, gbdt_model.get_params()['n_estimators'] + 1):
    id_cols.append('tree' + str(i))

oh_enc = OneHotEncoder(id_cols)

def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

## fit oh_enc on the train set
df_train_id = pd.DataFrame(gbdt_model.apply(X_train_org)[:, :, 0],
                           columns=id_cols,
                           dtype=np.int8)
for chunk in chunker(df_train_id, 50000):
    oh_enc.fit(chunk)
del df_train_id
del X_train_org
del y_train_org
gc.collect()
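# A hedged sketch of the LR stage this block prepares (the original training
# code is not shown here): transform leaf-index chunks with the fitted oh_enc
# and train a logistic model incrementally. df_lr_id and y_lr are
# hypothetical stand-ins for re-loaded leaf indices and labels, and
# SGDClassifier is an assumption ('log' instead of 'log_loss' on older
# sklearn versions).
from sklearn.linear_model import SGDClassifier
lr = SGDClassifier(loss='log_loss', random_state=42)
for X_chunk, y_chunk in zip(chunker(df_lr_id, 50000), chunker(y_lr, 50000)):
    lr.partial_fit(oh_enc.transform(X_chunk, dtype="coo").tocsr(),
                   y_chunk, classes=[0, 1])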
# Before running this code, make sure you have about 20 GB of free disk
# space; it will probably take about 1.5 h.

# importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from dummyPy import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# creating classes
labelencoder_c1 = LabelEncoder()
labelencoder_c2 = LabelEncoder()
labelencoder_c3 = LabelEncoder()
ohe = OneHotEncoder(['category_1', 'category_2', 'category_3'])
sc = StandardScaler()

# fitting the label encoders
d1 = pd.read_csv('train2.tsv', sep='\t')
d2 = pd.read_csv('test_stg2.tsv', sep='\t')
category_list = pd.DataFrame(
    pd.concat([d1['category_name'], d2['category_name']]))
category_list['category_name'] = category_list['category_name'].astype('str')
category_list['category_name'] = category_list['category_name'].replace(
    'nan', 'no_category1/no_category2/no_category3')
categories = category_list['category_name'].str.split('/', n=3, expand=True)
category_list['category_1'] = categories[0]
category_list['category_2'] = categories[1]
category_list['category_3'] = categories[2]
labelencoder_c1.fit(category_list['category_1'])
labelencoder_c2.fit(category_list['category_2'])
labelencoder_c3.fit(category_list['category_3'])
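# A hedged sketch of the step that would follow (an assumption; the original
# script continues beyond this excerpt): fit the chunk-capable one-hot
# encoder on the combined category frame, then transform train/test once the
# three split columns are added to them.
ohe.fit(category_list)
# e.g. dummies = ohe.transform(category_list)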