def run(fold):
    """Train an XGBoost classifier on all folds but one and print the AUC.

    Reads the CSV at ``config.train_data_folds``, maps the ``income`` target
    to 0/1, runs ``utils.feature_engineering`` over the categorical columns,
    label-encodes every non-numeric feature, fits the model on the rows whose
    ``kfold`` differs from ``fold`` and scores it on the rows whose ``kfold``
    equals ``fold``.

    Args:
        fold: Value compared against the ``kfold`` column; matching rows form
            the validation set, all other rows form the training set.

    Returns:
        None. The validation AUC is printed.
    """
    # reading the data
    df = pd.read_csv(config.train_data_folds)

    num_cols = [
        'age',
        'fnlwgt',
        'capital.gain',
        'capital.loss',
        'hours.per.week',
    ]

    target_map = {'<=50K': 0, '>50K': 1}
    df.income = df.income.map(target_map)

    # categorical columns: everything that is neither numeric, target nor fold
    cat_features = [
        f for f in df.columns if f not in num_cols + ['income', 'kfold']
    ]

    # feature engineering
    df = utils.feature_engineering(df, cat_features)

    # selecting the features (recomputed because feature engineering may have
    # changed the set of columns)
    features = [f for f in df.columns if f not in ['income', 'kfold']]

    # Decide once which features are non-numeric. The previous version tested
    # the leftover loop variable `col` inside the `feat` loop, so the decision
    # for every feature depended on the last column only.
    cat_cols = [f for f in features if f not in num_cols]

    for feat in cat_cols:
        # treating NaNs
        # NOTE(review): casting to str before fillna may already turn missing
        # values into a literal string depending on the pandas version --
        # confirm whether fillna should come first.
        # Plain column assignment replaces the column (and its dtype);
        # `df.loc[:, feat] = ...` writes in place on pandas >= 2.0 and would
        # keep the encoded integers in an object-dtype column.
        df[feat] = df[feat].astype(str).fillna('NONE')

        # label encoding
        lbl_enc = preprocessing.LabelEncoder()
        df[feat] = lbl_enc.fit_transform(df[feat])

    # splitting the data based on the folds created
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    x_train = df_train[features].values
    x_valid = df_valid[features].values

    # xgb
    model = xgb.XGBClassifier(n_jobs=-1, max_depth=7, n_estimators=200)
    model.fit(x_train, df_train.income.values)

    # AUC, computed from the predicted probability of class 1
    valid_pred = model.predict_proba(x_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.income.values, valid_pred)

    print('Fold: ', fold, 'Validation AUC: ', auc)
def NB_model():
    """Fit a ``GaussianNB`` classifier and print its accuracy on the test set.

    Loads the train/test data through ``utils.prepare_data``, prints the size
    of each set, builds the feature matrices with
    ``utils.feature_engineering``, prints the number of feature columns, then
    fits the model on the training features and the ``'label'`` column and
    prints the accuracy of its predictions on the test features.

    Returns:
        None. All results are printed.
    """
    train_data, test_data = utils.prepare_data()

    # Report how many records each set holds (messages are in Chinese).
    n_train = len(train_data)
    n_test = len(test_data)
    print('训练集有{}条记录。'.format(n_train))
    print('测试集有{}条记录。'.format(n_test))

    X_train, X_test = utils.feature_engineering(train_data, test_data)
    n_features = X_train.shape[1]
    print('共有{}维特征。'.format(n_features))

    # Targets are read after feature engineering, as in the original flow.
    y_train = train_data['label'].values
    y_test = test_data['label'].values

    # data modelling
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    score = accuracy_score(y_test, predictions)
    print('准确率:', score)
def SVC_model():
    """Fit ``SVC`` classifiers for several ``C`` values and print accuracies.

    Loads the train/test data through ``utils.prepare_data``, prints the size
    of each set, builds the feature matrices with
    ``utils.feature_engineering``, prints the number of feature columns, then
    for each ``C`` in ``0.0001``, ``1`` and ``10000`` fits an ``SVC`` on the
    training features and the ``'label'`` column and prints the accuracy of
    its predictions on the test features.

    Returns:
        None. All results are printed.
    """
    train_data, test_data = utils.prepare_data()

    # Report how many records each set holds (messages are in Chinese).
    n_train = len(train_data)
    n_test = len(test_data)
    print('训练集有{}条记录。'.format(n_train))
    print('测试集有{}条记录。'.format(n_test))

    X_train, X_test = utils.feature_engineering(train_data, test_data)
    n_features = X_train.shape[1]
    print('共有{}维特征。'.format(n_features))

    # Targets are read after feature engineering, as in the original flow.
    y_train = train_data['label'].values
    y_test = test_data['label'].values

    # data modelling: one independent model per C value, in this order
    for penalty in (0.0001, 1, 10000):
        classifier = SVC(C=penalty)
        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_test)

        score = accuracy_score(y_test, predictions)
        print('准确率:', score)
data_chunk = data_chunk[data_chunk.object_id != arr[len(arr) - 1]] data_chunk = data_chunk.reset_index(drop=True) meta_chunk = test_meta[test_meta['object_id'].isin( data_chunk['object_id'].unique())] meta_chunk = meta_chunk.reset_index(drop=True) g_data, eg_data, g_meta, eg_meta = utils.gal_split_data( data_chunk, meta_chunk, False) g_features = None eg_features = None if g_meta.shape[0] > 0: #make meta not drop object_id in the feature engineering function g_features = utils.feature_engineering(g_data, g_meta, False) if i_c == 0: g_features.to_csv('test_g_features.csv', header=True, mode='a', index=False) else: g_features.to_csv('test_g_features.csv', header=False, mode='a', index=False) if eg_meta.shape[0] > 0: eg_features = utils.feature_engineering(eg_data, eg_meta, False) if i_c == 0:
import utils

# Alternative data location, kept for reference:
# path_to_data = '/courses/cs342/Assignment2/'
path_to_data = ''


def _train_subset(series, meta, target, flag):
    """Run the feature -> target -> standardize -> train steps for one subset.

    The steps run in the same order as the original inline script:
    ``utils.feature_engineering``, ``utils.preprocess_target``,
    ``utils.standardize_data`` and finally ``utils.train_mlp``.

    ``flag`` is forwarded unchanged as the last argument of
    ``utils.train_mlp``; the script passes ``True`` for the ``g_*`` subset and
    ``False`` for the ``eg_*`` subset (presumably galactic / extragalactic --
    confirm against ``utils``).
    """
    features = utils.feature_engineering(series, meta)
    wtable, labels, classes, target_map = utils.preprocess_target(target)
    features = utils.standardize_data(features)
    utils.train_mlp(features, wtable, labels, classes, target_map, flag)


train, train_meta = utils.load_train(path_to_data)

# With True as the third argument the split is unpacked into six values:
# data, metadata and target for each of the two subsets.
split = utils.gal_split_data(train, train_meta, True)
g_train, eg_train, g_meta, eg_meta, g_target, eg_target = split

# The "g" subset is processed and trained first, then the "eg" subset.
_train_subset(g_train, g_meta, g_target, True)
_train_subset(eg_train, eg_meta, eg_target, False)