def knnTuning():
    train = getTrainingData('train.csv', visualize=False)
    X = train.drop(['Exited'], axis=1)
    sc = StandardScaler()
    X = sc.fit_transform(X)
    y = train.Exited

    # hold out 20% of the training data for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    params = {
        "n_neighbors": list(range(5, 131, 2)),
        "weights": ['uniform', 'distance']
    }
    model = neighbors.KNeighborsClassifier()
    grid_search_cv = GridSearchCV(model, params, verbose=1, n_jobs=-1,
                                  cv=3, scoring='accuracy')
    grid_search_cv.fit(X_train, y_train)
    # print(grid_search_cv.best_params_)

    print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=True)
    print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=False)
    ROC(grid_search_cv, X_train, y_train, X_test, y_test, train=True)
    ROC(grid_search_cv, X_train, y_train, X_test, y_test, train=False)

    results = pd.DataFrame(grid_search_cv.cv_results_)
    printFullRow(results[results['rank_test_score'] == 1])
    # best param setting: n_neighbors == 11/13, p == 2, weights == 'distance'
    return
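# --- Hedged sketch: knnTuning above depends on print_score / ROC helpers
# defined elsewhere in this project. The bodies below are a minimal guess
# at what such helpers could look like, assuming the
# (estimator, X_train, y_train, X_test, y_test, train=...) signature used
# above; they are illustrative, not the project's actual implementation.
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
import matplotlib.pyplot as plt

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    # Report accuracy plus a full classification report on one split.
    X, y, label = (X_train, y_train, "Train") if train else (X_test, y_test, "Test")
    pred = clf.predict(X)
    print(f"{label} accuracy: {accuracy_score(y, pred):.4f}")
    print(classification_report(y, pred))

def ROC(clf, X_train, y_train, X_test, y_test, train=True):
    # Plot an ROC curve from predicted probabilities on one split.
    X, y = (X_train, y_train) if train else (X_test, y_test)
    probs = clf.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y, probs)
    plt.plot(fpr, tpr, label=f"AUC = {auc(fpr, tpr):.3f}")
    plt.plot([0, 1], [0, 1], 'k--')
    plt.legend()
    plt.show()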
def main():
    # Make the world.
    # The initialise_screen method is a bit weird.
    s_length = width * (nrow + 1) * 3 / 4
    s_width = hex_utils.get_cross_width(width) * (nrow + 1)
    screen, background = initialise_screen(s_width, s_length, "Hex Snake")
    draw_grid(width, nrow, background)
    clock = pygame.time.Clock()
    score = 0

    # Make characters.
    snake = HexSnake(8, 1)
    apple = move_apple(HexApple(), snake)

    # Note: the tangle check fires one frame too late for a length-2 snake.
    while not snake.is_tangled():
        clock.tick(1)
        screen.blit(background, (0, 0))
        apple.draw(screen)
        snake.draw(screen)
        eaten = is_eating(snake, apple)  # is the snake touching the apple?
        snake.move_snake(get_control(), eating=eaten)
        if eaten:
            score += 1
            move_apple(apple, snake)
        pygame.display.flip()

    # Game over: show the final score once, then wait for the window to close.
    print_score(score, screen)
    pygame.display.flip()
    while True:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                return
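# --- Hedged sketch: move_apple is used above both to place the first apple
# and to respawn it after it is eaten. A plausible body, assuming
# hypothetical random_cell() and snake.occupies() helpers (neither appears
# in the original snippet):
def move_apple(apple, snake):
    # Respawn the apple on a random free cell, i.e. one the snake
    # does not currently occupy.
    cell = random_cell()          # assumed helper: pick a random grid cell
    while snake.occupies(cell):   # assumed helper: is this cell on the snake?
        cell = random_cell()
    apple.position = cell
    return apple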
def adaboost(X_train, X_test, y_train, y_test):
    ab = AdaBoostClassifier(n_estimators=100, learning_rate=1, random_state=42)
    ab.fit(X_train, y_train)
    print_score(ab, X_train, y_train, X_test, y_test, train=True)
    print_score(ab, X_train, y_train, X_test, y_test, train=False)
    ROC(ab, X_train, y_train, X_test, y_test, train=True)
    ROC(ab, X_train, y_train, X_test, y_test, train=False)
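# --- Hedged usage example: a possible call site for adaboost, assuming the
# same scaled X / y produced in knnTuning; the split parameters are
# illustrative, not the project's actual settings.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
adaboost(X_train, X_test, y_train, y_test)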
def lstm_test(sents, tags):
    model = LSTM()
    model.load_state_dict(torch.load("blstm.pkl"))
    model.eval()  # disable dropout etc. for inference
    tags = vec_flat(tags)
    tags_p = []
    for s in sents:
        # Run one sentence through the model with a batch dimension of 1.
        out = model(torch.unsqueeze(a2ft(s), 0))
        out = out.data.numpy()
        for out_i in out:
            max_idx = np.argmax(out_i)
            tags_p.append(max_idx)
    utils.print_score(tags, tags_p)
    return utils.get_score(tags, tags_p)
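# --- Hedged sketch: a2ft and vec_flat are small helpers from the
# surrounding module. Plausible definitions, assuming a2ft means
# "array to float tensor" and vec_flat flattens per-sentence tag vectors;
# these are guesses, not the original code.
import numpy as np
import torch

def a2ft(a):
    # Convert a numpy array (or nested list) to a float32 tensor.
    return torch.tensor(np.asarray(a), dtype=torch.float32)

def vec_flat(vecs):
    # Flatten a list of per-sentence tag sequences into one flat list.
    return [t for vec in vecs for t in vec]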
def get_baseline(to_type='euler'):
    directory = DATA_DIR + to_type + '/valid/'
    actions_dict = {}
    for action in ACTIONS:
        cond_seq = __get_data([glob.glob(directory + action + '_*1-cond.npy')[0],
                               glob.glob(directory + action + '_*2-cond.npy')[0]])
        gt_seq = __get_data([glob.glob(directory + action + '_*1-gt.npy')[0],
                             glob.glob(directory + action + '_*2-gt.npy')[0]])
        actions_dict[action] = (cond_seq, gt_seq)

    # Now, same as in
    # https://github.com/una-dinosauria/human-motion-prediction/blob/master/src/baselines.py#L184
    errs_constant_frame = __running_average(actions_dict, ACTIONS, 1, to_type)
    running_average_2 = __running_average(actions_dict, ACTIONS, 2, to_type)
    running_average_4 = __running_average(actions_dict, ACTIONS, 4, to_type)

    utils.print_score(errs_constant_frame, 'Zero-velocity (running avg. 1)', ACTIONS)
    print()
    utils.print_score(running_average_2, 'Running avg. 2', ACTIONS)
    print()
    utils.print_score(running_average_4, 'Running avg. 4', ACTIONS)
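# --- Hedged sketch: the referenced baseline predicts every future frame as
# the average of the last n conditioning frames; with n == 1 this is the
# classic zero-velocity baseline. A minimal illustration of that idea,
# independent of the module's private __running_average signature:
import numpy as np

def running_average_prediction(cond_seq, horizon, n):
    # Hold the mean of the last `n` conditioning frames constant for
    # `horizon` future frames. cond_seq has shape (seq_len, dims).
    last = np.mean(cond_seq[-n:], axis=0)
    return np.tile(last, (horizon, 1))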
# Note: the opening of this block was truncated; the first layer is
# reconstructed to match the stride-1 / stride-2 pattern of the later pairs.
net = Sequential()
net.add(Conv2D(64, kernel_size=4,
               strides=1, activation='relu', input_shape=image_shape))
net.add(Conv2D(64, kernel_size=4, strides=2, activation='relu'))
net.add(Dropout(0.5))
net.add(Conv2D(128, kernel_size=4, strides=1, activation='relu'))
net.add(Conv2D(128, kernel_size=4, strides=2, activation='relu'))
net.add(Dropout(0.5))
net.add(Conv2D(256, kernel_size=4, strides=1, activation='relu'))
net.add(Conv2D(256, kernel_size=4, strides=2, activation='relu'))
net.add(Flatten())
net.add(Dropout(0.5))
net.add(Dense(512, activation='relu'))
net.add(Dense(n_classes, activation='softmax'))
net.compile(optimizer='adam', loss='categorical_crossentropy',
            metrics=["accuracy"])
history = net.fit_generator(train_generator, epochs=100, steps_per_epoch=100,
                            verbose=1, validation_data=val_generator)
print_score(net, train_generator, val_generator)
save_history(history, 'history_2.pk')
net.save('model_2.h5')
del net
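# --- Hedged sketch: train_generator and val_generator are assumed to come
# from Keras's ImageDataGenerator. One way they might be built; the
# directory names and batch size are placeholders, not the project's setup.
from keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(rescale=1. / 255)  # scale pixels to [0, 1]
train_generator = datagen.flow_from_directory(
    'data/train', target_size=image_shape[:2], batch_size=32,
    class_mode='categorical')
val_generator = datagen.flow_from_directory(
    'data/val', target_size=image_shape[:2], batch_size=32,
    class_mode='categorical')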
def lgbm(train, y, test, features, model_params, WANDB_USE,
         categorical_features="auto", folds=10):
    if WANDB_USE:
        wandb.config.update(model_params)

    x_train = train[features]
    x_test = test[features]

    # Holds the averaged test-set predictions.
    test_preds = np.zeros(x_test.shape[0])
    # Holds the Out-Of-Fold validation predictions.
    y_oof = np.zeros(x_train.shape[0])
    # Holds the mean validation score across folds.
    score = 0

    # DataFrame for per-fold feature importances.
    feature_importance = pd.DataFrame()
    feature_importance["feature"] = features

    # Stratified K-Fold
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
    # TimeSeriesSplit
    # skf = TimeSeriesSplit(n_splits=24)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        # Split the train data into train/validation indices.
        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]
        print(f"fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}")

        # Build the LightGBM datasets.
        dtrain = lgb.Dataset(x_tr, label=y_tr)
        dvalid = lgb.Dataset(x_val, label=y_val)

        # Train the LightGBM model.
        clf = lgb.train(
            model_params,
            dtrain,
            valid_sets=[dtrain, dvalid],  # track validation performance
            categorical_feature=categorical_features,
            verbose_eval=200,
        )

        # Predict on the validation fold.
        val_preds = clf.predict(x_val)
        # Store the predictions at the validation indices.
        y_oof[val_idx] = val_preds

        # Per-fold validation score.
        print(f"Fold {fold + 1} | AUC: {roc_auc_score(y_val, val_preds)}")
        print("-" * 80)
        if WANDB_USE:
            wandb.log({"AUC": roc_auc_score(y_val, val_preds)})

        # Accumulate the mean validation score across folds.
        score += roc_auc_score(y_val, val_preds) / folds

        # Predict on the test data and average across folds.
        test_preds += clf.predict(x_test) / folds

        # Store this fold's feature importances.
        feature_importance[f"fold_{fold+1}"] = clf.feature_importance()

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    # Mean per-fold validation score and Out-Of-Fold validation score.
    print(f"\nMean AUC = {score}")
    print(f"OOF AUC = {roc_auc_score(y, y_oof)}")

    # Print the evaluation metrics.
    print_score(y, y_oof, WANDB_USE)

    # Average the per-fold feature importances.
    fi_cols = [col for col in feature_importance.columns if "fold_" in col]
    feature_importance["importance"] = feature_importance[fi_cols].mean(axis=1)

    # Print the feature importances.
    print(feature_importance)
    feature_importance.to_csv(
        f'/opt/ml/code/output/fi_lgbm_{FEEATURE_FILE_NAME}.csv')
    # plot_feature_importances(feature_importance)
    # plot_roc_curve(y, y_oof)

    return y_oof, test_preds
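# --- Hedged sketch: this file's print_score takes (y_true, oof_preds,
# wandb_flag). A plausible body, assuming it reports AUC plus thresholded
# metrics; the exact metric set is a guess, not the original helper.
import wandb  # assumed available, matching the WANDB_USE flag above
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def print_score(y_true, y_pred_proba, wandb_use, threshold=0.5):
    # Report AUC on the probabilities plus accuracy/F1 at a fixed
    # threshold; optionally mirror the numbers to Weights & Biases.
    y_pred = (y_pred_proba >= threshold).astype(int)
    metrics = {
        "OOF AUC": roc_auc_score(y_true, y_pred_proba),
        "OOF accuracy": accuracy_score(y_true, y_pred),
        "OOF F1": f1_score(y_true, y_pred),
    }
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")
    if wandb_use:
        wandb.log(metrics)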
def xgboost(train, y, test, features, model_params, WANDB_USE, folds=10):
    if WANDB_USE:
        wandb.config.update(model_params)

    x_train = train[features]
    x_test = test[features]

    # Holds the averaged test-set predictions.
    test_preds = np.zeros(x_test.shape[0])
    # Holds the Out-Of-Fold validation predictions.
    y_oof = np.zeros(x_train.shape[0])
    # Holds the mean validation score across folds.
    score = 0

    # DataFrame for per-fold feature importances.
    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    # Stratified K-Fold
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        # Split the train data into train/validation indices.
        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]
        print(f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}')

        # Build the XGBoost datasets.
        dtrain = xgb.DMatrix(x_tr, label=y_tr)
        dvalid = xgb.DMatrix(x_val, label=y_val)

        # Train the XGBoost model.
        clf = xgb.train(
            model_params,
            dtrain,
            num_boost_round=10000,  # number of trees
            evals=[(dtrain, 'train'), (dvalid, 'valid')],  # track validation performance
            verbose_eval=200,
            early_stopping_rounds=100)

        # Predict on the validation fold.
        val_preds = clf.predict(dvalid)
        # Store the predictions at the validation indices.
        y_oof[val_idx] = val_preds

        # Per-fold validation score.
        print(f"Fold {fold + 1} | AUC: {roc_auc_score(y_val, val_preds)}")
        print('-' * 80)
        if WANDB_USE:
            wandb.log({"AUC": roc_auc_score(y_val, val_preds)})

        # Accumulate the mean validation score across folds.
        score += roc_auc_score(y_val, val_preds) / folds

        # Predict on the test data and average across folds.
        test_preds += clf.predict(xgb.DMatrix(x_test)) / folds

        # Store this fold's feature importances.
        fi_tmp = pd.DataFrame.from_records([clf.get_score()]).T.reset_index()
        fi_tmp.columns = ['feature', f'fold_{fold+1}']
        feature_importance = pd.merge(feature_importance, fi_tmp, on='feature')

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean AUC = {score}")  # mean per-fold validation score
    print(f"OOF AUC = {roc_auc_score(y, y_oof)}")  # Out-Of-Fold validation score

    # Print the evaluation metrics.
    print_score(y, y_oof, WANDB_USE)

    # Average the per-fold feature importances.
    fi_cols = [col for col in feature_importance.columns if 'fold_' in col]
    feature_importance['importance'] = feature_importance[fi_cols].mean(axis=1)
    feature_importance.to_csv(
        f'/opt/ml/code/output/fi_xgb_{FEEATURE_FILE_NAME}.csv')

    return y_oof, test_preds
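# --- Hedged usage example: an illustrative model_params dictionary for the
# xgboost function above. The values are placeholders, not the project's
# tuned settings; eval_metric='auc' matches the AUC-based early stopping.
xgb_params = {
    "objective": "binary:logistic",  # binary target, probability output
    "eval_metric": "auc",
    "eta": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42,
}
y_oof, test_preds = xgboost(train, y, test, features, xgb_params, WANDB_USE=False)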
def cat(train, y, test, features, model_params, WANDB_USE,
        categorical_features=None, folds=10):
    if WANDB_USE:
        wandb.config.update(model_params)

    x_train = train[features]
    x_test = test[features]

    # Holds the averaged test-set predictions.
    test_preds = np.zeros(x_test.shape[0])
    # Holds the Out-Of-Fold validation predictions.
    y_oof = np.zeros(x_train.shape[0])
    # Holds the mean validation score across folds.
    score = 0

    # DataFrame for per-fold feature importances.
    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    # Stratified K-Fold
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        # Split the train data into train/validation indices.
        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]
        print(f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}')

        # Train the CatBoost model.
        clf = CatBoostClassifier(**model_params)
        clf.fit(
            x_tr,
            y_tr,
            eval_set=(x_val, y_val),  # track validation performance
            cat_features=categorical_features,
            use_best_model=True,
            verbose=True)

        # Predict on the validation fold.
        val_preds = clf.predict_proba(x_val)[:, 1]
        # Store the predictions at the validation indices.
        y_oof[val_idx] = val_preds

        # Per-fold validation score.
        print(f"Fold {fold + 1} | AUC: {roc_auc_score(y_val, val_preds)}")
        print('-' * 80)
        if WANDB_USE:
            wandb.log({"AUC": roc_auc_score(y_val, val_preds)})

        # Accumulate the mean validation score across folds.
        score += roc_auc_score(y_val, val_preds) / folds

        # Predict on the test data and average across folds.
        test_preds += clf.predict_proba(x_test)[:, 1] / folds

        # Store this fold's feature importances.
        feature_importance[f'fold_{fold+1}'] = clf.feature_importances_

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean AUC = {score}")  # mean per-fold validation score
    print(f"OOF AUC = {roc_auc_score(y, y_oof)}")  # Out-Of-Fold validation score

    # Print the evaluation metrics.
    print_score(y, y_oof, WANDB_USE)

    # Average the per-fold feature importances.
    fi_cols = [col for col in feature_importance.columns if 'fold_' in col]
    feature_importance['importance'] = feature_importance[fi_cols].mean(axis=1)
    feature_importance.to_csv(
        f'/opt/ml/code/output/fi_cat_{FEEATURE_FILE_NAME}.csv')

    return y_oof, test_preds
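# --- Hedged sketch: since lgbm, xgboost, and cat all return
# (y_oof, test_preds) on the same folds, their outputs can be blended.
# A simple equal-weight average; the *_oof / *_test variables are
# hypothetical names for each model's returned predictions.
y_oof_blend = (lgb_oof + xgb_oof + cat_oof) / 3
test_blend = (lgb_test + xgb_test + cat_test) / 3
print(f"Blended OOF AUC = {roc_auc_score(y, y_oof_blend)}")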
def make_cat_oof_prediction(train, y, test, features, categorical_features, folds=10):
    x_train = train[features]
    x_test = test[features]

    # Holds the averaged test-set predictions.
    test_preds = np.zeros(x_test.shape[0])
    # Holds the Out-Of-Fold validation predictions.
    y_oof = np.zeros(x_train.shape[0])
    # Holds the mean validation score across folds.
    score = 0

    # DataFrame for per-fold feature importances.
    fi = pd.DataFrame()
    fi['feature'] = features

    # Stratified K-Fold
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=CFG.seed)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        # Split the train data into train/validation indices.
        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]
        print(f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}')

        # Train the CatBoost model.
        model = train_model(x_tr, y_tr, x_val, y_val, categorical_features)

        # Predict on the validation fold.
        val_preds = np.array(model.predict_proba(x_val))[:, 1]
        # Store the predictions at the validation indices.
        y_oof[val_idx] = val_preds

        # Per-fold validation score.
        print(f"Fold {fold + 1}")
        print_score(y_val, val_preds)
        # print(f"parameters : \n{model.get_all_params()}")
        print('-' * 80)

        # Accumulate the mean validation score across folds.
        score += roc_auc_score(y_val, val_preds) / folds

        # Predict on the test data and average across folds.
        test_preds += np.array(model.predict_proba(x_test))[:, 1] / folds

        # Store this fold's feature importances.
        fi[f'fold_{fold+1}'] = model.get_feature_importance()

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean AUC = {score}")  # mean per-fold validation score
    print(f"OOF AUC = {roc_auc_score(y, y_oof)}")  # Out-Of-Fold validation score

    # ROC curve
    fpr, tpr, thresholds = roc_curve(y, y_oof)
    plt.plot(fpr, tpr, linewidth=2)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('FPR = 1 - TNR')
    plt.ylabel('TPR = Recall')
    plt.savefig(os.path.join(CFG.docs_path, 'ROC_curve_oof.png'))

    # Average the per-fold feature importances.
    fi_cols = [col for col in fi.columns if 'fold_' in col]
    fi['importance'] = fi[fi_cols].mean(axis=1)

    return y_oof, test_preds, fi
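# --- Hedged usage example: a possible call site for make_cat_oof_prediction.
# The submission step is a sketch; the 'id' column name and output path are
# assumptions, not taken from the original code.
y_oof, test_preds, fi = make_cat_oof_prediction(
    train, y, test, features, categorical_features, folds=10)

submission = test[['id']].copy()        # assumed identifier column
submission['prediction'] = test_preds   # fold-averaged test probabilities
submission.to_csv('submission.csv', index=False)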