Example #1
def knnTuning():
    train = getTrainingData('train.csv', visualize=False)
    X = train.drop(['Exited'], axis=1)
    sc = StandardScaler()
    X = sc.fit_transform(X)
    y = train.Exited
    # hold out 20% of the training data for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)
    params = {
        "n_neighbors": list(range(5, 131, 2)),
        "weights": ['uniform', 'distance']
    }
    model = neighbors.KNeighborsClassifier()
    grid_search_cv = GridSearchCV(model,
                                  params,
                                  verbose=1,
                                  n_jobs=-1,
                                  cv=3,
                                  scoring='accuracy')
    grid_search_cv.fit(X_train, y_train)
    # print(grid_search_cv.best_params_)
    print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=True)
    print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=False)
    ROC(grid_search_cv, X_train, y_train, X_test, y_test, train=True)
    ROC(grid_search_cv, X_train, y_train, X_test, y_test, train=False)
    results = pd.DataFrame(grid_search_cv.cv_results_)
    printFullRow(results[results['rank_test_score'] == 1])
    # best settings found: n_neighbors == 11 or 13, p == 2 (default), weights == 'distance'
    return
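These examples lean on a shared print_score helper that is never shown. Below is a minimal sketch of what it might look like for the sklearn-based examples; only the signature comes from the call sites above, the body is an assumption:

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    # Hypothetical helper: report metrics on either the train or the test split.
    if train:
        X, y, label = X_train, y_train, "Train"
    else:
        X, y, label = X_test, y_test, "Test"
    pred = clf.predict(X)
    print(f"{label} accuracy: {accuracy_score(y, pred):.4f}")
    print(classification_report(y, pred))
    print(confusion_matrix(y, pred))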
Example #2
def main():
    # build the world
    # note: initialise_screen's argument order is a little awkward
    s_length = width*(nrow+1)*3/4
    s_width = hex_utils.get_cross_width(width)*(nrow + 1)
    screen, background = initialise_screen(s_width, s_length, "Hex Snake")
    draw_grid(width, nrow, background)
    clock = pygame.time.Clock()
    score = 0

    # Make characters
    snake = HexSnake(8, 1)
    apple = move_apple(HexApple(), snake)

    # note: tangle detection fires one move too late for a length-2 snake
    while not snake.is_tangled():
        clock.tick(1)
        screen.blit(background, (0, 0))
        apple.draw(screen)
        snake.draw(screen)

        eaten = is_eating(snake, apple)
        # advance the snake, growing it if it just ate
        snake.move_snake(get_control(), eating=eaten)
        if eaten:
            score += 1
            move_apple(apple, snake)

        pygame.display.flip()

    # game over: keep showing the final score until the window is closed
    while True:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                return
        print_score(score, screen)
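In this example print_score has a different, pygame-flavoured signature. A minimal sketch, assuming it only renders the final score onto the screen (the rendering details are guesses):

def print_score(score, screen):
    # Hypothetical helper: draw the final score in the centre of the screen.
    font = pygame.font.Font(None, 48)  # default font, size 48
    text = font.render(f"Score: {score}", True, (255, 255, 255))
    screen.blit(text, text.get_rect(center=screen.get_rect().center))
    pygame.display.flip()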
Example #3
def adaboost(X_train, X_test, y_train, y_test):
    ab = AdaBoostClassifier(n_estimators=100, learning_rate=1, random_state=42)
    ab.fit(X_train, y_train)
    print_score(ab, X_train, y_train, X_test, y_test, train=True)
    print_score(ab, X_train, y_train, X_test, y_test, train=False)
    ROC(ab, X_train, y_train, X_test, y_test, train=True)
    ROC(ab, X_train, y_train, X_test, y_test, train=False)
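The ROC helper shared by Examples #1 and #3 is also not shown. A plausible sketch with sklearn and matplotlib, assuming the classifier exposes predict_proba:

import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve

def ROC(clf, X_train, y_train, X_test, y_test, train=True):
    # Hypothetical helper: plot the ROC curve for the chosen split.
    X, y = (X_train, y_train) if train else (X_test, y_test)
    probs = clf.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y, probs)
    plt.plot(fpr, tpr, label=f"AUC = {auc(fpr, tpr):.3f}")
    plt.plot([0, 1], [0, 1], 'k--')  # chance line
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.legend()
    plt.show()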
Example #4
def lstm_test(sents, tags):
    model = LSTM()
    model.load_state_dict(torch.load("blstm.pkl"))
    model.eval()  # inference mode (disables dropout)
    tags = vec_flat(tags)  # flatten the gold tags into one sequence
    tags_p = []
    for s in sents:
        # run the model on one sentence and take the argmax tag per token
        out = model(torch.unsqueeze(a2ft(s), 0))
        out = out.data.numpy()
        for out_i in out:
            max_idx = np.argmax(out_i)
            tags_p.append(max_idx)
    utils.print_score(tags, tags_p)
    return utils.get_score(tags, tags_p)
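Here utils.print_score compares two flat tag sequences. A minimal stand-in, assuming it reports token-level accuracy plus a per-class breakdown:

from sklearn.metrics import accuracy_score, classification_report

def print_score(y_true, y_pred):
    # Hypothetical stand-in for utils.print_score(tags, tags_p).
    print(f"Token accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(classification_report(y_true, y_pred))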
Example #5
def get_baseline(to_type='euler'):
	directory = DATA_DIR+to_type+'/valid/'

	actions_dict = {}

	for action in ACTIONS:
		cond_seq = __get_data([glob.glob(directory+action+'_*1-cond.npy')[0],
					glob.glob(directory+action+'_*2-cond.npy')[0]])
		gt_seq = __get_data([glob.glob(directory+action+'_*1-gt.npy')[0],
				glob.glob(directory+action+'_*2-gt.npy')[0]])
		actions_dict[action] = (cond_seq, gt_seq)

	# now, same as in
	# https://github.com/una-dinosauria/human-motion-prediction/blob/master/src/baselines.py#L184
	errs_constant_frame = __running_average(actions_dict, ACTIONS, 1, to_type)
	running_average_2 = __running_average(actions_dict, ACTIONS, 2, to_type)
	running_average_4 = __running_average(actions_dict, ACTIONS, 4, to_type)

	utils.print_score(errs_constant_frame, 'Zero-velocity (running avg. 1)', ACTIONS)
	print('')
	utils.print_score(running_average_2, 'Running avg. 2', ACTIONS)
	print('')
	utils.print_score(running_average_4, 'Running avg. 4', ACTIONS)
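In the motion-prediction baseline, utils.print_score takes a dict of per-action errors plus a label. A rough sketch, assuming errs maps each action name to an array of frame-wise errors (the report format is an assumption):

import numpy as np

def print_score(errs, name, actions):
    # Hypothetical stand-in: print the mean error per action under a heading.
    print(name)
    for action in actions:
        print('%-16s %.4f' % (action, np.mean(errs[action])))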
Example #6
net = Sequential()  # assumed: the snippet's opening was truncated; the first layer mirrors the pattern of the later ones
net.add(Conv2D(64, kernel_size=4,
           strides=1,
           activation='relu',
           input_shape=image_shape))
net.add(Conv2D(64, kernel_size=4, strides=2, activation='relu'))
net.add(Dropout(0.5))
net.add(Conv2D(128, kernel_size=4, strides=1, activation='relu'))
net.add(Conv2D(128, kernel_size=4, strides=2, activation='relu'))
net.add(Dropout(0.5))
net.add(Conv2D(256, kernel_size=4, strides=1, activation='relu'))
net.add(Conv2D(256, kernel_size=4, strides=2, activation='relu'))
net.add(Flatten())
net.add(Dropout(0.5))
net.add(Dense(512, activation='relu'))
net.add(Dense(n_classes, activation='softmax'))
net.compile(optimizer='adam',
            loss='categorical_crossentropy',
            metrics=["accuracy"])

history = net.fit_generator(train_generator,
                            epochs=100,
                            steps_per_epoch=100,
                            verbose=1,
                            validation_data=val_generator)

print_score(net, train_generator, val_generator)

save_history(history, 'history_2.pk')

net.save('model_2.h5')

del net
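This example's print_score takes the compiled network plus the two generators. A minimal sketch, assuming it just evaluates both and prints loss and accuracy (with the older fit_generator-era API this would be evaluate_generator; modern tf.keras accepts generators in evaluate):

def print_score(net, train_generator, val_generator):
    # Hypothetical helper: evaluate the model on both generators.
    for label, gen in [("Train", train_generator), ("Validation", val_generator)]:
        loss, acc = net.evaluate(gen, verbose=0)
        print(f"{label}: loss={loss:.4f}, accuracy={acc:.4f}")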
Example #7
def lgbm(train,
         y,
         test,
         features,
         model_params,
         WANDB_USE,
         categorical_features="auto",
         folds=10):
    if WANDB_USE:
        wandb.config.update(model_params)

    x_train = train[features]
    x_test = test[features]

    # array to hold the averaged test-set predictions
    test_preds = np.zeros(x_test.shape[0])

    # array to hold the out-of-fold (OOF) validation predictions
    y_oof = np.zeros(x_train.shape[0])

    # running mean of the per-fold validation scores
    score = 0

    # DataFrame to collect per-fold feature importances
    feature_importance = pd.DataFrame()
    feature_importance["feature"] = features

    # Stratified K Fold
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

    # TimeSeriesSplit
    # skf = TimeSeriesSplit(n_splits=24)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        # split the training data by train/validation indices
        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx,
                                                                 features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(
            f"fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}"
        )

        # declare the LightGBM datasets
        dtrain = lgb.Dataset(x_tr, label=y_tr)
        dvalid = lgb.Dataset(x_val, label=y_val)

        # train the LightGBM model
        clf = lgb.train(
            model_params,
            dtrain,
            valid_sets=[dtrain, dvalid],  # track validation performance during training
            categorical_feature=categorical_features,
            verbose_eval=200,
        )

        # predict on the validation split
        val_preds = clf.predict(x_val)

        # store the predictions at the validation indices
        y_oof[val_idx] = val_preds

        # report the per-fold validation score
        print(f"Fold {fold + 1} | AUC: {roc_auc_score(y_val, val_preds)}")
        print("-" * 80)
        if WANDB_USE:
            wandb.log({"AUC": roc_auc_score(y_val, val_preds)})

        # accumulate the per-fold score into the running mean
        score += roc_auc_score(y_val, val_preds) / folds

        # predict on the test set and average over folds
        test_preds += clf.predict(x_test) / folds

        # store this fold's feature importances
        feature_importance[f"fold_{fold+1}"] = clf.feature_importance()

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    # report the mean per-fold score and the out-of-fold score
    print(f"\nMean AUC = {score}")
    print(f"OOF AUC = {roc_auc_score(y, y_oof)}")

    # print the evaluation metrics
    print_score(y, y_oof, WANDB_USE)

    # average the per-fold feature importances
    fi_cols = [col for col in feature_importance.columns if "fold_" in col]
    feature_importance["importance"] = feature_importance[fi_cols].mean(axis=1)

    # print the feature importances and save them to CSV
    print(feature_importance)
    feature_importance.to_csv(
        f'/opt/ml/code/output/fi_lgbm_{FEEATURE_FILE_NAME}.csv')
    # plot_feature_importances(feature_importance)
    # plot_roc_curve(y, y_oof)

    return y_oof, test_preds
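The three boosting examples all finish with print_score(y, y_oof, WANDB_USE). A plausible sketch, assuming it thresholds the out-of-fold probabilities at 0.5 and optionally logs the metrics to Weights & Biases (threshold and metric choice are assumptions):

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def print_score(y_true, y_prob, wandb_use=False):
    # Hypothetical helper: summarise OOF predictions with a few metrics.
    y_pred = (y_prob >= 0.5).astype(int)  # assumed decision threshold
    metrics = {
        "oof_auc": roc_auc_score(y_true, y_prob),
        "oof_accuracy": accuracy_score(y_true, y_pred),
        "oof_f1": f1_score(y_true, y_pred),
    }
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")
    if wandb_use:
        wandb.log(metrics)  # assumes wandb has been imported and initialised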
Example #8
def xgboost(train, y, test, features, model_params, WANDB_USE, folds=10):
    if WANDB_USE:
        wandb.config.update(model_params)

    x_train = train[features]
    x_test = test[features]

    # array to hold the averaged test-set predictions
    test_preds = np.zeros(x_test.shape[0])

    # array to hold the out-of-fold (OOF) validation predictions
    y_oof = np.zeros(x_train.shape[0])

    # running mean of the per-fold validation scores
    score = 0

    # DataFrame to collect per-fold feature importances
    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    # declare the stratified k-fold splitter
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        # split the training data by train/validation indices
        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx,
                                                                 features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(
            f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}'
        )

        # declare the XGBoost DMatrices
        dtrain = xgb.DMatrix(x_tr, label=y_tr)
        dvalid = xgb.DMatrix(x_val, label=y_val)

        # train the XGBoost model
        clf = xgb.train(
            model_params,
            dtrain,
            num_boost_round=10000,  # maximum number of trees
            evals=[(dtrain, 'train'),
                   (dvalid, 'valid')],  # track validation performance
            verbose_eval=200,
            early_stopping_rounds=100)

        # predict on the validation split
        val_preds = clf.predict(dvalid)

        # store the predictions at the validation indices
        y_oof[val_idx] = val_preds

        # report the per-fold validation score
        print(f"Fold {fold + 1} | AUC: {roc_auc_score(y_val, val_preds)}")
        print('-' * 80)
        if WANDB_USE:
            wandb.log({"AUC": roc_auc_score(y_val, val_preds)})

        # accumulate the per-fold score into the running mean
        score += roc_auc_score(y_val, val_preds) / folds

        # predict on the test set and average over folds
        test_preds += clf.predict(xgb.DMatrix(x_test)) / folds

        # store this fold's feature importances
        fi_tmp = pd.DataFrame.from_records([clf.get_score()]).T.reset_index()
        fi_tmp.columns = ['feature', f'fold_{fold+1}']
        feature_importance = pd.merge(feature_importance, fi_tmp, on='feature')

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean AUC = {score}")  # 폴드별 평균 Validation 스코어 출력
    print(f"OOF AUC = {roc_auc_score(y, y_oof)}"
          )  # Out Of Fold Validation 스코어 출력

    # 평가 지표 출력 함수
    print_score(y, y_oof, WANDB_USE)

    # 폴드별 피처 중요도 평균값 계산해서 저장
    fi_cols = [col for col in feature_importance.columns if 'fold_' in col]
    feature_importance['importance'] = feature_importance[fi_cols].mean(axis=1)
    feature_importance.to_csv(
        f'/opt/ml/code/output/fi_xgb_{FEEATURE_FILE_NAME}.csv')

    return y_oof, test_preds
Example #9
def cat(train,
        y,
        test,
        features,
        model_params,
        WANDB_USE,
        categorical_features=None,
        folds=10):
    if WANDB_USE:
        wandb.config.update(model_params)

    x_train = train[features]
    x_test = test[features]

    # array to hold the averaged test-set predictions
    test_preds = np.zeros(x_test.shape[0])

    # array to hold the out-of-fold (OOF) validation predictions
    y_oof = np.zeros(x_train.shape[0])

    # running mean of the per-fold validation scores
    score = 0

    # DataFrame to collect per-fold feature importances
    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    # declare the stratified k-fold splitter
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        # split the training data by train/validation indices
        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx,
                                                                 features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(
            f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}'
        )

        # train the CatBoost model
        clf = CatBoostClassifier(**model_params)
        clf.fit(
            x_tr,
            y_tr,
            eval_set=(x_val, y_val),  # track validation performance
            cat_features=categorical_features,
            use_best_model=True,
            verbose=True)

        # predict on the validation split
        val_preds = clf.predict_proba(x_val)[:, 1]

        # store the predictions at the validation indices
        y_oof[val_idx] = val_preds

        # report the per-fold validation score
        print(f"Fold {fold + 1} | AUC: {roc_auc_score(y_val, val_preds)}")
        print('-' * 80)
        if WANDB_USE:
            wandb.log({"AUC": roc_auc_score(y_val, val_preds)})

        # accumulate the per-fold score into the running mean
        score += roc_auc_score(y_val, val_preds) / folds

        # predict on the test set and average over folds
        test_preds += clf.predict_proba(x_test)[:, 1] / folds

        # store this fold's feature importances
        feature_importance[f'fold_{fold+1}'] = clf.feature_importances_

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean AUC = {score}")  # 폴드별 평균 Validation 스코어 출력
    print(f"OOF AUC = {roc_auc_score(y, y_oof)}"
          )  # Out Of Fold Validation 스코어 출력

    # 평가 지표 출력 함수
    print_score(y, y_oof, WANDB_USE)

    # 폴드별 피처 중요도 평균값 계산해서 저장
    fi_cols = [col for col in feature_importance.columns if 'fold_' in col]
    feature_importance['importance'] = feature_importance[fi_cols].mean(axis=1)
    feature_importance.to_csv(
        f'/opt/ml/code/output/fi_cat_{FEEATURE_FILE_NAME}.csv')

    return y_oof, test_preds
Example #10
def make_cat_oof_prediction(train, y, test, features, categorical_features, folds=10):
    x_train = train[features]
    x_test = test[features]
    
    # array to hold the averaged test-set predictions
    test_preds = np.zeros(x_test.shape[0])

    # array to hold the out-of-fold (OOF) validation predictions
    y_oof = np.zeros(x_train.shape[0])

    # running mean of the per-fold validation scores
    score = 0

    # DataFrame to collect per-fold feature importances
    fi = pd.DataFrame()
    fi['feature'] = features

    # declare the stratified k-fold splitter
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=CFG.seed)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        # split the training data by train/validation indices
        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]
        
        print(f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}')
        
        # train the CatBoost model
        model = train_model(x_tr, y_tr, x_val, y_val, categorical_features)

        # predict on the validation split
        val_preds = np.array(model.predict_proba(x_val))[:, 1]

        # store the predictions at the validation indices
        y_oof[val_idx] = val_preds

        # report the per-fold validation score
        print(f"Fold {fold + 1}")
        print_score(y_val, val_preds)
        # print(f"parameters : \n{model.get_all_params()}")
        print('-'*80)

        # accumulate the per-fold score into the running mean
        score += roc_auc_score(y_val, val_preds) / folds

        # predict on the test set and average over folds
        test_preds += np.array(model.predict_proba(x_test))[:, 1] / folds

        # store this fold's feature importances
        fi[f'fold_{fold+1}'] = model.get_feature_importance()

        del x_tr, x_val, y_tr, y_val
        gc.collect()
        
    print(f"\nMean AUC = {score}") # 폴드별 Validation 스코어 출력
    print(f"OOF AUC = {roc_auc_score(y, y_oof)}") # Out Of Fold Validation 스코어 출력

    # ROC curve
    fpr, tpr, thresholds = roc_curve(y, y_oof)

    plt.plot(fpr, tpr, linewidth=2)
    plt.plot([0,1], [0,1], 'k--')
    plt.xlabel('FPR = 1 - TNR')
    plt.ylabel('TPR = Recall')
    plt.savefig(os.path.join(CFG.docs_path, 'ROC_curve_oof.png'))
        
    # average the per-fold feature importances
    fi_cols = [col for col in fi.columns if 'fold_' in col]
    fi['importance'] = fi[fi_cols].mean(axis=1)

    return y_oof, test_preds, fi
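A hypothetical invocation of the function above, just to show the expected inputs; the DataFrames and column names here are made up:

features = ['amount', 'hour', 'merchant']  # placeholder feature columns
categorical_features = ['merchant']        # placeholder categorical column
y_oof, test_preds, fi = make_cat_oof_prediction(
    train, train['label'], test, features, categorical_features, folds=5)
print(fi.sort_values('importance', ascending=False).head())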