def objective(args):
    """Score one XGBoost hyper-parameter combination for the tuner.

    ``args`` is unpacked into exactly five values, in this order:
    ``max_depth``, ``min_child_weight``, ``row_subsample``,
    ``min_loss_reduction`` and ``column_subsample``.  Each one is forwarded
    to ``XGBoost`` under the keyword of the same name.

    ``train`` and ``labels`` are read from the enclosing scope.

    Returns the value produced by ``utils.hold_out_evaluation`` for the
    freshly built classifier (labelled ``logloss`` in the printed header).
    """
    (max_depth, min_child_weight, row_subsample,
     min_loss_reduction, column_subsample) = args
    clf = XGBoost(max_depth=max_depth,
                  min_child_weight=min_child_weight,
                  row_subsample=row_subsample,
                  min_loss_reduction=min_loss_reduction,
                  column_subsample=column_subsample,
                  verbose=False)
    score = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
    # The print statement is a syntax error on Python 3.  Calling print with
    # ONE pre-formatted string yields the same text on Python 2 and 3
    # (a multi-argument call would print a tuple repr on Python 2).
    print('max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample, logloss')
    print('%s %s' % (args, score))
    return score
def objective(args):
    """Objective for the hyper-parameter search over XGBoost settings.

    ``args`` must unpack into five values: ``max_depth``,
    ``min_child_weight``, ``row_subsample``, ``min_loss_reduction`` and
    ``column_subsample`` (in that order).  A classifier is built from them
    and handed to ``utils.hold_out_evaluation`` together with ``train`` and
    ``labels`` from the enclosing scope; the resulting score is printed and
    returned.
    """
    depth, child_weight, rows, loss_reduction, columns = args
    params = {
        'max_depth': depth,
        'min_child_weight': child_weight,
        'row_subsample': rows,
        'min_loss_reduction': loss_reduction,
        'column_subsample': columns,
    }
    model = XGBoost(verbose=False, **params)
    logloss = utils.hold_out_evaluation(model, train, labels, calibrate=False)

    # Header line followed by the evaluated point and its score.
    print('max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample, logloss')
    print(args, logloss)
    return logloss
def objective(args):
    """Score one ``(C, gamma)`` pair of an RBF SVC for the tuner.

    ``args`` is unpacked into ``c`` and ``gamma``, which configure an
    ``svm.SVC`` wrapped in ``OneVsRestClassifier``.  ``train``, ``labels``,
    ``valid`` and ``valid_labels`` are read from the enclosing scope.

    Returns ``log_loss(valid_labels, clf.predict_proba(valid))``.
    """
    c, gamma = args
    clf = OneVsRestClassifier(svm.SVC(C=c, kernel='rbf', tol=.001,
                                      gamma=gamma, probability=True,
                                      random_state=23))
    # Constant placeholder; kept so the log line keeps its existing columns.
    score1 = 0
    # NOTE(review): predict_proba below needs a fitted estimator, so this
    # call presumably fits ``clf`` in place -- confirm against utils.
    score2 = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
    score = log_loss(valid_labels, clf.predict_proba(valid))
    # Parenthesised so the line is valid on Python 3; with a single
    # pre-formatted string the output is unchanged on Python 2.
    print('C=%f, gamma=%f, score1=%f, score2=%f, score=%f'
          % (c, gamma, score1, score2, score))
    return score
def objective(args):
    """Evaluate an RBF SVC for the given ``(C, gamma)`` search point.

    ``args`` unpacks into ``c`` and ``gamma``.  The classifier is an
    ``svm.SVC`` inside ``OneVsRestClassifier``; ``train``, ``labels``,
    ``valid`` and ``valid_labels`` come from the enclosing scope.

    Returns the log loss of ``clf.predict_proba(valid)`` against
    ``valid_labels``; the hold-out score is only logged.
    """
    c, gamma = args
    clf = OneVsRestClassifier(
        svm.SVC(C=c, kernel='rbf', tol=.001, gamma=gamma,
                probability=True, random_state=23))
    # Always zero; retained because the log format below includes it.
    score1 = 0
    # NOTE(review): the later predict_proba call assumes this helper leaves
    # ``clf`` fitted -- verify in utils.hold_out_evaluation.
    score2 = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
    score = log_loss(valid_labels, clf.predict_proba(valid))
    # The Python 2 print statement does not compile on Python 3; a call with
    # one formatted string prints the same text on both versions.
    print('C=%f, gamma=%f, score1=%f, score2=%f, score=%f' % (
        c, gamma, score1, score2, score))
    return score
row_subsample=.9134478530382129, min_loss_reduction=.5132278416508804, column_subsample=.730128689911957, step_size=.009) if MODE == 'cv': scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) print ('CV:', scores, 'Mean log loss:', np.mean(scores)) utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) elif MODE == 'submission': clf.fit(train, labels) predictions = clf.predict_proba(test) utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), predictions) elif MODE == 'holdout': train, labels, _, _ = utils.stratified_split(train, labels, test_size=.7) score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) print ('Log loss:', score) elif MODE == 'tune': # Objective function def objective(args): max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample = args clf = XGBoost(max_depth=max_depth, min_child_weight=min_child_weight, row_subsample=row_subsample, min_loss_reduction=min_loss_reduction, column_subsample=column_subsample, verbose=False) score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) print ('max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample, logloss') print (args, score) return score # Searching space space = ( hp.quniform('max_depth', 2, 14, 1),
train, labels, test, _, _ = utils.load_data()

# Transform counts to TF-IDF features; the transformer is fitted on the
# training matrix only and then applied unchanged to the test matrix.
tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
train = tfidf.fit_transform(train).toarray()
test = tfidf.transform(test).toarray()

# Encode labels.
lbl_enc = preprocessing.LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# Classifier configuration, kept in one place for readability.
extra_trees_params = dict(
    n_jobs=4,
    n_estimators=2000,
    max_features=20,
    min_samples_split=3,
    bootstrap=False,
    verbose=3,
    random_state=23,
)
clf = ensemble.ExtraTreesClassifier(**extra_trees_params)

if MODE == 'cv':
    # Cross-validated predictions are stored for the blending stage.
    scores, predictions = utils.make_blender_cv(clf, train, labels,
                                                calibrate=False)
    print('CV:', scores, 'Mean log loss:', np.mean(scores))
    utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv',
                             predictions)
elif MODE == 'submission':
    clf.fit(train, labels)
    predictions = clf.predict_proba(test)
    submission_file = os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv')
    utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH,
                          submission_file,
                          predictions)
elif MODE == 'holdout':
    score = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
    print('Log loss:', score)
else:
    print('Unknown mode')
print(train.shape)

# Encode labels.
lbl_enc = preprocessing.LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# Train classifier: a bag of 40 L1-penalised logistic regressions.
# The solver is pinned to 'liblinear' because the L1 penalty is not supported
# by 'lbfgs', which became LogisticRegression's default solver in
# scikit-learn 0.22; 'liblinear' was the default before that, so older
# installations behave exactly as they did.
linear_clf = linear_model.LogisticRegression(C=1, penalty='l1',
                                             solver='liblinear',
                                             fit_intercept=True,
                                             random_state=23)
# NOTE(review): recent scikit-learn releases rename ``base_estimator`` to
# ``estimator`` -- confirm which version this project targets.
clf = ensemble.BaggingClassifier(base_estimator=linear_clf, n_estimators=40,
                                 max_samples=1., max_features=1.,
                                 bootstrap=True, n_jobs=5, verbose=True,
                                 random_state=23)

if MODE == 'cv':
    # Cross-validated predictions are written out for the blending stage.
    scores, predictions = utils.make_blender_cv(clf, train, labels)
    print('CV:', scores, 'Mean log loss:', np.mean(scores))
    utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv',
                             predictions)
elif MODE == 'submission':
    clf.fit(train, labels)
    predictions = clf.predict_proba(test)
    utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH,
                          os.path.join(consts.ENSEMBLE_PATH,
                                       MODEL_NAME + '.csv'),
                          predictions)
elif MODE == 'holdout':
    score = utils.hold_out_evaluation(clf, train, labels)
    print('Log loss:', score)
else:
    print('Unknown mode')
print 'CV:', scores, 'Mean log loss:', np.mean(scores) utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) elif MODE == 'submission': calibrated_classifier = CalibratedClassifierCV(clf, method='isotonic', cv=utils.get_cv(labels)) fitted_classifier = calibrated_classifier.fit(train, labels) predictions = fitted_classifier.predict_proba(test) utils.save_submission( consts.DATA_SAMPLE_SUBMISSION_PATH, os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), predictions) elif MODE == 'holdout': score = utils.hold_out_evaluation(clf, train, labels, calibrate=False, test_size=0.9) print 'Log loss:', score elif MODE == 'tune': train, labels, valid, valid_labels = utils.stratified_split(train, labels, test_size=.8) from sklearn.metrics import log_loss # Objective function def objective(args): c, gamma = args clf = OneVsRestClassifier( svm.SVC(C=c, kernel='rbf',
# One-vs-rest RBF SVC with probability estimates enabled.
clf = OneVsRestClassifier(svm.SVC(C=4.919646+2., kernel='rbf', tol=.001,
                                  verbose=True, probability=True,
                                  gamma=0.646508+.3, random_state=23))

# All print calls below pass ONE pre-formatted string: the former print
# statements do not compile on Python 3, and a single-string call emits the
# same text on Python 2 and Python 3.
if MODE == 'cv':
    scores, predictions = utils.make_blender_cv(clf, train, labels,
                                                calibrate=True)
    print('CV: %s Mean log loss: %s' % (scores, np.mean(scores)))
    utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv',
                             predictions)
elif MODE == 'submission':
    # Wrap the SVC in isotonic calibration before predicting on the test set.
    calibrated_classifier = CalibratedClassifierCV(clf, method='isotonic',
                                                   cv=utils.get_cv(labels))
    fitted_classifier = calibrated_classifier.fit(train, labels)
    predictions = fitted_classifier.predict_proba(test)
    utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH,
                          os.path.join(consts.ENSEMBLE_PATH,
                                       MODEL_NAME + '.csv'),
                          predictions)
elif MODE == 'holdout':
    score = utils.hold_out_evaluation(clf, train, labels, calibrate=False,
                                      test_size=0.9)
    # One-element tuple so a tuple-valued score is not misread as format args.
    print('Log loss: %s' % (score,))
elif MODE == 'tune':
    # Rebinds train/labels to the first two pieces returned by the split and
    # keeps the other two as the validation set used by the objective.
    train, labels, valid, valid_labels = utils.stratified_split(
        train, labels, test_size=.8)
    from sklearn.metrics import log_loss

    # Objective function
    def objective(args):
        """Return the validation log loss of an RBF SVC for ``(C, gamma)``.

        ``args`` unpacks into ``c`` and ``gamma``.  ``train``, ``labels``,
        ``valid`` and ``valid_labels`` are read from the enclosing scope.
        """
        c, gamma = args
        clf = OneVsRestClassifier(svm.SVC(C=c, kernel='rbf', tol=.001,
                                          gamma=gamma, probability=True,
                                          random_state=23))
        # Constant placeholder; kept so the log line format is unchanged.
        score1 = 0
        # NOTE(review): predict_proba below needs a fitted estimator, so this
        # helper presumably fits ``clf`` in place -- confirm against utils.
        score2 = utils.hold_out_evaluation(clf, train, labels,
                                           calibrate=False)
        score = log_loss(valid_labels, clf.predict_proba(valid))
        print('C=%f, gamma=%f, score1=%f, score2=%f, score=%f'
              % (c, gamma, score1, score2, score))
        return score
    # Searching space