probability=True, gamma=0.646508 + .3, random_state=23)) if MODE == 'cv': scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=True) print 'CV:', scores, 'Mean log loss:', np.mean(scores) utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) elif MODE == 'submission': calibrated_classifier = CalibratedClassifierCV(clf, method='isotonic', cv=utils.get_cv(labels)) fitted_classifier = calibrated_classifier.fit(train, labels) predictions = fitted_classifier.predict_proba(test) utils.save_submission( consts.DATA_SAMPLE_SUBMISSION_PATH, os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), predictions) elif MODE == 'holdout': score = utils.hold_out_evaluation(clf, train, labels, calibrate=False, test_size=0.9) print 'Log loss:', score elif MODE == 'tune': train, labels, valid, valid_labels = utils.stratified_split(train, labels,
# Final stage of the script: reduce the test matrix with the feature
# selector, encode the labels, build an ExtraTrees classifier and run the
# action selected by MODE ('cv', 'submission' or 'holdout').
#
# All console output goes through single-argument print(...) calls so the
# script parses on Python 3 while printing exactly the same text as the
# former Python 2 print statements (items converted with str() and joined by
# single spaces).

# NOTE(review): feat_selector is created outside this excerpt -- presumably
# fitted on `train` earlier in the script; confirm.
test = feat_selector.transform(test)
print(train.shape)

# encode labels
lbl_enc = preprocessing.LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# train classifier
clf = ensemble.ExtraTreesClassifier(
    n_jobs=3,
    n_estimators=600,
    max_features=20,
    min_samples_split=3,
    bootstrap=False,
    verbose=3,
    random_state=23,
)

if MODE == 'cv':
    # Cross-validate with calibration enabled and store the predictions
    # under the blend path for later use.
    scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=True)
    print(' '.join(['CV:', str(scores), 'Mean log loss:', str(np.mean(scores))]))
    utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions)
elif MODE == 'submission':
    # Wrap the classifier in an isotonic calibrator, fit on the full training
    # set and write class probabilities for the test set.
    calibrated_classifier = CalibratedClassifierCV(clf, method='isotonic',
                                                   cv=utils.get_cv(labels))
    fitted_classifier = calibrated_classifier.fit(train, labels)
    predictions = fitted_classifier.predict_proba(test)
    utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH,
                          os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'),
                          predictions)
elif MODE == 'holdout':
    # Single hold-out evaluation with calibration enabled.
    score = utils.hold_out_evaluation(clf, train, labels, calibrate=True)
    print(' '.join(['Log loss:', str(score)]))
else:
    # Deliberately non-fatal: an unrecognised MODE is only reported.
    print('Unknown mode')
# Final stage of the script: re-weight the count features with TF-IDF, encode
# the labels, build an ExtraTrees classifier and run the action selected by
# MODE ('cv', 'submission' or 'holdout').

# transform counts to TFIDF features
# The transformer is fitted on train only and then applied to test; both
# results are converted to dense arrays via .toarray().
# NOTE(review): smooth_idf=False disables the +1 document-frequency
# smoothing -- confirm every feature occurs at least once in train.
tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
train = tfidf.fit_transform(train).toarray()
test = tfidf.transform(test).toarray()

# encode labels
lbl_enc = preprocessing.LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# train classifier
clf = ensemble.ExtraTreesClassifier(
    n_jobs=5,
    n_estimators=600,
    max_features=20,
    min_samples_split=3,
    bootstrap=False,
    verbose=3,
    random_state=23,
)

if MODE == 'cv':
    # Cross-validate with calibration enabled, report the scores and store
    # the predictions under the blend path.
    scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=True)
    mean_score = np.mean(scores)
    print('CV:', scores, 'Mean log loss:', mean_score)
    utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions)
elif MODE == 'submission':
    # Isotonic calibration around the classifier, fitted on the full training
    # set; class probabilities for the test set are written as a submission.
    cv_splitter = utils.get_cv(labels)
    calibrated_classifier = CalibratedClassifierCV(clf, method='isotonic', cv=cv_splitter)
    fitted_classifier = calibrated_classifier.fit(train, labels)
    predictions = fitted_classifier.predict_proba(test)
    submission_path = os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv')
    utils.save_submission(
        consts.DATA_SAMPLE_SUBMISSION_PATH,
        submission_path,
        predictions,
    )
elif MODE == 'holdout':
    # Single hold-out evaluation with calibration enabled.
    score = utils.hold_out_evaluation(clf, train, labels, calibrate=True)
    print('Log loss:', score)
else:
    # An unrecognised MODE is only reported, not treated as an error.
    print('Unknown mode')