def main(submit=0):
    X_all_df, y = prepare.Prepare_0(model=4).load(preproc=0, update=False)
    names = list(X_all_df.columns)
    #X_all_df,y = X_all_df.iloc[:500,:],y[:300]
    lentrain = len(y)
    Xtrain_df = X_all_df.iloc[:lentrain, :]

    # candidate estimators; clf1 (logistic regression) is the one actually used below
    clf1 = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=1,
                                 fit_intercept=True, intercept_scaling=1.0,
                                 class_weight=None, random_state=random_state)
    clf2 = RandomForestClassifier(n_estimators=200, max_depth=24, n_jobs=-1,
                                  random_state=random_state, verbose=0)
    clf3 = GradientBoostingClassifier(n_estimators=42, max_depth=24,
                                      random_state=random_state, verbose=2,
                                      subsample=0.9)
    clf4 = svm.SVC(probability=True)
    clf5 = KNeighborsClassifier(n_neighbors=5)
    clf6 = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
                         fit_intercept=True, l1_ratio=0.15,
                         learning_rate='optimal', loss='hinge', n_iter=50,
                         n_jobs=1, penalty='elasticnet', power_t=0.5,
                         random_state=random_state, rho=None, shuffle=False,
                         verbose=0, warm_start=False)
    clf = clf1
    if 0:  # disabled: wrap the estimator in recursive feature elimination
        clf = RFECVp(clf, clf, step=4, cv=4, scoring="roc_auc", verbose=2)

    # feature pipeline: raw DataFrame -> Transformer -> scaler -> estimator
    rd = Pipeline([
        ("trans", Transformer(names=names)),
        #("scaler", StandardScaler()),
        ("scaler", StandardScaler(with_mean=False)),
        #("selector", SelectPercentile(chi2, percentile=50)),
        #("selector", SelectPercentile(f_classif, percentile=50)),
        #("selector", lm.RandomizedLogisticRegression(C=1, random_state=random_state, verbose=1)),
        #("pca", PCA(n_components='mle')),
        #("pca", PCA(n_components=500)),
        #("svd", TruncatedSVD(n_components=50, random_state=random_state)),
        #("lasso", svm.LinearSVC(C=0.5, penalty="l1", dual=False)),
        ("est", clf),
    ])

    if not submit:
        cv_run(rd, Xtrain_df, y)
        return
    else:
        # train on the full training set and write the submission file
        print "Prepare submission.."
        print "training on full data"
        rd.fit(Xtrain_df, y)
        Xtest_df = X_all_df.iloc[lentrain:, :]
        pred = rd.predict_proba(Xtest_df)[:, 1]
        import submit
        submit.do_submit(pred)
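# The Transformer step in the pipeline above is defined elsewhere in this repo.
# The sketch below is only an assumption of what such a step could look like: a
# scikit-learn compatible transformer that turns the pandas DataFrame into a
# numeric matrix the downstream scaler/estimator can consume. The class name,
# the `names` parameter handling, and the body are hypothetical, not the
# project's actual code.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class TransformerSketch(BaseEstimator, TransformerMixin):
    def __init__(self, names=None):
        self.names = names  # optional subset of columns to keep

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn from the training data

    def transform(self, X):
        cols = self.names if self.names is not None else list(X.columns)
        return np.asarray(X[cols], dtype=float)  # DataFrame -> dense float matrix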
def main(submit=0):
    Xall_df, y = prepare.Prepare_0().load(preproc=0, update=False)
    #Xall_df,y = Xall_df.iloc[:500,:],y[:300]
    lentrain = len(y)
    Xtrain_df = Xall_df.iloc[:lentrain, :]

    # candidate estimators; clf1 (logistic regression) is the one actually used below
    clf1 = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=1,
                                 fit_intercept=True, intercept_scaling=1.0,
                                 class_weight=None, random_state=random_state)
    clf2 = RandomForestClassifier(n_estimators=200, max_depth=24, n_jobs=-1,
                                  random_state=random_state, verbose=0)
    clf3 = GradientBoostingClassifier(n_estimators=42, max_depth=24,
                                      random_state=random_state, verbose=2,
                                      subsample=0.9)
    clf4 = svm.SVC(probability=True)
    clf5 = KNeighborsClassifier(n_neighbors=5)
    clf6 = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
                         fit_intercept=True, l1_ratio=0.15,
                         learning_rate='optimal', loss='hinge', n_iter=50,
                         n_jobs=1, penalty='elasticnet', power_t=0.5,
                         random_state=random_state, rho=None, shuffle=False,
                         verbose=0, warm_start=False)
    clf = clf1
    if 0:  # disabled: fit recursive feature elimination on the transformed training data
        selector = RFECVp(clf, clf, step=10, cv=4, scoring="roc_auc", verbose=2)
        selector = selector.fit(Transformer().fit_transform(Xtrain_df, y), y)
        clf = selector

    # feature pipeline: raw DataFrame -> Transformer -> estimator
    rd = Pipeline([
        ("trans", Transformer()),
        #("selector", SelectPercentile(chi2, percentile=90)),
        #("selector", SelectPercentile(f_classif, percentile=50)),
        #("selector", lm.RandomizedLogisticRegression(C=1, random_state=random_state, verbose=1)),
        #("pca", PCA(n_components='mle')),
        #("pca", PCA(n_components=500)),
        #("svd", TruncatedSVD(n_components=200, random_state=random_state)),
        #("lasso", svm.LinearSVC(C=0.5, penalty="l1", dual=False)),
        ("est", clf),
    ])

    if not submit:
        cv_run(rd, Xtrain_df, y)
        return
    else:
        # train on the full training set and write the submission file
        print "Prepare submission.."
        print "training on full data"
        rd.fit(Xtrain_df, y)
        Xtest_df = Xall_df.iloc[lentrain:, :]
        pred = rd.predict_proba(Xtest_df)[:, 1]
        import submit
        submit.do_submit(pred)
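# cv_run() is imported from elsewhere in the repo; the sketch below is only an
# assumption of what it does, judging from the call sites above: stratified K-fold
# cross-validation that fits a clone of the estimator per fold, scores with ROC AUC,
# and returns the out-of-fold predictions plus the per-fold scores. The helper name,
# defaults, and fold logic are assumptions (the real helper also accepts n_iter,
# n_jobs and random_state), written against the same-era scikit-learn API
# (sklearn.cross_validation) that the rest of this code targets.
def cv_run_sketch(est, X, y, n_folds=4):
    import numpy as np
    from sklearn.base import clone
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.metrics import roc_auc_score

    y = np.asarray(y)
    y_pred = np.zeros(len(y), dtype=float)
    scores = []
    for tr, te in StratifiedKFold(y, n_folds=n_folds):
        # DataFrames need positional .iloc, plain arrays use plain indexing
        X_tr = X.iloc[tr] if hasattr(X, 'iloc') else X[tr]
        X_te = X.iloc[te] if hasattr(X, 'iloc') else X[te]
        model = clone(est).fit(X_tr, y[tr])
        y_pred[te] = model.predict_proba(X_te)[:, 1]
        scores.append(roc_auc_score(y[te], y_pred[te]))
    return y_pred, scores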
def main(n_iter, n_folds, smodels, n_jobs=None, stack=0, use_vote=0, gnrl='KNC',
         modsel=0, rfe=0, psearch=0, starter=0, verbose=0, submit=0):
    y, colnames, n_train, n_test, n_all = prepare.Prepare_0().load_y_colnames()
    X_all = np.arange(n_all)

    # instantiate the requested models from the '+'-separated spec, e.g. "2+12+10"
    models = []
    for m in smodels.split('+'):
        models.append(eval('Model%02d()' % int(m)))
    #models = (Model02(),Model12(),Model10(),)  # ***
    logger.debug("models:%s", models)

    X = X_all[:n_train]
    logger.info('Find params for models')
    for model in models:
        model.set_params(**find_params(model, X, y, scoring='roc_auc',
                                       n_iter=n_iter, n_jobs=n_jobs,
                                       random_state=random_state + 1,
                                       psearch=psearch))

    rd = ModelStack(models, gnrl=gnrl, stack=stack, use_vote=use_vote,
                    modsel=modsel, rfe=rfe)
    if starter:
        logger.info('Starters start')
        rd.starter()

    if psearch > 1 and len(models) == 1:
        # update the current model's best score after a parameter search
        y_pred, scores = cv_run(rd, X, y, n_folds=n_folds, n_iter=n_iter,
                                n_jobs=n_jobs, random_state=random_state + 2)
        update_params_best_score(models[0], np.mean(scores))
        return
    elif not submit:
        logger.debug('Cross validation starts')
        y_pred, scores = cv_run(rd, X, y, n_folds=n_folds, n_iter=n_iter,
                                n_jobs=n_jobs, random_state=random_state)
        prepare.Prepare_0().dump_ypred_residuals(y, y_pred)
        if verbose > 1:
            plot_errors(X, y, y_pred)
        if stack:
            logger.info("Mean Coefs: %s", rd.mean_coefs())
        return
    else:
        logger.info("Prepare submission..")
        logger.info("training on full data")
        rd.fit(X_all[:n_train], y)
        Xtest = X_all[n_train:]
        pred = rd.predict_proba(Xtest)[:, 1]
        import submit
        submit.do_submit(pred)
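# find_params() and update_params_best_score() live elsewhere in the repo. The
# sketch below shows one plausible shape for find_params(): a thin wrapper around
# RandomizedSearchCV (sklearn.grid_search in this scikit-learn era) that returns
# the best parameter dict so the caller can apply it with set_params(**...). The
# `param_distributions` attribute on the model, the psearch flag semantics, and
# the assumption that the Model classes follow the scikit-learn estimator API are
# all hypothetical, not the project's actual contract.
def find_params_sketch(model, X, y, scoring='roc_auc', n_iter=10, n_jobs=1,
                       random_state=None, psearch=0):
    from sklearn.grid_search import RandomizedSearchCV

    if not psearch:
        # assumed: no search requested, keep the model's current parameters
        return model.get_params()
    search = RandomizedSearchCV(model, model.param_distributions,  # hypothetical attribute
                                n_iter=n_iter, scoring=scoring, cv=4,
                                n_jobs=n_jobs, random_state=random_state)
    search.fit(X, y)
    return search.best_params_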
def main(submit=0):
    y, colnames, n_train, n_test, n_all = prepare.Prepare_0().load_y_colnames()
    X_all = np.arange(n_all)
    X = X_all[:n_train]

    rd = Model14()
    rd.starter()

    if not submit:
        cv_run(rd, X, y)
        return
    else:
        print "Prepare submission.."
        print "training on full data"
        rd.fit(X_all[:n_train], y)
        Xtest = X_all[n_train:]
        pred = rd.predict_proba(Xtest)[:, 1]
        import submit
        submit.do_submit(pred)
def main(submit=0):
    y, colnames, n_train, n_test, n_all = prepare.Prepare_0().load_y_colnames()
    X_all = np.arange(n_all)
    X = X_all[:n_train]

    rd = Model09()
    rd.starter()

    if not submit:
        cv_run(rd, X, y)
        return
    else:
        print "Prepare submission.."
        print "training on full data"
        rd.fit(X_all[:n_train], y)
        Xtest = X_all[n_train:]
        pred = rd.predict_proba(Xtest)[:, 1]
        import submit
        submit.do_submit(pred)
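# Hypothetical driver for the simple main(submit=...) variants above: run
# cross-validation by default, or pass --submit 1 to train on the full data and
# write a submission. The argparse wiring is illustrative, not from the repo.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--submit', type=int, default=0)
    main(submit=parser.parse_args().submit)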