def main(n_iter, n_folds, smodels, n_jobs=None, stack=0, use_vote=0,
         gnrl='KNC', modsel=0, rfe=0, psearch=0, starter=0, verbose=0,
         submit=0):
    y, colnames, n_train, n_test, n_all = prepare.Prepare_0().load_y_colnames()
    X_all = np.arange(n_all)

    # build the requested models, e.g. smodels='2+12+10' -> Model02, Model12, Model10
    models = []
    for m in smodels.split('+'):
        models.append(eval('Model%02d()' % int(m)))
    #models = (Model02(), Model12(), Model10(),)  # ***
    logger.debug("models:%s", models)

    X = X_all[:n_train]
    logger.info('Find params for models')
    for model in models:
        model.set_params(**find_params(model, X, y, scoring='roc_auc',
                                       n_iter=n_iter, n_jobs=n_jobs,
                                       random_state=random_state + 1,
                                       psearch=psearch))

    rd = ModelStack(models, gnrl=gnrl, stack=stack, use_vote=use_vote,
                    modsel=modsel, rfe=rfe)
    if starter:
        logger.info('Starters start')
        rd.starter()

    if psearch > 1 and len(models) == 1:
        # update the current model's best score
        y_pred, scores = cv_run(rd, X, y, n_folds=n_folds, n_iter=n_iter,
                                n_jobs=n_jobs, random_state=random_state + 2)
        update_params_best_score(models[0], np.mean(scores))
        return
    elif not submit:
        logger.debug('Cross validation starts')
        y_pred, scores = cv_run(rd, X, y, n_folds=n_folds, n_iter=n_iter,
                                n_jobs=n_jobs, random_state=random_state)
        prepare.Prepare_0().dump_ypred_residuals(y, y_pred)
        if verbose > 1:
            plot_errors(X, y, y_pred)
        if stack:
            logger.info("Mean Coefs: %s", rd.mean_coefs())
        return
    else:
        logger.info("Prepare submission..")
        logger.info("training on full data")
        rd.fit(X_all[:n_train], y)
        Xtest = X_all[n_train:]
        pred = rd.predict_proba(Xtest)[:, 1]
        import submit
        submit.do_submit(pred)
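# cv_run() is used throughout this module but defined elsewhere in the repo.
# A minimal sketch of the contract main() relies on -- out-of-fold predictions
# plus per-fold ROC AUC scores -- assuming X is an index array as built above
# and a sklearn that provides roc_auc_score; the real implementation may
# differ (n_iter/n_jobs are accepted only for signature compatibility here).
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score

def cv_run_sketch(rd, X, y, n_folds=4, n_iter=None, n_jobs=None, random_state=None):
    y = np.asarray(y)
    y_pred = np.zeros(len(y))
    scores = []
    for itrain, itest in StratifiedKFold(y, n_folds=n_folds):
        rd.fit(X[itrain], y[itrain])
        p = rd.predict_proba(X[itest])[:, 1]
        y_pred[itest] = p
        scores.append(roc_auc_score(y[itest], p))
    logger.info("AUC: %.4f +- %.4f", np.mean(scores), np.std(scores))
    return y_pred, scores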
def fit(self, Xmask, y):
    pr = prepare.Prepare_0(model=13, n_components=128, min_df=3, preproc=0,
                           use_svd=True, tfidf=2, stemmer=0)
    (X_all_df, _, BP, params) = pr.load_transform(update=False)
    names = list(X_all_df.columns)
    X_all = np.asarray(X_all_df)
    self.X_all, self.names = X_all, names
    clf = lm.LogisticRegression(penalty='l1', dual=False, tol=0.00001, C=0.05,
                                fit_intercept=True, intercept_scaling=1.0,
                                class_weight=None, random_state=random_state)
    self.rd = Pipeline([
        ("trans", Transformer(names=self.names, X_all=X_all, BP=BP)),
        #("scaler", StandardScaler(with_mean=False)),
        ("scaler", StandardScaler(with_mean=True)),
        ("est", clf),
    ])
    self.rd.fit(Xmask, np.asarray(y))
    return self
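# Note on the pipeline contract used by the Model*.fit() methods in this
# module: the pipelines are fit on row *indices* (Xmask), not on feature
# matrices, and the "trans" step maps those indices onto rows of the
# precomputed X_all. The real Transformer lives elsewhere in this repo (it
# also receives names and BP); a minimal stand-in consistent with these call
# sites, offered as an assumption only:
from sklearn.base import BaseEstimator, TransformerMixin

class TransformerSketch(BaseEstimator, TransformerMixin):
    def __init__(self, names=None, X_all=None, BP=None):
        self.names, self.X_all, self.BP = names, X_all, BP

    def fit(self, Xmask, y=None):
        return self

    def transform(self, Xmask):
        # select the masked rows of the precomputed feature matrix
        return self.X_all[np.asarray(Xmask)]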
def fit(self, Xmask, y):
    pr = prepare.Prepare_0(model=10, preproc=1, min_df=1, use_svd=False,
                           tfidf=2, stemmer=0)
    (X_all_df, _, BP, params) = pr.load_transform(update=False)
    names = list(X_all_df.columns)
    X_all = np.asarray(X_all_df)
    self.X_all, self.names = X_all, names

    # candidate Naive Bayes variants; MultinomialNB is the one actually used
    clf0 = GaussianNB()
    clf1 = MultinomialNB(alpha=0.8)
    clf2 = BernoulliNB(alpha=1, binarize=0.01)
    clf = clf1

    self.rd = Pipeline([
        ("trans", Transformer(names=self.names, X_all=X_all, BP=BP)),
        #("scaler", StandardScaler(with_mean=False)),
        ("est", clf),
    ])
    self.rd.fit(Xmask, np.asarray(y))
    return self
def _get_featureset(self):
    return prepare.Prepare_0(model=14, n_components=512, preproc=1, min_df=1,
                             use_svd=True, tfidf=2, stemmer=0)
def main(submit=0):
    Xall_df, y = prepare.Prepare_0().load(preproc=0, update=False)
    #Xall_df, y = Xall_df.iloc[:500, :], y[:300]
    lentrain = len(y)
    Xtrain_df = Xall_df.iloc[:lentrain, :]

    # candidate estimators; LogisticRegression is the one actually used
    clf1 = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=1,
                                 fit_intercept=True, intercept_scaling=1.0,
                                 class_weight=None, random_state=random_state)
    clf2 = RandomForestClassifier(n_estimators=200, max_depth=24, n_jobs=-1,
                                  random_state=random_state, verbose=0)
    clf3 = GradientBoostingClassifier(n_estimators=42, max_depth=24,
                                      random_state=random_state, verbose=2,
                                      subsample=0.9)
    clf4 = svm.SVC(probability=True)
    clf5 = KNeighborsClassifier(n_neighbors=5)
    # note: hinge loss has no predict_proba, so clf6 only suits cv on decision values
    clf6 = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1,
                         eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                         learning_rate='optimal', loss='hinge', n_iter=50,
                         n_jobs=1, penalty='elasticnet', power_t=0.5,
                         random_state=random_state, rho=None, shuffle=False,
                         verbose=0, warm_start=False)
    clf = clf1

    if 0:  # disabled: recursive feature elimination over the transformed features
        selector = RFECVp(clf, clf, step=10, cv=4, scoring="roc_auc", verbose=2)
        selector = selector.fit(Transformer().fit_transform(Xtrain_df, y), y)
        clf = selector

    rd = Pipeline([
        ("trans", Transformer()),
        #("selector", SelectPercentile(chi2, percentile=90)),
        #("selector", SelectPercentile(f_classif, percentile=50)),
        #("selector", lm.RandomizedLogisticRegression(C=1, random_state=random_state, verbose=1)),
        #("pca", PCA(n_components='mle')),
        #("pca", PCA(n_components=500)),
        #("svd", TruncatedSVD(n_components=200, random_state=random_state)),
        #("lasso", svm.LinearSVC(C=0.5, penalty="l1", dual=False)),
        ("est", clf),
    ])

    if not submit:
        cv_run(rd, Xtrain_df, y)
        return
    else:
        print "Prepare submission.."
        print "training on full data"
        rd.fit(Xtrain_df, y)
        Xtest_df = Xall_df.iloc[lentrain:, :]
        pred = rd.predict_proba(Xtest_df)[:, 1]
        import submit
        submit.do_submit(pred)
def fit(self, Xmask, y):
    pr = prepare.Prepare_0(model=14, n_components=512, preproc=1, min_df=1,
                           use_svd=True, tfidf=2, stemmer=0)
    (X_all_df, _, BP, params) = pr.load_transform(update=False)
    names = list(X_all_df.columns)
    X_all = np.asarray(X_all_df)
    self.X_all, self.names = X_all, names

    clf1 = lm.LogisticRegression(penalty='l2', dual=True, tol=0.00001, C=1,
                                 fit_intercept=True, intercept_scaling=1.0,
                                 class_weight=None, random_state=random_state)

    class LassoCV_proba(lm.LassoCV):
        # expose predict_proba so the regressor can stand in for a classifier
        def predict_proba(self, X):
            print 'alpha_:', self.alpha_
            y = self.predict(X)
            y = 1. / (1 + np.exp(-(y - 0.5)))
            return np.vstack((1 - y, y)).T

    class RidgeCV_proba(lm.RidgeCV):
        def predict_proba(self, X):
            print 'alpha_:', self.alpha_
            y = self.predict(X)
            if 0:  # disabled: min-max scaling instead of the logistic squash
                y_min, y_max = y.min(), y.max()
                if y_max > y_min:
                    y = (y - y_min) / (y_max - y_min)
            else:
                y = 1. / (1 + np.exp(-(y - 0.5)))
            return np.vstack((1 - y, y)).T

    clf2 = RidgeCV_proba(alphas=np.linspace(0, 10), cv=4)
    clf3 = LassoCV_proba(alphas=None, cv=4)
    clf4 = svm.SVR(C=3, kernel='linear')
    clf = clf1

    self.rd = Pipeline([
        ("trans", Transformer(names=self.names, X_all=X_all, BP=BP)),
        #("scaler", StandardScaler(with_mean=False)),
        #("filter", lm.LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=random_state)),
        ("est", clf),
    ])
    self.rd.fit(Xmask, np.asarray(y))
    return self
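# The *_proba wrappers above squash a regressor's real-valued output through a
# logistic centered at 0.5, p = 1/(1 + exp(-(y - 0.5))), so a regressor can
# stand in wherever the stack expects predict_proba. The mapping is monotonic,
# so ranking metrics such as ROC AUC are unaffected. Quick worked example:
def _squash_example():
    y_raw = np.array([-1.0, 0.5, 2.0])
    return 1. / (1 + np.exp(-(y_raw - 0.5)))  # -> approx [0.182, 0.5, 0.818]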
def test():
    y, colnames, n_train, n_test, n_all = prepare.Prepare_0().load_y_colnames()
    X_all = np.arange(n_all)
    X = X_all[:200]
    y = y[:200]
    rd = Model09()
    rd.starter()
    cv_run(rd, X, y)
    print "tests ok"
def get_model04_data():
    fname = '../data/model04_data'
    print "get %s" % fname
    try:
        # load-or-compute cache: reuse the feature matrix if it was dumped before
        (X_all, names) = joblib.load(fname)
    except Exception:
        X_all_df, y = prepare.Prepare_0().load(preproc=0, update=False)
        names = list(X_all_df.columns)
        X_all = np.asarray(X_all_df)
        joblib.dump((X_all, names), fname)
    return X_all, names
def test():
    Xall_df, y = prepare.Prepare_0().load()
    Xall_df, y = Xall_df.iloc[:400, :], y[:200]
    lentrain = len(y)
    clf1 = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=1,
                                 fit_intercept=True, intercept_scaling=1.0,
                                 class_weight=None, random_state=random_state)
    clf5 = KNeighborsClassifier(n_neighbors=5)
    clf = clf5
    rd = Pipeline([
        ("trans", Transformer()),
        ("est", clf),
    ])
    cv_run(rd, Xall_df.iloc[:lentrain, :], y)
    print "tests ok"
def _get_featureset(self):
    extra_par = dict(ngram_max=self.ngram_max, max_df=self.max_df,
                     binary=self.binary, max_features=self.max_features,
                     use_idf=self.use_idf, smooth_idf=self.smooth_idf,
                     sublinear_tf=self.sublinear_tf, norm=self.norm,
                     token_min=self.token_min,
                     do_remove_stopwords=self.do_remove_stopwords)
    extra_js = json.dumps(extra_par)
    return prepare.Prepare_0(n_components=self.n_components,
                             preproc=self.preproc, min_df=self.min_df,
                             use_svd=self.use_svd, tfidf=self.tfidf,
                             stemmer=self.stemmer, fit_area=self.fit_area,
                             extra=extra_js)
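# Assumption: extra_js above is folded by Prepare_0 into its feature-cache
# key. json.dumps() does not sort dict keys, so logically identical parameter
# sets can serialize differently and miss the cache. A stable serialization,
# shown as a sketch only (Prepare_0 may already handle this internally):
def _stable_extra_key(extra_par):
    return json.dumps(extra_par, sort_keys=True)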
def main(submit=0):
    y, colnames, n_train, n_test, n_all = prepare.Prepare_0().load_y_colnames()
    X_all = np.arange(n_all)
    X = X_all[:n_train]
    rd = Model09()
    rd.starter()
    if not submit:
        cv_run(rd, X, y)
        return
    else:
        print "Prepare submission.."
        print "training on full data"
        rd.fit(X_all[:n_train], y)
        Xtest = X_all[n_train:]
        pred = rd.predict_proba(Xtest)[:, 1]
        import submit
        submit.do_submit(pred)
def fit(self, Xmask, y):
    X_all_df, _ = prepare.Prepare_0(model=4).load(preproc=0, update=False)
    names = list(X_all_df.columns)
    X_all = np.asarray(X_all_df)
    self.X_all, self.names = X_all, names
    clf = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=1,
                                fit_intercept=True, intercept_scaling=1.0,
                                class_weight=None, random_state=random_state)
    self.rd = Pipeline([
        ("trans", Transformer(names=self.names)),
        ("scaler", StandardScaler(with_mean=False)),
        ("est", clf),
    ])
    # m() is a repo helper defined elsewhere in this module
    self.rd.fit(m(self.X_all, Xmask), np.asarray(y))
    return self
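# m() is used above but defined elsewhere in this repo; from its call site it
# appears to select the masked rows of the precomputed feature matrix. A
# minimal stand-in under that assumption:
def m_sketch(X_all, Xmask):
    return X_all[np.asarray(Xmask)]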