def main():
    """Tune the logistic-regression C hyperparameter per feature set.

    For every 'feateng*.dat' feature file in utility.ddir that does not yet
    have a '*_bestc.txt' companion file, run get_best_hyperparam and write
    the chosen C value to that companion file.
    """
    print("Loading data...")
    y = utility.load_truth()

    print("Loading indexing...")
    # Cache the per-column one-hot encodings; building them is expensive.
    xtsfn = '{}/logreg_xts.pickle'.format(utility.ddir)
    if not os.path.exists(xtsfn):
        X_train = utility.load_encoded('train')
        nfeat = X_train.shape[1]
        Xts = [utility.OneHotEncoder(X_train[:, [i]])[0] for i in range(nfeat)]
        # Fix: the handle was previously never closed (resource leak).
        # NOTE(review): pickle normally wants binary mode ('wb'); text mode
        # only works with protocol 0 on Python 2 -- confirm before porting.
        with open(xtsfn, "w") as pf:
            pickle.dump(Xts, pf)
    else:
        with open(xtsfn) as pf:
            Xts = pickle.load(pf)

    # (removed unused 'mlist' accumulator)
    for featfn in os.listdir(utility.ddir):
        if not (featfn.startswith('feateng') and featfn.endswith('dat')):
            continue
        modelstr = os.path.splitext(featfn)[0]
        featfn = utility.ddir + '/' + featfn
        paramfn = featfn.replace('.dat', '_bestc.txt')
        if os.path.exists(paramfn):
            continue  # already tuned
        print(modelstr)
        features = np.load(featfn)
        bestC = get_best_hyperparam(features, Xts, y)
        # Fix: write via a context manager instead of open/write/close.
        with open(paramfn, 'w') as ofile:
            ofile.write('{}\n'.format(bestC))
def main():
    """Find and persist the best logistic-regression C for each feature set.

    Scans utility.ddir for 'feateng*.dat' files lacking a '*_bestc.txt'
    companion, tunes C via get_best_hyperparam, and writes the result.
    """
    print("Loading data...")
    y = utility.load_truth()

    print("Loading indexing...")
    xtsfn = '{}/logreg_xts.pickle'.format(utility.ddir)
    if not os.path.exists(xtsfn):
        # Build and cache the per-column one-hot encodings.
        X_train = utility.load_encoded('train')
        nfeat = X_train.shape[1]
        Xts = [utility.OneHotEncoder(X_train[:, [i]])[0] for i in range(nfeat)]
        # Fix: close the pickle handle (was leaked via open() inline).
        with open(xtsfn, "w") as pf:
            pickle.dump(Xts, pf)
    else:
        with open(xtsfn) as pf:
            Xts = pickle.load(pf)

    # (removed unused 'mlist' accumulator)
    for featfn in os.listdir(utility.ddir):
        if not (featfn.startswith('feateng') and featfn.endswith('dat')):
            continue
        modelstr = os.path.splitext(featfn)[0]
        featfn = utility.ddir + '/' + featfn
        paramfn = featfn.replace('.dat', '_bestc.txt')
        if os.path.exists(paramfn):
            continue  # skip feature sets that were already tuned
        print(modelstr)
        features = np.load(featfn)
        bestC = get_best_hyperparam(features, Xts, y)
        with open(paramfn, 'w') as ofile:
            ofile.write('{}\n'.format(bestC))
def main():
    """Generate level-1 test data and write three stacked submissions.

    Collects every tuned feature set, builds (or loads a cached copy of)
    the level-1 test matrix, then writes submissions using the trained
    logreg weights, the linreg weights, and a uniform average.
    """
    print "Loading data..."
    X_train = utility.load_encoded('train')
    X_test = utility.load_encoded('test')
    y = utility.load_truth()

    # Collect model names whose feature file AND tuned-C file both exist.
    mlist = []
    for featfn in os.listdir(utility.ddir):
        if not (featfn.startswith('feateng') and featfn.endswith('dat')):
            continue
        modelstr = os.path.splitext(featfn)[0]
        featfn = utility.ddir + '/' + featfn
        if not os.path.exists(featfn):
            continue
        paramfn = featfn.replace('.dat', '_bestc.txt')
        if not os.path.exists(paramfn):
            continue
        mlist.append(modelstr)

    print "Generating level1 test data..."
    X_level1_test = None
    X_level1_testfn = utility.ddir + '/fullmodel_precombined.dat'
    if os.path.exists(X_level1_testfn):
        # Reuse the cached combined matrix when available.
        X_level1_test = np.load(X_level1_testfn)
    else:
        X_level1_test = generate_level1_test(mlist, X_train, X_test, y)
        X_level1_test.dump(X_level1_testfn)

    print "Writing submissions..."
    # Submission 1: optimized logreg stacking weights.
    weightfn = 'logreg_level1weights_rev{}.dat'.format(utility.logregrev)
    weights = np.load(utility.ddir + '/' + weightfn)
    final_submission = fopt_pred(weights, X_level1_test)
    utility.create_test_submission(
        'logreg_stacked_preds_rev{}.csv'.format(utility.logregrev),
        np.ravel(final_submission))

    print "Getting gbm trained models..."
    gbrfn = '{}/gbr_nest1000.csv'.format(utility.subdir)
    gbmone = np.array(pd.io.parsers.read_csv(gbrfn)['Action'])
    # NOTE(review): x_level1_test (with the gbm column appended) is never
    # used below -- the linreg/avg submissions are computed from
    # X_level1_test instead. Possibly intentional, possibly a bug; confirm.
    x_level1_test = np.transpose(np.vstack((X_level1_test.T, gbmone)))

    # Submission 2: linear-regression stacking weights.
    lgwfn = 'logreg_level1weights_linreg_rev{}.dat'.format(utility.logregrev)
    lgweights = np.load(utility.ddir + '/' + lgwfn)
    final_linregsubmission = fopt_pred(lgweights, X_level1_test)
    utility.create_test_submission(
        'logreg_stacked_preds_linreg_rev{}.csv'.format(utility.logregrev),
        np.ravel(final_linregsubmission))

    # Submission 3: uniform average of the level-1 columns.
    avgweights = np.ones(len(lgweights)) / float(len(lgweights))
    final_avgsubmission = fopt_pred(avgweights, X_level1_test)
    utility.create_test_submission(
        'logreg_stacked_preds_avg_rev{}.csv'.format(utility.logregrev),
        np.ravel(final_avgsubmission))
def main():
    """Train the level-1 stacking weights and save both variants to disk.

    Loads the precomputed level-1 training matrix, fits the logistic and
    the linear weight vectors, and dumps each to its revisioned file.
    """
    base = utility.ddir + '/'
    rev = utility.logregrev

    stacked = np.load(base + 'logreg_level1data_rev{}.dat'.format(rev))
    truth = np.array(utility.load_truth(), dtype=np.float64)

    logreg_weights = train_level1(stacked, truth)
    logreg_weights.dump(base + 'logreg_level1weights_rev{}.dat'.format(rev))

    linreg_weights = train_level1_linreg(stacked, truth)
    linreg_weights.dump(
        base + 'logreg_level1weights_linreg_rev{}.dat'.format(rev))
def main():
    """Compute and persist both sets of level-1 blending weights."""
    data_fn = 'logreg_level1data_rev{}.dat'.format(utility.logregrev)
    features = np.load(utility.ddir + '/' + data_fn)
    labels = np.array(utility.load_truth(), dtype=np.float64)

    # Fit each trainer and dump its weights to the matching revisioned file.
    for fit, template in ((train_level1,
                           'logreg_level1weights_rev{}.dat'),
                          (train_level1_linreg,
                           'logreg_level1weights_linreg_rev{}.dat')):
        w = fit(features, labels)
        w.dump(utility.ddir + '/' + template.format(utility.logregrev))
def main():
    """Run forward feature selection over six random seeds.

    Builds (or loads a cached copy of) the per-column one-hot encodings
    of the training data, then runs forward_feature_selection for seeds
    0 through 5.
    """
    X_train = utility.load_encoded('train')
    X_test = utility.load_encoded('test')
    y = utility.load_truth()

    xtsfn = '{}/logreg_xts.pickle'.format(utility.ddir)
    if not os.path.exists(xtsfn):
        # BUG FIX: 'nfeat' was referenced but never defined on this path
        # (NameError); derive it from the training matrix as the sibling
        # main() variants in this file do.
        nfeat = X_train.shape[1]
        Xts = [utility.OneHotEncoder(X_train[:, [i]])[0] for i in range(nfeat)]
        # Fix: close the pickle handle (was leaked via open() inline).
        with open(xtsfn, "w") as pf:
            pickle.dump(Xts, pf)
    else:
        with open(xtsfn) as pf:
            Xts = pickle.load(pf)

    for iseed in range(6):
        forward_feature_selection(iseed, X_train, X_test, y, Xts)
def main():
    """Run forward feature selection for seeds 0-4, skipping finished ones.

    One-hot encodings of the training columns are cached to a pickle so
    repeated runs skip the expensive encoding step; a seed is skipped when
    its output file already exists.
    """
    print("Loading data...")
    y = utility.load_truth()

    print("Loading indexing...")
    xtsfn = '{}/logreg_xts.pickle'.format(utility.ddir)
    if not os.path.exists(xtsfn):
        X_train = utility.load_encoded('train')
        nfeat = X_train.shape[1]
        Xts = [utility.OneHotEncoder(X_train[:, [i]])[0] for i in range(nfeat)]
        # Fix: close the pickle handle (was leaked via open() inline).
        with open(xtsfn, "w") as pf:
            pickle.dump(Xts, pf)
    else:
        with open(xtsfn) as pf:
            Xts = pickle.load(pf)

    for iseed in range(5):
        seedfn = '{}/feateng_forward_seed{}.dat'.format(utility.ddir, iseed)
        if not os.path.exists(seedfn):
            forward_feature_selection(iseed, Xts, y)
def main():
    """Forward feature selection over five seeds with cached encodings.

    Resumable: a seed is only run when its 'feateng_forward_seed*.dat'
    output is not yet on disk.
    """
    print("Loading data...")
    y = utility.load_truth()

    print("Loading indexing...")
    xtsfn = '{}/logreg_xts.pickle'.format(utility.ddir)
    if not os.path.exists(xtsfn):
        # Build and cache the per-column one-hot encodings once.
        X_train = utility.load_encoded('train')
        nfeat = X_train.shape[1]
        Xts = [utility.OneHotEncoder(X_train[:, [i]])[0] for i in range(nfeat)]
        # Fix: the file handle was previously never closed.
        with open(xtsfn, "w") as pf:
            pickle.dump(Xts, pf)
    else:
        with open(xtsfn) as pf:
            Xts = pickle.load(pf)

    for iseed in range(5):
        seedfn = '{}/feateng_forward_seed{}.dat'.format(utility.ddir, iseed)
        if not os.path.exists(seedfn):
            forward_feature_selection(iseed, Xts, y)
def main():
    """Build and save the level-1 training matrix from tuned feature sets."""
    print("Loading data...")
    encoded_train = utility.load_encoded('train')
    truth = utility.load_truth()

    # A feature set participates only once its C value has been tuned
    # (i.e. its '*_bestc.txt' companion exists).
    tuned_models = []
    for name in os.listdir(utility.ddir):
        if name.startswith('feateng') and name.endswith('dat'):
            full_path = utility.ddir + '/' + name
            if os.path.exists(full_path.replace('.dat', '_bestc.txt')):
                tuned_models.append(os.path.splitext(name)[0])

    level1_data = generate_level1(tuned_models, encoded_train, truth)
    out_name = 'logreg_level1data_rev{}.dat'.format(utility.logregrev)
    level1_data.dump(utility.ddir + '/' + out_name)
def main():
    """Assemble the level-1 training data from every tuned feature file."""
    print("Loading data...")
    X_train = utility.load_encoded('train')
    y = utility.load_truth()

    def _has_best_c(fn):
        # Only feature sets with a tuned-C companion file are included.
        path = utility.ddir + '/' + fn
        return os.path.exists(path.replace('.dat', '_bestc.txt'))

    mlist = [os.path.splitext(fn)[0]
             for fn in os.listdir(utility.ddir)
             if fn.startswith('feateng') and fn.endswith('dat')
             and _has_best_c(fn)]

    level1_data = generate_level1(mlist, X_train, y)
    level1fn = 'logreg_level1data_rev{}.dat'.format(utility.logregrev)
    level1_data.dump(utility.ddir + '/' + level1fn)
#Type of testing to be made: # n-grams = 0 # user mentions = 1 # hashtags = 2 type = 0 #Path to model that we want to test, depending on type model_path = 'models/covid_4epochs_95.0_accuracy_v2.pth' if torch.cuda.is_available(): device = torch.device('cuda') else: device = torch.device('cpu') train_data, train_ids = utility.load_and_process(train_dir, type) ground_truth_train = utility.load_truth(train_dir + 'truth.txt') test_data, test_ids = utility.load_and_process(test_dir, type) ground_truth_test = utility.load_truth(test_dir + 'truth.txt') tokenizer = get_tokenizer('spacy', language='en_core_web_sm') vocab_data = train_data + test_data word2idx, max_length = utility.build_vocab(vocab_data, tokenizer) test_data = utility.create_tensors(test_data, word2idx, tokenizer, max_length) test_data_with_labels = utility.append_truth( test_data, ground_truth_test, test_ids)
def main():
    """10-fold CV comparing NB-calibrated, plain LR, and averaged predictions.

    For each fold, prints the AUC of (1) a MultinomialNB whose output is
    calibrated by a one-feature logistic regression, (2) a plain logistic
    regression on the one-hot features, and (3) their simple average; then
    prints the mean AUC of each approach.
    """
    labels = utility.load_truth()
    encoded = utility.load_encoded('train')
    good_features = [
        0, 8, 9, 10, 19, 34, 36, 37, 38, 41, 42, 43, 47, 53, 55, 60, 61, 63,
        64, 67, 69, 71, 75, 81, 82, 85
    ]
    encoded, keymap = utility.OneHotEncoder(encoded[:, good_features])
    n_rows = encoded.shape[0]

    scores_nb = []
    scores_lgr = []
    scores_avg = []
    for fold_train, fold_test in KFold(n_rows, 10, random_state=utility.SEED):
        tr_X, te_X = encoded[fold_train], encoded[fold_test]
        tr_y, te_y = labels[fold_train], labels[fold_test]

        # Naive Bayes, calibrated through a one-feature logistic regression.
        nb = MultinomialNB()
        nb.fit(tr_X, tr_y)
        nb_train_proba = nb.predict_proba(tr_X)[:, 1]
        calib = LogisticRegression()
        calib.fit(np.reshape(nb_train_proba, (len(fold_train), 1)), tr_y)

        nb_test_proba = np.reshape(nb.predict_proba(te_X)[:, 1],
                                   (len(fold_test), 1))
        pred_nb = calib.predict_proba(nb_test_proba)[:, 1]

        # Plain logistic regression on the raw one-hot features.
        plain = LogisticRegression()
        plain.fit(tr_X, tr_y)
        pred_lgr = plain.predict_proba(te_X)[:, 1]

        # Simple average of both predictors.
        pred_avg = np.mean(np.vstack((pred_nb, pred_lgr)), axis=0)

        # Record and report the three fold scores (blank line separates folds).
        print("")
        for preds, bucket in ((pred_nb, scores_nb),
                              (pred_lgr, scores_lgr),
                              (pred_avg, scores_avg)):
            fold_score = auc_score(te_y, preds)
            bucket.append(fold_score)
            print(fold_score)

    print(np.mean(scores_nb))
    print(np.mean(scores_lgr))
    print(np.mean(scores_avg))
import utility
import numpy as np
import pandas as pd
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LogisticRegression

# NOTE(review): sklearn.grid_search / score_func / grid_scores_ belong to the
# pre-0.18 scikit-learn API (modern: model_selection, scoring=, cv_results_).

X = utility.load_encoded('train')
y = utility.load_truth()
Xtest = utility.load_encoded('test')

# Single-point "grid": effectively fits one huber-loss SGD regressor with an
# L1 penalty, evaluated with the project's AUC metric over 3-fold CV.
tuned_parameters = {'loss': ['huber'], 'penalty': ['l1'], 'alpha': [1e-8],
                    'n_iter': [1000], 'p': [0.1]}
clf = GridSearchCV(SGDRegressor(verbose=1), tuned_parameters,
                   score_func=utility.eval_auc, cv=3)
clf.fit(X, y)

# Report the CV score for each parameter combination (only one here).
for params, avgscore, scores in clf.grid_scores_:
    print avgscore, params

# Sanity check: predictions of the best estimator on the training data.
pred = clf.best_estimator_.predict(X)
print pred
import utility
import numpy as np
import pandas as pd
import multiprocessing
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

X = utility.load_encoded('train')
y = utility.load_truth()
Xtest = utility.load_encoded('test')


def cross_validate(i):
    """Return (seed, AUC) for one 80/20 split scored with a C=2 logistic regression."""
    X_cvtrain, X_cvtest, y_cvtrain, y_cvtest = train_test_split(
        X, y, test_size=0.2, random_state=i)
    lgr = LogisticRegression(C=2)
    lgr.fit(X_cvtrain, y_cvtrain)
    return i, utility.eval_auc(y_cvtest, lgr.predict_proba(X_cvtest)[:, 1])


ncvs = 5
pool = multiprocessing.Pool(ncvs)
res = np.zeros(ncvs)
try:
    # Fix: drive the loop from ncvs (range(5) was hard-coded) and actually
    # record each fold's AUC in 'res' (previously allocated but never filled).
    for i, auc in pool.imap(cross_validate, range(ncvs)):
        res[i] = auc
        print("{}: {}".format(i, auc))
finally:
    # Fix: the pool was never closed/joined, leaking worker processes.
    pool.close()
    pool.join()
def main():
    """10-fold CV comparing three predictors on one-hot encoded features.

    Per fold, scores (1) MultinomialNB calibrated through a one-feature
    logistic regression, (2) plain logistic regression, and (3) the average
    of both; prints each fold's AUCs and the three overall means.
    """
    y = utility.load_truth()
    X_train = utility.load_encoded('train')
    # Feature subset chosen by earlier selection runs.
    good_features = [0, 8, 9, 10, 19, 34, 36, 37, 38, 41, 42,
                     43, 47, 53, 55, 60, 61, 63, 64, 67, 69,
                     71, 75, 81, 82, 85]
    X_train, keymap = utility.OneHotEncoder(X_train[:, good_features])
    ntrain = X_train.shape[0]

    nb_cvscores = []
    lgr_cvscores = []
    combined_cvscores = []
    cvgen = KFold(ntrain, 10, random_state=utility.SEED)
    for train_inds, test_inds in cvgen:
        X_cvtrain = X_train[train_inds]
        X_cvtest = X_train[test_inds]
        y_cvtrain = y[train_inds]
        y_cvtest = y[test_inds]

        # Fit the Bayesian classifier, then calibrate its probabilities with
        # a one-feature logistic regression trained on the NB outputs.
        mb = MultinomialNB()
        mb.fit(X_cvtrain, y_cvtrain)
        mbpred_cvtrain = mb.predict_proba(X_cvtrain)[:, 1]
        lgr = LogisticRegression()
        lgr.fit(np.reshape(mbpred_cvtrain, (len(train_inds), 1)), y_cvtrain)

        # Predict the held-out fold through the calibrated NB pipeline.
        mbpred_cvtest = mb.predict_proba(X_cvtest)[:, 1]
        mbpred_cvtest = np.reshape(mbpred_cvtest, (len(test_inds), 1))
        nb_pred_cvtest = lgr.predict_proba(mbpred_cvtest)[:, 1]

        # Logistic regression only, on the raw one-hot features.
        lgrmodel = LogisticRegression()
        lgrmodel.fit(X_cvtrain, y_cvtrain)
        lgr_pred_cvtest = lgrmodel.predict_proba(X_cvtest)[:, 1]

        # Combined: simple average of both predictors.
        combined_pred_cvtest = np.mean(
            np.vstack((nb_pred_cvtest, lgr_pred_cvtest)), axis=0)

        # Record scores (blank line separates folds in the output).
        print
        nb_cvscore = auc_score(y_cvtest, nb_pred_cvtest)
        nb_cvscores.append(nb_cvscore)
        print nb_cvscore
        lgr_cvscore = auc_score(y_cvtest, lgr_pred_cvtest)
        lgr_cvscores.append(lgr_cvscore)
        print lgr_cvscore
        combined_cvscore = auc_score(y_cvtest, combined_pred_cvtest)
        combined_cvscores.append(combined_cvscore)
        print combined_cvscore

    # Overall mean AUC per approach.
    print np.mean(nb_cvscores)
    print np.mean(lgr_cvscores)
    print np.mean(combined_cvscores)
def main():
    """Train the tuned logistic-regression model and write a test submission.

    Scores the selected feature subset with 10x shuffle-split CV, then
    one-hot encodes train+test jointly, fits on the full training set, and
    writes the predicted positive-class probabilities as a submission CSV.
    """
    print("Reading dataset...")
    X_train_all = utility.load_encoded('train')
    X_test_all = utility.load_encoded('test')
    y = utility.load_truth()
    num_train = X_train_all.shape[0]
    num_feat = X_train_all.shape[1]

    print("Loading indexing...")
    # Per-column one-hot encodings, used to assemble the CV design matrix.
    Xts = [
        utility.OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_feat)
    ]

    print("Setting up the model...")
    model = linear_model.LogisticRegression()
    # FIX: the original `lambda (self, X): self.predict_proba(X)[:, 1]` used
    # Python-2-only tuple-parameter unpacking and, assigned as an instance
    # attribute (not a bound method), would have received only X and crashed
    # if ever invoked. Bind the model explicitly instead.
    model.predict = lambda X, _m=model: _m.predict_proba(X)[:, 1]

    # Feature subset and C chosen by earlier forward-selection/tuning runs.
    # (Stale alternative configurations that were commented out here have
    # been removed.)
    good_features = [
        0, 8, 9, 10, 19, 34, 36, 37, 38, 41, 42, 43, 47, 53, 55, 60, 61, 63,
        64, 67, 69, 71, 75, 81, 82, 85, 97, 103, 105, 108, 115, 122, 141
    ]
    model.C = 1.30775906845
    print("Selected features %s" % good_features)

    print("Getting a CV score...")
    N = 10
    cvgen = cross_validation.ShuffleSplit(num_train, N, 0.2, random_state=25)
    Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr()
    cvscores = cross_validation.cross_val_score(model, Xt, y, cv=cvgen,
                                                n_jobs=4, scoring='roc_auc')
    score = cvscores.mean()
    print("Mean CV score: {}".format(score))

    print("Performing One Hot Encoding on entire dataset...")
    # Encode train and test together so both share the same key map.
    Xt = np.vstack((X_train_all[:, good_features],
                    X_test_all[:, good_features]))
    Xt, keymap = utility.OneHotEncoder(Xt)
    X_train = Xt[:num_train]
    X_test = Xt[num_train:]

    print("Training full model...")
    model.fit(X_train, y)

    print("Making prediction and saving results...")
    preds = model.predict_proba(X_test)[:, 1]
    submitfn = 'logistic_regression_pred_headstart1.csv'
    utility.create_test_submission(submitfn, preds)