# Example 2
def main():

    print "Loading data..."
    y = utility.load_truth()

    print "Loading indexing..."
    Xts = None
    xtsfn = '{}/logreg_xts.pickle'.format(utility.ddir)
    if not os.path.exists(xtsfn):
        X_train = utility.load_encoded('train')
        nfeat = X_train.shape[1]
        Xts = [utility.OneHotEncoder(X_train[:, [i]])[0] for i in range(nfeat)]
        pickle.dump(Xts, open(xtsfn, "w"))
    else:
        Xts = pickle.load(open(xtsfn))

    mlist = []
    for featfn in os.listdir(utility.ddir):
        if not (featfn.startswith('feateng') and featfn.endswith('dat')):
            continue
        modelstr = os.path.splitext(featfn)[0]
        featfn = utility.ddir + '/' + featfn
        paramfn = featfn.replace('.dat', '_bestc.txt')
        if os.path.exists(paramfn): continue

        print modelstr

        features = np.load(featfn)
        bestC = get_best_hyperparam(features, Xts, y)
        ofile = open(paramfn, 'w')
        ofile.write('{}\n'.format(bestC))
        ofile.close()
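
# utility.OneHotEncoder is used throughout these snippets but never
# shown. A minimal sketch of what such a helper presumably looks like:
# each categorical column becomes a block of sparse indicator columns,
# and the keymap is returned so the identical encoding can be replayed
# on test data. The name and details here are assumptions, not the
# project's actual code.
from scipy import sparse

def one_hot_encoder_sketch(data, keymap=None):
    if keymap is None:
        keymap = [dict((key, i) for i, key in enumerate(set(col)))
                  for col in data.T]
    outdat = []
    for i, col in enumerate(data.T):
        km = keymap[i]
        spmat = sparse.lil_matrix((data.shape[0], len(km)))
        for j, val in enumerate(col):
            if val in km:
                spmat[j, km[val]] = 1
        outdat.append(spmat)
    return sparse.hstack(outdat).tocsr(), keymap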
# Example 3
def main():

    print "Loading data..."
    X_train = utility.load_encoded('train')
    X_test = utility.load_encoded('test')
    y = utility.load_truth()

    mlist = []
    for featfn in os.listdir(utility.ddir):
        if not (featfn.startswith('feateng') and featfn.endswith('dat')):
            continue
        modelstr = os.path.splitext(featfn)[0]
        featfn = utility.ddir + '/' + featfn
        if not os.path.exists(featfn): continue
        paramfn = featfn.replace('.dat', '_bestc.txt')
        if not os.path.exists(paramfn): continue

        mlist.append(modelstr)

    print "Generating level1 test data..."
    X_level1_test = None
    X_level1_testfn = utility.ddir + '/fullmodel_precombined.dat'
    if os.path.exists(X_level1_testfn):
        X_level1_test = np.load(X_level1_testfn)
    else:
        X_level1_test = generate_level1_test(mlist, X_train, X_test, y)
        X_level1_test.dump(X_level1_testfn)

    print "Writing submissions..."
    weightfn = 'logreg_level1weights_rev{}.dat'.format(utility.logregrev)
    weights = np.load(utility.ddir + '/' + weightfn)
    final_submission = fopt_pred(weights, X_level1_test)
    utility.create_test_submission(
        'logreg_stacked_preds_rev{}.csv'.format(utility.logregrev),
        np.ravel(final_submission))

    print "Getting gbm trained models..."
    gbrfn = '{}/gbr_nest1000.csv'.format(utility.subdir)
    gbmone = np.array(pd.io.parsers.read_csv(gbrfn)['Action'])
    X_level1_test = np.transpose(np.vstack((X_level1_test.T, gbmone)))

    lgwfn = 'logreg_level1weights_linreg_rev{}.dat'.format(utility.logregrev)
    lgweights = np.load(utility.ddir + '/' + lgwfn)
    final_linregsubmission = fopt_pred(lgweights, X_level1_test)
    utility.create_test_submission(
        'logreg_stacked_preds_linreg_rev{}.csv'.format(utility.logregrev),
        np.ravel(final_linregsubmission))

    avgweights = np.ones(len(lgweights)) / float(len(lgweights))
    final_avgsubmission = fopt_pred(avgweights, X_level1_test)
    utility.create_test_submission(
        'logreg_stacked_preds_avg_rev{}.csv'.format(utility.logregrev),
        np.ravel(final_avgsubmission))
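
# fopt_pred is defined outside this listing. Given how it is called
# above (a weight vector plus a matrix holding one model's predictions
# per column), a plausible sketch is a plain weighted blend; treat the
# name and the exact formula as assumptions, not the project's code.
import numpy as np

def fopt_pred_sketch(weights, X_level1):
    # One blended prediction per row: dot each row of level-1
    # predictions with the learned blend weights.
    return np.dot(X_level1, weights)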
def main():

    level1fn = 'logreg_level1data_rev{}.dat'.format(utility.logregrev)
    X_train = np.load(utility.ddir + '/' + level1fn)
    y = np.array(utility.load_truth(), dtype=np.float64)

    weightfn = 'logreg_level1weights_rev{}.dat'.format(utility.logregrev)
    weights = train_level1(X_train, y)
    weights.dump(utility.ddir + '/' + weightfn)

    lgwfn = 'logreg_level1weights_linreg_rev{}.dat'.format(utility.logregrev)
    lgweights = train_level1_linreg(X_train, y)
    lgweights.dump(utility.ddir + '/' + lgwfn)
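
# train_level1 and train_level1_linreg are defined elsewhere. A hedged
# sketch of the blending step, assuming the weights are found by
# directly maximizing AUC of the weighted column average; the function
# name, optimizer, and starting point are illustrative, not the
# project's actual choices.
import numpy as np
from scipy.optimize import fmin
from sklearn.metrics import auc_score

def train_level1_sketch(X_train, y):
    nmodels = X_train.shape[1]
    def neg_auc(weights):
        return -auc_score(y, np.dot(X_train, weights))
    w0 = np.ones(nmodels) / float(nmodels)  # start from a flat average
    return fmin(neg_auc, w0, disp=False)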
def main():

    X_train = utility.load_encoded('train')
    X_test = utility.load_encoded('test')
    y = utility.load_truth()

    xtsfn = '{}/logreg_xts.pickle'.format(utility.ddir)
    if not os.path.exists(xtsfn):
        nfeat = X_train.shape[1]
        Xts = [utility.OneHotEncoder(X_train[:, [i]])[0] for i in range(nfeat)]
        pickle.dump(Xts, open(xtsfn, "w"))
    else:
        Xts = pickle.load(open(xtsfn))

    for iseed in range(6):
        forward_feature_selection(iseed, X_train, X_test, y, Xts)
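
# forward_feature_selection is referenced with two different argument
# lists in this listing and is not defined here. A minimal sketch of
# the greedy search it presumably performs, written against the same
# era-appropriate scikit-learn API the other snippets use (the name
# and CV settings are illustrative assumptions):
from scipy import sparse
from sklearn import cross_validation, linear_model

def forward_selection_sketch(seed, Xts, y):
    # Repeatedly add the one-hot feature block that most improves the
    # mean CV AUC; stop as soon as no remaining block helps.
    model = linear_model.LogisticRegression()
    good_features, score_hist = [], []
    while len(good_features) < len(Xts):
        scores = []
        for f in range(len(Xts)):
            if f in good_features:
                continue
            Xt = sparse.hstack(
                [Xts[j] for j in good_features + [f]]).tocsr()
            cvgen = cross_validation.ShuffleSplit(len(y), 5, 0.2,
                                                  random_state=seed)
            score = cross_validation.cross_val_score(
                model, Xt, y, cv=cvgen, scoring='roc_auc').mean()
            scores.append((score, f))
        best_score, best_f = max(scores)
        if score_hist and best_score <= score_hist[-1]:
            break
        good_features.append(best_f)
        score_hist.append(best_score)
    return good_features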
# Example 7
def main():

    print "Loading data..."
    y = utility.load_truth()

    print "Loading indexing..."
    xtsfn = '{}/logreg_xts.pickle'.format(utility.ddir)
    if not os.path.exists(xtsfn):
        X_train = utility.load_encoded('train')
        nfeat = X_train.shape[1]
        Xts = [utility.OneHotEncoder(X_train[:, [i]])[0] for i in range(nfeat)]
        pickle.dump(Xts, open(xtsfn, "w"))
    else:
        Xts = pickle.load(open(xtsfn))

    for iseed in range(5):
        seedfn = '{}/feateng_forward_seed{}.dat'.format(utility.ddir, iseed)
        if not os.path.exists(seedfn):
            forward_feature_selection(iseed, Xts, y)
# Example 9
def main():

    print "Loading data..."
    X_train = utility.load_encoded('train')
    y = utility.load_truth()

    mlist = []
    for featfn in os.listdir(utility.ddir):
        if not (featfn.startswith('feateng') and featfn.endswith('dat')):
            continue
        modelstr = os.path.splitext(featfn)[0]
        featfn = utility.ddir + '/' + featfn
        paramfn = featfn.replace('.dat', '_bestc.txt')
        if not os.path.exists(paramfn): continue

        mlist.append(modelstr)

    level1_data = generate_level1(mlist, X_train, y)
    level1fn = 'logreg_level1data_rev{}.dat'.format(utility.logregrev)
    level1_data.dump(utility.ddir + '/' + level1fn)
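
# generate_level1 carries the actual stacking step but is not shown.
# A hedged sketch of the usual out-of-fold construction: each level-0
# model is refit on nine tenths of the data and predicts the held-out
# tenth, so the level-1 blender never trains on leaked predictions.
# The argument list is adapted for illustration; the real function
# presumably rebuilds the one-hot blocks itself.
import numpy as np
from scipy import sparse
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression

def generate_level1_sketch(mlist, Xts, y, ddir):
    ntrain = len(y)
    level1 = np.zeros((ntrain, len(mlist)))
    for m, modelstr in enumerate(mlist):
        feats = np.load('{}/{}.dat'.format(ddir, modelstr))
        bestC = float(open('{}/{}_bestc.txt'.format(ddir, modelstr)).read())
        Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
        model = LogisticRegression(C=bestC)
        for train_inds, test_inds in KFold(ntrain, 10):
            model.fit(Xt[train_inds], y[train_inds])
            level1[test_inds, m] = model.predict_proba(Xt[test_inds])[:, 1]
    return level1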
# Example 11
import torch
import utility
from torchtext.data.utils import get_tokenizer

# Type of testing to be made:
#   n-grams = 0
#   user mentions = 1
#   hashtags = 2
feature_type = 0  # renamed from `type`, which shadows the built-in

#Path to model that we want to test, depending on type
model_path = 'models/covid_4epochs_95.0_accuracy_v2.pth'

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

train_data, train_ids = utility.load_and_process(train_dir, feature_type)
ground_truth_train = utility.load_truth(train_dir + 'truth.txt')

test_data, test_ids = utility.load_and_process(test_dir, feature_type)
ground_truth_test = utility.load_truth(test_dir + 'truth.txt')

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

vocab_data = train_data + test_data

word2idx, max_length = utility.build_vocab(vocab_data, tokenizer)

test_data = utility.create_tensors(test_data, word2idx, tokenizer, max_length)

test_data_with_labels = utility.append_truth(
    test_data, ground_truth_test, test_ids)
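
# utility.build_vocab and utility.create_tensors come from the
# project's helper module, which is not part of this listing. A
# minimal sketch of a vocabulary builder with the same call shape
# (tokenized texts in; word-to-index map and longest document length
# out, for fixed-length padding):
def build_vocab_sketch(texts, tokenizer):
    word2idx = {'<pad>': 0, '<unk>': 1}
    max_length = 0
    for text in texts:
        tokens = tokenizer(text)
        max_length = max(max_length, len(tokens))
        for tok in tokens:
            if tok not in word2idx:
                word2idx[tok] = len(word2idx)
    return word2idx, max_length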
# Example 12
def main():

    y = utility.load_truth()
    X_train = utility.load_encoded('train')

    good_features = [
        0, 8, 9, 10, 19, 34, 36, 37, 38, 41, 42, 43, 47, 53, 55, 60, 61, 63,
        64, 67, 69, 71, 75, 81, 82, 85
    ]

    X_train, keymap = utility.OneHotEncoder(X_train[:, good_features])

    ntrain = X_train.shape[0]

    nb_cvscores = []
    lgr_cvscores = []
    combined_cvscores = []
    cvgen = KFold(ntrain, 10, random_state=utility.SEED)
    for train_inds, test_inds in cvgen:

        X_cvtrain = X_train[train_inds]
        X_cvtest = X_train[test_inds]
        y_cvtrain = y[train_inds]
        y_cvtest = y[test_inds]

        # Fit the multinomial naive Bayes classifier
        mb = MultinomialNB()
        mb.fit(X_cvtrain, y_cvtrain)
        mbpred_cvtrain = mb.predict_proba(X_cvtrain)[:, 1]

        lgr = LogisticRegression()
        lgr.fit(np.reshape(mbpred_cvtrain, (len(train_inds), 1)), y_cvtrain)

        # Predict the held-out fold
        mbpred_cvtest = mb.predict_proba(X_cvtest)[:, 1]
        mbpred_cvtest = np.reshape(mbpred_cvtest, (len(test_inds), 1))
        nb_pred_cvtest = lgr.predict_proba(mbpred_cvtest)[:, 1]

        # Logistic Regression Only
        lgrmodel = LogisticRegression()
        lgrmodel.fit(X_cvtrain, y_cvtrain)
        lgr_pred_cvtest = lgrmodel.predict_proba(X_cvtest)[:, 1]

        # Combined
        combined_pred_cvtest = np.mean(
            np.vstack((nb_pred_cvtest, lgr_pred_cvtest)), axis=0)

        # Record Scores
        print
        nb_cvscore = auc_score(y_cvtest, nb_pred_cvtest)
        nb_cvscores.append(nb_cvscore)
        print nb_cvscore

        lgr_cvscore = auc_score(y_cvtest, lgr_pred_cvtest)
        lgr_cvscores.append(lgr_cvscore)
        print lgr_cvscore

        combined_cvscore = auc_score(y_cvtest, combined_pred_cvtest)
        combined_cvscores.append(combined_cvscore)
        print combined_cvscore

    print np.mean(nb_cvscores)
    print np.mean(lgr_cvscores)
    print np.mean(combined_cvscores)
import utility
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDRegressor

X = utility.load_encoded('train')
y = utility.load_truth()
Xtest = utility.load_encoded('test')

tuned_parameters = {'loss': ['huber'], 'penalty': ['l1'],
                    'alpha': [1e-8], 'n_iter': [1000], 'p': [0.1]}
clf = GridSearchCV(SGDRegressor(verbose=1), tuned_parameters,
                   score_func=utility.eval_auc, cv=3)
clf.fit(X, y)

for params, avgscore, scores in clf.grid_scores_:
    print avgscore, params

pred = clf.best_estimator_.predict(X)
print pred
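
# Note: this snippet targets a pre-0.18 scikit-learn. On current
# versions roughly the same search would read as below; the module
# move (grid_search -> model_selection), score_func -> scoring, and
# the SGDRegressor renames n_iter -> max_iter and p -> epsilon are
# the assumed mapping.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor

modern_parameters = {'loss': ['huber'], 'penalty': ['l1'],
                     'alpha': [1e-8], 'max_iter': [1000], 'epsilon': [0.1]}
modern_clf = GridSearchCV(SGDRegressor(verbose=1), modern_parameters,
                          scoring='roc_auc', cv=3)
# modern_clf.fit(X, y); results then live in modern_clf.cv_results_.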
# Example 14
import utility

import numpy as np
import multiprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

X = utility.load_encoded('train')
y = utility.load_truth()
Xtest = utility.load_encoded('test')


def cross_validate(i):
    X_cvtrain, X_cvtest, y_cvtrain, y_cvtest = train_test_split(X,
                                                                y,
                                                                test_size=0.2,
                                                                random_state=i)

    lgr = LogisticRegression(C=2)
    lgr.fit(X_cvtrain, y_cvtrain)
    return i, utility.eval_auc(y_cvtest, lgr.predict_proba(X_cvtest)[:, 1])


ncvs = 5
pool = multiprocessing.Pool(ncvs)
res = np.zeros(ncvs)
for i, auc in pool.imap(cross_validate, range(ncvs)):
    res[i] = auc
    print "{}: {}".format(i, auc)
def main():

    print "Reading dataset..."
    X_train_all = utility.load_encoded('train')
    X_test_all = utility.load_encoded('test')
    y = utility.load_truth()

    num_train = X_train_all.shape[0]
    num_feat = X_train_all.shape[1]

    print "Loading indexing..."
    Xts = [
        utility.OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_feat)
    ]

    print "Setting up the model..."
    model = linear_model.LogisticRegression()
    # Patch predict() so it returns the positive-class probability;
    # scorers that call predict() then receive continuous scores.
    model.predict = lambda X: model.predict_proba(X)[:, 1]

    # good_features = [0, 8, 9, 10, 19, 34, 36, 37, 38, 41, 42,
    #                  43, 47, 53, 55, 60, 61, 63, 64, 67, 69,
    #                  71, 75, 81, 82, 85, 97, 103, 105, 111,
    #                  112, 114, 125, 127]
    # model.C = 1.30775906845

    good_features = [
        0, 8, 9, 10, 19, 34, 36, 37, 38, 41, 42, 43, 47, 53, 55, 60, 61, 63,
        64, 67, 69, 71, 75, 81, 82, 85, 97, 103, 105, 108, 115, 122, 141
    ]
    model.C = 1.30775906845

    # good_features = [0, 8, 9, 10, 19, 34, 36, 37, 38, 41, 42, 43, 47, 53,
    #                  55, 60, 61, 63, 64, 67, 69, 71, 75, 81, 82, 85]
    # model.C = 1.485994

    # good_features = [ 0,  7,  8, 29, 42, 63, 64, 67, 69, 85]
    # model.C = 1.09355990876
    print "Selected features %s" % good_features

    print "Getting a CV score..."
    N = 10
    cvgen = cross_validation.ShuffleSplit(num_train, N, 0.2, random_state=25)
    Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr()
    cvscores = cross_validation.cross_val_score(model,
                                                Xt,
                                                y,
                                                cv=cvgen,
                                                n_jobs=4,
                                                scoring='roc_auc')
    score = cvscores.mean()
    print "Mean CV score: {}".format(score)

    print "Performing One Hot Encoding on entire dataset..."
    Xt = np.vstack((X_train_all[:, good_features],
                    X_test_all[:, good_features]))
    Xt, keymap = utility.OneHotEncoder(Xt)
    X_train = Xt[:num_train]
    X_test = Xt[num_train:]

    print "Training full model..."
    model.fit(X_train, y)

    print "Making prediction and saving results..."
    preds = model.predict_proba(X_test)[:, 1]
    submitfn = 'logistic_regression_pred_headstart1.csv'
    utility.create_test_submission(submitfn, preds)
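
# utility.create_test_submission is not included in the listing. A
# hedged sketch, assuming the usual two-column Kaggle-style CSV (the
# header names and 1-based ids are assumptions):
def create_test_submission_sketch(filename, prediction):
    with open(filename, 'w') as f:
        f.write('Id,Action\n')
        for i, pred in enumerate(prediction):
            f.write('{},{}\n'.format(i + 1, pred))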