Example no. 1
import os

import feather
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

import libavito
# bagged_set and printfilcsve are helpers defined elsewhere in the author's codebase.


def main():

    Use_scale = True
    Usecv = True  # True will split the training data 66-33 and do CV
    SEED = 15

    config = libavito.get_config()
    cache_loc = config.cache_loc
    nthreads = config.nthreads
    threads = nthreads  # number of workers for parallelism

    ######### Load files ############
    print("Loading input data")
    train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
    y = train['isDuplicate'].values
    X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], axis=1).values
    del train
    print(X.shape)
    test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
    ids = test['id'].values
    X_test = test.drop(['itemID_1', 'itemID_2', 'id'], axis=1).values
    del test
    print(X_test.shape)

    metafolder = cache_loc + "meta_folder/"  # folder used to store the meta predictions
    # replace negative values with zero
    X[X < 0] = 0
    X_test[X_test < 0] = 0
    # transform the data with log1p
    X = np.log1p(X)
    X_test = np.log1p(X_test)

    # create meta folder to hold predictions for train and test
    if not os.path.exists(metafolder):  # if it does not exist, create it
        os.makedirs(metafolder)

    outset = "marios_nnnew_v3"  # prefix for all output files

    # the model for this script is built inside bagged_set (no estimator is passed)

    idex1 = [k for k in range(0, (X.shape[0] * 2) // 3)]  # indices for train
    idex2 = [k for k in range((X.shape[0] * 2) // 3, X.shape[0])]  # indices for validation
    kfolder = [[idex1, idex2]]  # a single 66/33 train/validation split
    # create arrays for meta predictions
    train_stacker = [0.0 for k in range(0, len(idex2))]
    test_stacker = [0.0 for k in range(0, (X_test.shape[0]))]
    # NOTE: this could be more efficient

    # accumulator for the CV metric
    mean_kapa = 0.0
    #kfolder=StratifiedKFold(y, n_folds=number_of_folds,shuffle=True, random_state=SEED)
    #number_of_folds=0
    #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time
    i = 0  # iterator counter
    if Usecv:
        print("starting cross validation")
        for train_index, test_index in kfolder:
            # training and validation sets
            X_train, X_cv = X[train_index], X[test_index]
            y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
            print(" train size: %d. test size: %d, cols: %d " %
                  ((X_train.shape[0]), (X_cv.shape[0]), (X_train.shape[1])))

            if Use_scale:
                stda = StandardScaler()
                X_train = stda.fit_transform(X_train)
                X_cv = stda.transform(X_cv)

            preds = bagged_set(X_train,
                               y_train,
                               SEED,
                               10,
                               X_cv,
                               nval=0.0,
                               verbos=0)

            # compute AUC for this CV fold
            #scalepreds(preds)
            kapa = roc_auc_score(y_cv, preds)
            print("size train: %d size cv: %d AUC (fold %d/%d): %f" %
                  ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa))

            mean_kapa += kapa
            #save the results
            no = 0
            for real_index in test_index:
                train_stacker[no] = (preds[no])
                no += 1
            i += 1
        if Usecv:
            print(" Average AUC: %f" % (mean_kapa))
            print(" printing train datasets ")
            printfilcsve(np.array(train_stacker),
                         metafolder + outset + "train.csv")

    if Use_scale:
        stda = StandardScaler()
        X = stda.fit_transform(X)
        X_test = stda.transform(X_test)

    #preds=bagged_set(X, y,model, SEED, 1, X_test, update_seed=True)

    preds = bagged_set(X, y, SEED, 10, X_test, nval=0.0, verbos=0)

    for pr in range(0, len(preds)):
        test_stacker[pr] = (preds[pr])

    preds = np.array(preds)
    printfilcsve(np.array(test_stacker), metafolder + outset + "test.csv")

    print("Write results...")
    output_file = "submission_" + outset + str((mean_kapa)) + ".csv"
    print("Writing submission to %s" % output_file)
    f = open(config.output_loc + output_file, "w")
    f.write("id,probability\n")  # the header
    for g in range(0, len(preds)):
        pr = preds[g]
        f.write("%d,%f\n" % (((ids[g]), pr)))
    f.close()
    print("Done.")
Example no. 2
import numpy as np
import feather
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import libavito
# bagged_set, printfile, load_datas and printfilcsve are helpers defined
# elsewhere in the author's codebase.


def main():
    config = libavito.get_config()
    nthreads = config.nthreads
    cache_loc = config.cache_loc
    output_loc = config.output_loc

    load_data = True
    SEED = 15
    Usecv = True
    meta_folder = cache_loc + "meta_folder/"  # folder holding the held-out and test predictions used later for meta modelling
    # list of meta models; train held-out predictions end in 'train.csv', test set predictions in 'test.csv'
    meta = [
        'marios_xg_v1', 'marios_nn_v1', 'marios_nnnew_v2', 'marios_sgd_v2',
        'marios_logit_v2', 'marios_ridge_v2', 'marios_xgson_v2',
        'marios_xgrank_v2', 'marios_xgson_v3', 'marios_nnnew_v3',
        'marios_xgrank_v3', 'marios_xgreg_v3', 'marios_nnnew_v4',
        'marios_xgson_v4', 'marios_xgsonv2_v5'
    ]

    bags = 5  # helps to avoid overfitting: instead of 1, we run several models with different seeds and shuffling
    ######### Load files (...or not!) ############

    #y = np.loadtxt(meta_folder+"meta_pairs_and_labels.csv", delimiter=',',usecols=[2], skiprows=1)
    #ids=np.loadtxt("Random_submission.csv", delimiter=',',usecols=[0], skiprows=1)
    print("Loading input data")
    train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
    y = train['isDuplicate'].values
    del train
    test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
    ids = test['id'].values
    del test

    # the stacked train features form a dataset provided by Henk and Mathes that contains
    # a couple of FTRL models, non-alphanumeric features, sentiment scores and some additional models
    if load_data:
        Xmetatrain = None
        Xmetatest = None
        for modelname in meta:
            # load the held-out predictions of this model
            mini_xtrain = np.loadtxt(meta_folder + modelname + 'train.csv')
            # load the test set predictions of this model
            mini_xtest = np.loadtxt(meta_folder + modelname + 'test.csv')
            # calculate the means of the held-out and test predictions for reconciliation purposes
            mean_train = np.mean(mini_xtrain)
            mean_test = np.mean(mini_xtest)
            # print the AUC and the means as a sanity check: if, e.g., the mean of the
            # train set preds is 1232314.34 and the test is 0.7, something is wrong
            print("model %s auc %f mean train/test %f/%f " %
                  (modelname, roc_auc_score(np.array(y), mini_xtrain),
                   mean_train, mean_test))
            if Xmetatrain is None:
                Xmetatrain = mini_xtrain
                Xmetatest = mini_xtest
            else:
                Xmetatrain = np.column_stack((Xmetatrain, mini_xtrain))
                Xmetatest = np.column_stack((Xmetatest, mini_xtest))
        # we combine with the stacked features
        X = Xmetatrain
        X_test = Xmetatest
        # save the feature matrices as pickles
        printfile(X, meta_folder + "xmetahome.pkl")
        printfile(X_test, meta_folder + "xtmetahome.pkl")

        X = load_datas(meta_folder + "xmetahome.pkl")
        print("rows %d columns %d " % (X.shape[0], X.shape[1]))
        #X_test=load_datas("onegramtest.pkl")
        #print("rows %d columns %d " % (X_test.shape[0],X_test.shape[1] ))
    else:

        X = load_datas(meta_folder + "xmetahome.pkl")
        print("rows %d columns %d " % (X.shape[0], X.shape[1]))
        X_test = load_datas(meta_folder + "xtmetahome.pkl")
        print("rows %d columns %d " % (X_test.shape[0], X_test.shape[1]))

    outset = "marios_rf_meta_v1"  # Name of the model (quite catchy admitedly)

    print("len of target=%d" % (len(y))
          )  # print the length of the target variable because we can

    #model we are going to use
    #ExtraTreesClassifier

    model = RandomForestClassifier(n_estimators=500,
                                   criterion='entropy',
                                   max_depth=9,
                                   min_samples_leaf=2,
                                   max_features=8,
                                   n_jobs=nthreads,
                                   random_state=1,
                                   verbose=1)

    #model=LogisticRegression(C=0.01)
    idex1 = [k for k in range(0, (X.shape[0] * 2) // 3)]  # indices for train
    idex2 = [k for k in range((X.shape[0] * 2) // 3, X.shape[0])]  # indices for validation
    kfolder = [[idex1, idex2]]  # a single 66/33 train/validation split

    #arrays to save predictions for validation and test for meta modelling (stacking)
    train_stacker = [0.0 for k in range(0, len(idex2))]
    test_stacker = [0.0 for k in range(0, (X_test.shape[0]))]

    # accumulator for the CV metric
    mean_kapa = 0.0
    #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time
    i = 0  # iterator counter
    if Usecv:
        print("starting cross validation")
        for train_index, test_index in kfolder:
            # training and validation sets
            X_train, X_cv = X[train_index], X[test_index]
            y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
            print(" train size: %d. test size: %d, cols: %d " %
                  ((X_train.shape[0]), (X_cv.shape[0]), (X_train.shape[1])))

            # use the bagger with the random forest model
            preds = bagged_set(X_train,
                               y_train,
                               model,
                               SEED,
                               bags,
                               X_cv,
                               update_seed=True)

            # compute AUC for this CV fold
            #scalepreds(preds)
            kapa = roc_auc_score(y_cv, preds)
            print("size train: %d size cv: %d AUC (fold %d/%d): %f" %
                  ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa))

            mean_kapa += kapa
            #save the results
            no = 0
            for real_index in test_index:
                train_stacker[no] = (preds[no])
                no += 1
            i += 1
        if Usecv:
            #print the array of validation predictions for stacking later on inside the 'meta_folder'
            print(" Average AUC: %f" % (mean_kapa))
            print(" printing train datasets ")
            printfilcsve(np.array(train_stacker),
                         meta_folder + outset + "train.csv")

    preds = bagged_set(X, y, model, SEED, bags, X_test, update_seed=True)

    for pr in range(0, len(preds)):
        test_stacker[pr] = (preds[pr])
    #print prediction as numpy array for stacking later on
    preds = np.array(preds)
    printfilcsve(np.array(test_stacker), meta_folder + outset + "test.csv")

    #create submission file
    print("Write results...")
    output_file = "submission_" + outset + str((mean_kapa)) + ".csv"
    print("Writing submission to %s" % output_file)
    f = open(output_loc + output_file, "w")
    f.write("id,probability\n")  # the header
    for g in range(0, len(preds)):
        pr = preds[g]
        f.write("%d,%f\n" % (((ids[g]), pr)))
    f.close()
    print("Done.")
Example no. 3
import os

import feather
import numpy as np
from sklearn.metrics import roc_auc_score

import libavito
import xg  # the author's custom XGBoost wrapper
# bagged_set and printfilcsve are helpers defined elsewhere in the author's codebase.


def main():
    config = libavito.get_config()
    cache_loc = config.cache_loc
    nthreads = config.nthreads

    Usecv = True  # True will split the training data 66-33 and do CV
    SEED = 15
    threads = nthreads  # number of workers for parallelism

    ######### Load files ############
    print("Loading input data")
    train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
    y = train['isDuplicate'].values
    X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], axis=1).values
    del train
    print(X.shape)
    test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
    ids = test['id'].values
    X_test = test.drop(['itemID_1', 'itemID_2', 'id'], axis=1).values
    del test
    print(X_test.shape)

    metafolder = cache_loc + "meta_folder/"  # folder used to store the meta predictions
    if not os.path.exists(metafolder):  # if it does not exist, create it
        os.makedirs(metafolder)
    outset = "marios_xgson_v2"  # prefix for all output files

    # model to use
    model = xg.XGBoostClassifier(num_round=1000, nthread=threads, eta=0.02,
                                 gamma=7.0, max_depth=20, min_child_weight=20,
                                 subsample=0.9, colsample_bytree=0.4,
                                 objective='binary:logistic', seed=1)

    idex1 = [k for k in range(0, (X.shape[0] * 2) // 3)]  # indices for train
    idex2 = [k for k in range((X.shape[0] * 2) // 3, X.shape[0])]  # indices for validation
    kfolder = [[idex1, idex2]]  # a single 66/33 train/validation split

    # arrays to save predictions for validation and test for meta modelling (stacking)
    train_stacker = [0.0 for k in range(0, len(idex2))]
    test_stacker = [0.0 for k in range(0, (X_test.shape[0]))]

    # accumulator for the CV metric
    mean_kapa = 0.0
    #X,y=shuffle(X,y, random_state=SEED)  # shuffle since the data is ordered by time
    i = 0  # iterator counter
    if Usecv:
        print("starting cross validation")
        for train_index, test_index in kfolder:
            # training and validation sets
            X_train, X_cv = X[train_index], X[test_index]
            y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
            print(" train size: %d. test size: %d, cols: %d " %
                  ((X_train.shape[0]), (X_cv.shape[0]), (X_train.shape[1])))

            # use the xgboost bagger
            preds = bagged_set(X_train, y_train, model, SEED, 5, X_cv,
                               update_seed=True)

            # compute AUC for this CV fold
            #scalepreds(preds)
            kapa = roc_auc_score(y_cv, preds)
            print("size train: %d size cv: %d AUC (fold %d/%d): %f" %
                  ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa))

            mean_kapa += kapa
            # save the results
            no = 0
            for real_index in test_index:
                train_stacker[no] = (preds[no])
                no += 1
            i += 1
        if Usecv:
            # print the array of validation predictions for stacking later on inside the 'meta_folder'
            print(" Average AUC: %f" % (mean_kapa))
            print(" printing train datasets ")
            printfilcsve(np.array(train_stacker),
                         metafolder + outset + "train.csv")

    preds = bagged_set(X, y, model, SEED, 5, X_test, update_seed=True)

    for pr in range(0, len(preds)):
        test_stacker[pr] = (preds[pr])
    # print prediction as numpy array for stacking later on
    preds = np.array(preds)
    printfilcsve(np.array(test_stacker), metafolder + outset + "test.csv")

    # create submission file
    print("Write results...")
    output_file = "submission_" + outset + str(mean_kapa) + ".csv"
    print("Writing submission to %s" % output_file)
    f = open(config.output_loc + output_file, "w")
    f.write("id,probability\n")  # the header
    for g in range(0, len(preds)):
        pr = preds[g]
        f.write("%d,%f\n" % (ids[g], pr))
    f.close()
    print("Done.")
import pandas as pd
import numpy as np
import libavito as a
import feather as f
import time

cache_loc = a.get_config().cache_loc

start = time.time()
print('Transforming training data ... ', end='', flush=True)
df = f.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
df.replace([np.nan, None], -1, inplace=True)
df.replace([np.inf, -np.inf], 9999.99, inplace=True)
f.write_dataframe(df, cache_loc + 'final_featureSet_train.fthr')
del df
a.print_elapsed(start)

start = time.time()
print('Transforming testing data ... ', end='', flush=True)
df = f.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
df.replace([np.nan, None], -1, inplace=True)
df.replace([np.inf, -np.inf], 9999.99, inplace=True)
f.write_dataframe(df, cache_loc + 'final_featureSet_test.fthr')
a.print_elapsed(start)
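
The script above overwrites the feather files in place. A quick sanity check, not part of the original script, can confirm that no NaN or infinite values survive the replacement (assuming the feature columns are numeric):

check = f.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
vals = check.select_dtypes(include=[np.number]).values
assert np.isfinite(vals).all(), "NaN or inf values remain after cleaning"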