import os

import numpy as np
import feather
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import libavito

# Helper functions bagged_set and printfilcsve are defined elsewhere in the original script.


def main():
    Use_scale = True
    Usecv = True   # True will split the training data 66-33 and do CV
    SEED = 15

    config = libavito.get_config()
    cache_loc = config.cache_loc
    nthreads = config.nthreads
    threads = nthreads  # number of workers for parallelism

    ######### Load files ############
    print("Loading input data")
    train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
    y = train['isDuplicate'].values
    X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values
    del train
    print(X.shape)
    test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
    ids = test['id'].values
    X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values
    del test
    print(X_test.shape)

    metafolder = cache_loc + "meta_folder/"  # folder used to store the meta (held-out and test) predictions

    # remove negatives
    X[X < 0] = 0
    X_test[X_test < 0] = 0
    # transform the data with log1p
    X = np.log1p(X)
    X_test = np.log1p(X_test)

    # create the meta folder to drop predictions for train and test
    if not os.path.exists(metafolder):  # if it does not exist, we create it
        os.makedirs(metafolder)

    outset = "marios_nnnew_v3"  # prefix of all output files for this model

    # 66-33 split: first two thirds for training, last third for validation
    idex1 = [k for k in range(0, (X.shape[0] * 2) // 3)]
    idex2 = [k for k in range((X.shape[0] * 2) // 3, X.shape[0])]
    kfolder = [[idex1, idex2]]
    # kfolder = StratifiedKFold(y, n_folds=number_of_folds, shuffle=True, random_state=SEED)  # alternative: stratified folds

    # arrays to save predictions for validation and test for meta modelling (stacking)
    train_stacker = [0.0 for k in range(0, len(idex2))]
    test_stacker = [0.0 for k in range(0, X_test.shape[0])]

    mean_kapa = 0.0
    # X, y = shuffle(X, y, random_state=SEED)  # data is ordered by time, so no shuffling
    i = 0  # iterator counter
    if Usecv:
        print("starting cross validation")
        for train_index, test_index in kfolder:
            # create training and validation sets
            X_train, X_cv = X[train_index], X[test_index]
            y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
            print(" train size: %d. test size: %d, cols: %d " % (X_train.shape[0], X_cv.shape[0], X_train.shape[1]))

            if Use_scale:
                stda = StandardScaler()
                X_train = stda.fit_transform(X_train)
                X_cv = stda.transform(X_cv)

            preds = bagged_set(X_train, y_train, SEED, 10, X_cv, nval=0.0, verbos=0)

            # compute AUC metric for this CV fold
            # scalepreds(preds)
            kapa = roc_auc_score(y_cv, preds)
            print("size train: %d size cv: %d AUC (fold %d/%d): %f" % (X_train.shape[0], X_cv.shape[0], i + 1, 1, kapa))
            mean_kapa += kapa

            # save the held-out predictions
            no = 0
            for real_index in test_index:
                train_stacker[no] = preds[no]
                no += 1
            i += 1

    if Usecv:
        print(" Average AUC: %f" % mean_kapa)
        print(" printing train datasets ")
        printfilcsve(np.array(train_stacker), metafolder + outset + "train.csv")

    if Use_scale:
        stda = StandardScaler()
        X = stda.fit_transform(X)
        X_test = stda.transform(X_test)

    preds = bagged_set(X, y, SEED, 10, X_test, nval=0.0, verbos=0)

    for pr in range(0, len(preds)):
        test_stacker[pr] = preds[pr]
    preds = np.array(preds)

    printfilcsve(np.array(test_stacker), metafolder + outset + "test.csv")

    # create the submission file
    print("Write results...")
    output_file = "submission_" + outset + str(mean_kapa) + ".csv"
    print("Writing submission to %s" % output_file)
    f = open(config.output_loc + output_file, "w")
    f.write("id,probability\n")  # the header
    for g in range(0, len(preds)):
        pr = preds[g]
        f.write("%d,%f\n" % (ids[g], pr))
    f.close()
    print("Done.")
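# Note: printfilcsve and bagged_set used above are defined elsewhere in the
# original repository and are not shown in this section. As a rough
# illustration only (an assumption, not the author's implementation), a
# minimal printfilcsve-style helper could simply write one prediction per
# line with no header, which is consistent with the meta stacker later
# reading these files back via a plain np.loadtxt call:
import numpy as np

def printfilcsve_sketch(preds, filename):
    # write a 1-D array of predictions, one value per line, no header,
    # so that np.loadtxt(filename) recovers the same array
    np.savetxt(filename, np.asarray(preds, dtype=float), fmt='%.9f')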
import numpy as np
import feather
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import libavito

# Helper functions bagged_set, printfilcsve, printfile and load_datas are defined elsewhere in the original script.


def main():
    config = libavito.get_config()
    nthreads = config.nthreads
    cache_loc = config.cache_loc
    output_loc = config.output_loc
    load_data = True
    SEED = 15
    Usecv = True

    # folder that keeps the held-out and test predictions, to be used later for meta modelling
    meta_folder = cache_loc + "meta_folder/"

    # list of base models. All train held-out predictions end with 'train.csv',
    # all test set predictions end with 'test.csv'.
    meta = ['marios_xg_v1', 'marios_nn_v1', 'marios_nnnew_v2', 'marios_sgd_v2',
            'marios_logit_v2', 'marios_ridge_v2', 'marios_xgson_v2', 'marios_xgrank_v2',
            'marios_xgson_v3', 'marios_nnnew_v3', 'marios_xgrank_v3', 'marios_xgreg_v3',
            'marios_nnnew_v4', 'marios_xgson_v4', 'marios_xgsonv2_v5']

    bags = 5  # helps avoid overfitting: instead of 1 model we run 5 with different seeds and different shuffling

    ######### Load files (...or not!) ############
    # y = np.loadtxt(meta_folder + "meta_pairs_and_labels.csv", delimiter=',', usecols=[2], skiprows=1)
    # ids = np.loadtxt("Random_submission.csv", delimiter=',', usecols=[0], skiprows=1)
    print("Loading input data")
    train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
    y = train['isDuplicate'].values
    del train
    test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
    ids = test['id'].values
    del test

    # the stacked train features form a dataset provided by Henk and Mathes that contains
    # a couple of FTRL models, non-alphanumeric features, sentiment scores and some additional models
    if load_data:
        Xmetatrain = None
        Xmetatest = None
        for modelname in meta:
            # load the held-out (train) and test set predictions of this base model
            mini_xtrain = np.loadtxt(meta_folder + modelname + 'train.csv')
            mini_xtest = np.loadtxt(meta_folder + modelname + 'test.csv')
            # means of the held-out and test predictions, for reconciliation purposes
            mean_train = np.mean(mini_xtrain)
            mean_test = np.mean(mini_xtest)
            # print the AUC and the means as a sanity check: if, e.g., the mean of the train
            # predictions is 1232314.34 and the test mean is 0.7, something is wrong
            print("model %s auc %f mean train/test %f/%f " % (modelname, roc_auc_score(np.array(y), mini_xtrain), mean_train, mean_test))
            if Xmetatrain is None:
                Xmetatrain = mini_xtrain
                Xmetatest = mini_xtest
            else:
                Xmetatrain = np.column_stack((Xmetatrain, mini_xtrain))
                Xmetatest = np.column_stack((Xmetatest, mini_xtest))
        # combine the stacked features
        X = Xmetatrain
        X_test = Xmetatest
        # persist the meta feature matrices as pickles
        printfile(X, meta_folder + "xmetahome.pkl")
        printfile(X_test, meta_folder + "xtmetahome.pkl")
        X = load_datas(meta_folder + "xmetahome.pkl")
        print("rows %d columns %d " % (X.shape[0], X.shape[1]))
        # X_test = load_datas("onegramtest.pkl")
        # print("rows %d columns %d " % (X_test.shape[0], X_test.shape[1]))
    else:
        X = load_datas(meta_folder + "xmetahome.pkl")
        print("rows %d columns %d " % (X.shape[0], X.shape[1]))
        X_test = load_datas(meta_folder + "xtmetahome.pkl")
        print("rows %d columns %d " % (X_test.shape[0], X_test.shape[1]))

    outset = "marios_rf_meta_v1"  # name of the model (quite catchy, admittedly)

    print("len of target=%d" % len(y))  # print the length of the target variable

    # model we are going to use (an ExtraTreesClassifier could be swapped in here)
    model = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=9,
                                   min_samples_leaf=2, max_features=8, n_jobs=nthreads,
                                   random_state=1, verbose=1)
    # model = LogisticRegression(C=0.01)

    # 66-33 split: indices for train and for the held-out (validation) part
    idex1 = [k for k in range(0, (X.shape[0] * 2) // 3)]
    idex2 = [k for k in range((X.shape[0] * 2) // 3, X.shape[0])]
    kfolder = [[idex1, idex2]]  # an object to put the indices in

    # arrays to save predictions for validation and test for meta modelling (stacking)
    train_stacker = [0.0 for k in range(0, len(idex2))]
    test_stacker = [0.0 for k in range(0, X_test.shape[0])]

    mean_kapa = 0.0
    # X, y = shuffle(X, y, random_state=SEED)  # data is ordered by time, so no shuffling
    i = 0  # iterator counter
    if Usecv:
        print("starting cross validation")
        for train_index, test_index in kfolder:
            # create training and validation sets
            X_train, X_cv = X[train_index], X[test_index]
            y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
            print(" train size: %d. test size: %d, cols: %d " % (X_train.shape[0], X_cv.shape[0], X_train.shape[1]))

            # use the bagger
            preds = bagged_set(X_train, y_train, model, SEED, bags, X_cv, update_seed=True)

            # compute AUC metric for this CV fold
            # scalepreds(preds)
            kapa = roc_auc_score(y_cv, preds)
            print("size train: %d size cv: %d AUC (fold %d/%d): %f" % (X_train.shape[0], X_cv.shape[0], i + 1, 1, kapa))
            mean_kapa += kapa

            # save the results
            no = 0
            for real_index in test_index:
                train_stacker[no] = preds[no]
                no += 1
            i += 1

    if Usecv:
        # print the array of validation predictions, for stacking later on, inside the 'meta_folder'
        print(" Average AUC: %f" % mean_kapa)
        print(" printing train datasets ")
        printfilcsve(np.array(train_stacker), meta_folder + outset + "train.csv")

    preds = bagged_set(X, y, model, SEED, bags, X_test, update_seed=True)

    for pr in range(0, len(preds)):
        test_stacker[pr] = preds[pr]

    # print the test predictions as a numpy array, for stacking later on
    preds = np.array(preds)
    printfilcsve(np.array(test_stacker), meta_folder + outset + "test.csv")

    # create the submission file
    print("Write results...")
    output_file = "submission_" + outset + str(mean_kapa) + ".csv"
    print("Writing submission to %s" % output_file)
    f = open(output_loc + output_file, "w")
    f.write("id,probability\n")  # the header
    for g in range(0, len(preds)):
        pr = preds[g]
        f.write("%d,%f\n" % (ids[g], pr))
    f.close()
    print("Done.")
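# Note: the bagged_set helper used above is defined elsewhere in the original
# repository. As a rough illustration only (an assumption, not the author's
# implementation, which may also reshuffle the training data on each bag), a
# scikit-learn-style bagging routine with the same signature could look like:
import numpy as np
from sklearn.base import clone

def bagged_set_sketch(X_t, y_t, model, seed, estimators, xt, update_seed=True):
    # fit `estimators` clones of the model, each with a different random seed,
    # and average their predicted probabilities for the positive class on xt
    baggedpred = np.zeros(xt.shape[0])
    for n in range(estimators):
        bagged_model = clone(model)
        if update_seed and 'random_state' in bagged_model.get_params():
            bagged_model.set_params(random_state=seed + n)
        bagged_model.fit(X_t, y_t)
        baggedpred += bagged_model.predict_proba(xt)[:, 1]
    return baggedpred / float(estimators)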
import os

import numpy as np
import feather
from sklearn.metrics import roc_auc_score

import libavito

# xg is the custom XGBoost classifier wrapper imported in the original script;
# bagged_set and printfilcsve are helper functions defined elsewhere in it.


def main():
    config = libavito.get_config()
    cache_loc = config.cache_loc
    nthreads = config.nthreads
    Usecv = True   # True will split the training data 66-33 and do CV
    SEED = 15
    threads = nthreads  # number of workers for parallelism

    ######### Load files ############
    print("Loading input data")
    train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
    y = train['isDuplicate'].values
    X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values
    del train
    print(X.shape)
    test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
    ids = test['id'].values
    X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values
    del test
    print(X_test.shape)

    metafolder = cache_loc + "meta_folder/"  # folder used to store the meta predictions
    if not os.path.exists(metafolder):  # if it does not exist, we create it
        os.makedirs(metafolder)

    outset = "marios_xgson_v2"  # prefix of all output files for this model

    # model to use
    model = xg.XGBoostClassifier(num_round=1000, nthread=threads, eta=0.02, gamma=7.0,
                                 max_depth=20, min_child_weight=20, subsample=0.9,
                                 colsample_bytree=0.4, objective='binary:logistic', seed=1)

    # 66-33 split: indices for train and for the held-out (validation) part
    idex1 = [k for k in range(0, (X.shape[0] * 2) // 3)]
    idex2 = [k for k in range((X.shape[0] * 2) // 3, X.shape[0])]
    kfolder = [[idex1, idex2]]  # an object to put the indices in

    # arrays to save predictions for validation and test for meta modelling (stacking)
    train_stacker = [0.0 for k in range(0, X.shape[0])]
    test_stacker = [0.0 for k in range(0, X_test.shape[0])]

    mean_kapa = 0.0
    # X, y = shuffle(X, y, random_state=SEED)  # data is ordered by time, so no shuffling
    i = 0  # iterator counter
    if Usecv:
        print("starting cross validation")
        for train_index, test_index in kfolder:
            # create training and validation sets
            X_train, X_cv = X[train_index], X[test_index]
            y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
            print(" train size: %d. test size: %d, cols: %d " % (X_train.shape[0], X_cv.shape[0], X_train.shape[1]))

            # use the xgboost bagger
            preds = bagged_set(X_train, y_train, model, SEED, 5, X_cv, update_seed=True)

            # compute AUC metric for this CV fold
            # scalepreds(preds)
            kapa = roc_auc_score(y_cv, preds)
            print("size train: %d size cv: %d AUC (fold %d/%d): %f" % (X_train.shape[0], X_cv.shape[0], i + 1, 1, kapa))
            mean_kapa += kapa

            # save the results
            no = 0
            for real_index in test_index:
                train_stacker[no] = preds[no]
                no += 1
            i += 1

    if Usecv:
        # print the array of validation predictions, for stacking later on, inside the 'meta_folder'
        print(" Average AUC: %f" % mean_kapa)
        print(" printing train datasets ")
        printfilcsve(np.array(train_stacker), metafolder + outset + "train.csv")

    preds = bagged_set(X, y, model, SEED, 5, X_test, update_seed=True)

    for pr in range(0, len(preds)):
        test_stacker[pr] = preds[pr]

    # print the test predictions as a numpy array, for stacking later on
    preds = np.array(preds)
    printfilcsve(np.array(test_stacker), metafolder + outset + "test.csv")

    # create the submission file
    print("Write results...")
    output_file = "submission_" + outset + str(mean_kapa) + ".csv"
    print("Writing submission to %s" % output_file)
    f = open(config.output_loc + output_file, "w")
    f.write("id,probability\n")  # the header
    for g in range(0, len(preds)):
        pr = preds[g]
        f.write("%d,%f\n" % (ids[g], pr))
    f.close()
    print("Done.")
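# Note: xg.XGBoostClassifier above is a custom wrapper imported elsewhere in
# the original script. For orientation only, a roughly equivalent setup with
# the standard xgboost scikit-learn API might look like the following; the
# parameter mapping (num_round -> n_estimators, eta -> learning_rate) is an
# assumption about the wrapper, not taken from the source.
from xgboost import XGBClassifier

alt_model = XGBClassifier(n_estimators=1000,         # num_round
                          learning_rate=0.02,        # eta
                          gamma=7.0,
                          max_depth=20,
                          min_child_weight=20,
                          subsample=0.9,
                          colsample_bytree=0.4,
                          objective='binary:logistic',
                          n_jobs=-1,                 # nthread
                          random_state=1)            # seed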
import pandas as pd
import numpy as np
import libavito as a
import feather as f
import time

cache_loc = a.get_config().cache_loc

start = time.time()
print('Transforming training data ... ', end='', flush=True)
df = f.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
df.replace([np.nan, None], -1, inplace=True)
df.replace([np.inf, -np.inf], 9999.99, inplace=True)
f.write_dataframe(df, cache_loc + 'final_featureSet_train.fthr')
del df
a.print_elapsed(start)

start = time.time()
print('Transforming testing data ... ', end='', flush=True)
df = f.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
df.replace([np.nan, None], -1, inplace=True)
df.replace([np.inf, -np.inf], 9999.99, inplace=True)
f.write_dataframe(df, cache_loc + 'final_featureSet_test.fthr')
a.print_elapsed(start)
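# Optional sanity check (an addition for illustration, not part of the
# original pipeline): verify that the rewritten feather files no longer
# contain NaN or infinite values after the replacement step above.
import numpy as np
import feather as f
import libavito as a

cache_loc = a.get_config().cache_loc
df = f.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
numeric = df.select_dtypes(include=[np.number])
assert not numeric.isnull().values.any(), "NaNs remain in the training features"
assert np.isfinite(numeric.values).all(), "Infinite values remain in the training features"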