Ejemplo n.º 1
0
def run_stack(SEED):
    """Select features by repeatedly dropping the weakest Lasso coefficients.

    Each round fits ``Lasso`` with 5-fold cross-validation on the current
    feature set, averages the per-fold coefficients, and removes every column
    whose mean absolute coefficient is at or below the sixth-smallest one
    (``var11`` and ``id`` are never removed). The feature set that achieved
    the highest mean normalized weighted Gini is then used to write reduced
    train/test CSVs under ``../models/``.

    Args:
        SEED: Unused; kept so existing callers keep working.

    Returns:
        None. Results are written to disk and printed.
    """
    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../preprocessdata/pre_departition_train.csv')
    # Weights for the weighted Gini metric; a plain array so that fold
    # indices can slice it positionally.
    trainBaseWeight = np.asarray(trainBase['var11'])

    columns = trainBase.columns.values.tolist()
    columnsHighScore = list(columns)

    print(trainBase.columns)

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    gc.collect()

    avgLast = -1
    NumFolds = 5
    # The drop threshold is sorted |coef| at this position, so at least
    # thresholdRank + 1 coefficients are needed to run another round.
    thresholdRank = 5

    clf = Lasso(alpha=0.00010)  # found with tune_lasso.py

    print("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)

    gc.collect()

    featuresRemaining = []
    avgScore = []

    while True:
        print(clf)
        avg = 0

        # Snapshot of the feature set being scored this round; `columns`
        # itself is mutated further down when features are dropped.
        evaluatedColumns = list(columns)
        coef_dataset = np.zeros((len(columns), NumFolds))

        Folds = cross_validation.KFold(lenTrainBase,
                                       n_folds=NumFolds,
                                       indices=True)

        for foldCount, (train_index, test_index) in enumerate(Folds):

            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = targetBase[train_index].reshape(-1, 1)
            train = trainBase[train_index]

            targetTest = targetBase[test_index].reshape(-1, 1)
            trainTest = trainBase[test_index]
            weightTest = trainBaseWeight[test_index]

            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            # Score once and reuse it for both the log line and the mean.
            foldScore = score.normalized_weighted_gini(targetTest.ravel(),
                                                       predicted.ravel(),
                                                       weightTest.ravel())
            print(str(foldScore))
            avg += foldScore / NumFolds

            coef_dataset[:, foldCount] = clf.coef_

        # Credit the score to the feature set that actually produced it,
        # before any column is dropped and before a possible early exit.
        if avg > avgLast:
            print("Saving Copy " + str(avgLast) + " " + str(avg))
            avgLast = avg
            columnsHighScore = evaluatedColumns

        coefs = coef_dataset.mean(1)
        # Coefficients closest to zero are removed first.
        sorted_coefs = sorted(map(abs, coefs))
        print(coefs)
        print("len coefs: " + str(len(sorted_coefs)))
        if len(sorted_coefs) <= thresholdRank:
            break

        threshold = sorted_coefs[thresholdRank]

        print(str(len(columns)))
        print(trainBase.shape)

        toDrop = []

        # var11 and id must survive: they are read from the reduced files
        # later. Walk right-to-left so deleting from `columns` never shifts
        # an index that is still to be visited.
        for index in range(len(coefs) - 1, -1, -1):
            name = columns[index]
            if (abs(coefs[index]) <= threshold and name != "var11"
                    and name != "id"):
                print("Drop: " + str(index) + " " + name + " " +
                      str(coefs[index]))
                toDrop.append(index)
                del columns[index]

        print("start drop")
        trainBase = np.delete(trainBase, toDrop, axis=1)
        print("End drop")

        print("Threshold: " + str(threshold))
        print("------------------------Average: " + str(avg))
        print(columnsHighScore)
        print(str(len(columns)))
        print(trainBase.shape)

        # Feature count is taken after this round's drop.
        featuresRemaining.append(len(columns))
        avgScore.append(avg)

    gc.collect()
    trainBase = pd.read_csv('../preprocessdata/pre_departition_train.csv')
    trainBase = trainBase.loc[:, columnsHighScore]
    trainBase.to_csv("../models/" + str(clf)[:5] + "_train.csv", index=False)

    gc.collect()
    test = pd.read_csv('../preprocessdata/pre_departition_test.csv')
    test = test.loc[:, columnsHighScore]
    test.to_csv("../models/" + str(clf)[:5] + "_test.csv", index=False)

    print(columnsHighScore)
    print(featuresRemaining)
    print(avgScore)
Ejemplo n.º 2
0
def run_stack(SEED):
    """Blend saved stack predictions with a cross-validated Lasso meta-model.

    Every file in ``../predictions`` whose name starts with ``Stack`` and
    whose third ``_``-separated field parses to a float above
    ``lossThreshold`` becomes one feature column. Out-of-fold and test
    predictions come from its ``Target_``-prefixed and plain files
    respectively. The columns are standardised on train and test together,
    then each model in ``clfs`` is fitted with 5-fold CV. Out-of-fold train
    predictions and fold-averaged test predictions are written to
    ``../submission/`` and a line is appended to ``../log/RunLogBlend.csv``.

    Args:
        SEED: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(dataset_blend_train, dataset_blend_test)``: arrays with one
        row per train/test sample and one column per model in ``clfs``.

    Raises:
        ValueError: If no prediction file passes the threshold.
    """
    model = "Lasso"
    lossThreshold = 0.38

    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBaseOrig = pd.read_csv('../models/' + model + '_train.csv')
    # Weights for the weighted Gini metric; a plain array so that fold
    # indices can slice it positionally.
    trainBaseWeight = np.asarray(trainBaseOrig['var11'])
    testOrig = pd.read_csv('../models/' + model + '_test.csv')

    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    trainBaseID = trainBaseOrig['id']
    testID = testOrig['id']

    NumFolds = 5

    # os.listdir returns entries in arbitrary order; sorting keeps the column
    # order of the blend matrix (and the dumps/logs) reproducible across runs.
    stackFiles = []
    for filename in sorted(os.listdir("../predictions")):
        parts = filename.split("_")
        if filename[0:5] == "Stack" and float(parts[2]) > lossThreshold:
            stackFiles.append(filename)

    if not stackFiles:
        raise ValueError("No Stack prediction files above threshold " +
                         str(lossThreshold) + " found in ../predictions")

    trainBase = np.zeros((len(trainBaseOrig), len(stackFiles)))
    test = np.zeros((len(testOrig), len(stackFiles)))

    print("Loading Data")
    for fileNum, stackFile in enumerate(stackFiles):
        print(stackFile)
        # First line is skipped because it is a header. Field 1 of each row
        # is taken as the prediction (assumes field 0 is the id - TODO
        # confirm against the code that writes these files).
        trn = csv_io.read_data("../predictions/Target_" + stackFile,
                               split=",",
                               skipFirstLine=True)
        for row, datum in enumerate(trn):
            trainBase[row, fileNum] = datum[1]

        tst = csv_io.read_data("../predictions/" + stackFile,
                               split=",",
                               skipFirstLine=True)
        for row, datum in enumerate(tst):
            test[row, fileNum] = datum[1]

    np.savetxt('temp/dataset_blend_train.txt', trainBase)
    np.savetxt('temp/dataset_blend_test.txt', test)
    print("Num file processed: " + " " + str(len(stackFiles)) + " " +
          "Threshold: " + str(lossThreshold))

    print("Starting Scale")

    # The scaler is deliberately fitted on train and test stacked together.
    scl = StandardScaler(copy=True, with_mean=True, with_std=True)
    scl.fit(np.vstack((trainBase, test)))

    trainBase = scl.transform(trainBase)
    test = scl.transform(test)

    print("Starting Blend")

    clfs = [
        Lasso(alpha=0.000016681005372000593),
    ]

    print("Data size: " + str(len(trainBase)) + " " + str(len(test)))
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    print("Begin Training")

    lenTrainBase = len(trainBase)
    lenTest = len(test)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0

        # One column of test predictions per fold; averaged after the loop.
        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        Folds = cross_validation.KFold(lenTrainBase,
                                       n_folds=NumFolds,
                                       indices=True)

        for foldCount, (train_index, test_index) in enumerate(Folds):

            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = targetBase[train_index].reshape(-1, 1)
            train = trainBase[train_index]

            targetTest = targetBase[test_index].reshape(-1, 1)
            trainTest = trainBase[test_index]
            weightTest = trainBaseWeight[test_index]

            clf.fit(train, target)
            predicted = clf.predict(trainTest)
            print(predicted)
            # Assigned as-is; the original notes that Ridge would need
            # predicted[:, 0] here instead.
            dataset_blend_train[test_index, ExecutionIndex] = predicted

            # Score once and reuse it for both the log line and the mean.
            foldScore = score.normalized_weighted_gini(targetTest.ravel(),
                                                       predicted.ravel(),
                                                       weightTest.ravel())
            print(str(foldScore))
            avg += foldScore / NumFolds

            dataset_blend_test_set[:, foldCount] = clf.predict(test)

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        fileSuffix = (now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" +
                      str(clf)[:12] + ".csv")

        submission = pd.DataFrame(
            {'id': testID, 'target': dataset_blend_test[:, ExecutionIndex]},
            columns=['id', 'target'])
        submission.to_csv("../submission/Blend_" + fileSuffix, index=False)

        submission = pd.DataFrame(
            {'id': trainBaseID,
             'target': dataset_blend_train[:, ExecutionIndex]},
            columns=['id', 'target'])
        submission.to_csv("../submission/Target_Blend_" + fileSuffix,
                          index=False)

        csv_io.write_delimited_file("../log/RunLogBlend.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(avg),
            str(clf), "Folds:",
            str(NumFolds), "Model", "Blend", "Stacks: ", stackFiles
        ],
                                    filemode="a",
                                    delimiter=",")

        print("------------------------Average: " + str(avg))

    return dataset_blend_train, dataset_blend_test
Ejemplo n.º 3
0
def run_stack(SEED):
    """Produce first-level stack predictions with 5-fold cross-validation.

    Each model in ``clfs`` is fitted per fold on the reduced feature files
    under ``../models/``. Out-of-fold train predictions and fold-averaged
    test predictions are written to ``../predictions/`` as
    ``Target_Stack_<timestamp>_<score>_<model>.csv`` and
    ``Stack_<timestamp>_<score>_<model>.csv``, and a line is appended to
    ``../log/RunLog.csv``.

    The score in the file names is the mean normalized weighted Gini of the
    raw out-of-fold predictions, i.e. of the values that are saved. The
    score with negative predictions clipped to zero is printed for
    comparison but not added to the mean. The original added both scores,
    which roughly doubled the reported average.

    Args:
        SEED: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(dataset_blend_train, dataset_blend_test)``: arrays with one
        row per train/test sample and one column per model in ``clfs``.
    """
    model = "Lasso"

    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../models/' + model + '_train.csv')
    # Weights for the weighted Gini metric; a plain array so that fold
    # indices can slice it positionally.
    trainBaseWeight = np.asarray(trainBase['var11'])
    test = pd.read_csv('../models/' + model + '_test.csv')

    print(trainBase.columns)
    trainBaseID = trainBase['id']
    testID = test['id']

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    test = np.nan_to_num(np.array(test))

    NumFolds = 5

    clfs = [
        LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
    ]

    print("Data size: " + str(len(trainBase)) + " " + str(len(test)))
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    print("Begin Training")

    lenTrainBase = len(trainBase)
    lenTest = len(test)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0

        # One column of test predictions per fold; averaged after the loop.
        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        Folds = cross_validation.KFold(lenTrainBase,
                                       n_folds=NumFolds,
                                       indices=True)

        for foldCount, (train_index, test_index) in enumerate(Folds):

            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = targetBase[train_index].reshape(-1, 1)
            train = trainBase[train_index]

            targetTest = targetBase[test_index].reshape(-1, 1)
            trainTest = trainBase[test_index]
            weightTest = trainBaseWeight[test_index]

            clf.fit(train, target)
            predicted = clf.predict(trainTest)
            print(test_index)
            # The target is fitted as a column vector, so predictions are
            # indexed as 2-D here; take the single column.
            dataset_blend_train[test_index, ExecutionIndex] = predicted[:, 0]

            # Score of the predictions that are actually stored above; this
            # is the only value that feeds the reported average.
            rawScore = score.normalized_weighted_gini(targetTest.ravel(),
                                                      predicted.ravel(),
                                                      weightTest.ravel())
            print(str(rawScore))
            avg += rawScore / NumFolds

            # Diagnostic only: same metric with negative predictions clipped.
            clipped = predicted.copy()
            clipped[clipped[:, 0] < 0.0] = 0.0
            print(
                str(
                    score.normalized_weighted_gini(targetTest.ravel(),
                                                   clipped.ravel(),
                                                   weightTest.ravel())))

            dataset_blend_test_set[:, foldCount] = clf.predict(test)[:, 0]

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        fileSuffix = (now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" +
                      str(clf)[:12] + ".csv")

        submission = pd.DataFrame(
            {'id': testID, 'target': dataset_blend_test[:, ExecutionIndex]},
            columns=['id', 'target'])
        submission.to_csv("../predictions/Stack_" + fileSuffix, index=False)

        submission = pd.DataFrame(
            {'id': trainBaseID,
             'target': dataset_blend_train[:, ExecutionIndex]},
            columns=['id', 'target'])
        submission.to_csv("../predictions/Target_Stack_" + fileSuffix,
                          index=False)

        csv_io.write_delimited_file("../log/RunLog.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(avg),
            str(clf), "Folds:",
            str(NumFolds), "Model", "", "", ""
        ],
                                    filemode="a",
                                    delimiter=",")

        print("------------------------Average: " + str(avg))

    return dataset_blend_train, dataset_blend_test
Ejemplo n.º 4
0
def run_stack(SEED):
    """Run backward feature elimination guided by cross-validated Lasso.

    Loop: fit ``Lasso`` on the current columns with 5-fold CV, average the
    fold coefficients, then delete all columns whose mean absolute
    coefficient does not exceed the sixth-smallest value (``var11`` and
    ``id`` are protected). The column set with the best mean normalized
    weighted Gini is used to write the reduced ``../models/Lasso_train.csv``
    and ``../models/Lasso_test.csv``.

    Args:
        SEED: Unused; kept so existing callers keep working.

    Returns:
        None. Output goes to disk and stdout.
    """
    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../preprocessdata/pre_departition_train.csv')
    # Metric weights as a plain array, so fold indices slice it by position.
    trainBaseWeight = np.asarray(trainBase['var11'])

    columns = trainBase.columns.values.tolist()
    columnsHighScore = list(columns)

    print(trainBase.columns)

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    gc.collect()

    avgLast = -1
    NumFolds = 5
    # Position of the cut-off inside the ascending |coef| list; a round
    # needs at least thresholdRank + 1 coefficients to index it safely.
    thresholdRank = 5

    clf = Lasso(alpha=0.00010)  # found with tune_lasso.py

    print("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)

    gc.collect()

    featuresRemaining = []
    avgScore = []

    while True:
        print(clf)
        avg = 0

        # Remember which columns this round is scored on; `columns` is
        # edited in place once the drop starts.
        evaluatedColumns = list(columns)
        coef_dataset = np.zeros((len(columns), NumFolds))

        Folds = cross_validation.KFold(lenTrainBase,
                                       n_folds=NumFolds,
                                       indices=True)

        for foldCount, (train_index, test_index) in enumerate(Folds):

            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = targetBase[train_index].reshape(-1, 1)
            train = trainBase[train_index]

            targetTest = targetBase[test_index].reshape(-1, 1)
            trainTest = trainBase[test_index]
            weightTest = trainBaseWeight[test_index]

            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            # The metric is evaluated once per fold and reused.
            foldScore = score.normalized_weighted_gini(targetTest.ravel(),
                                                       predicted.ravel(),
                                                       weightTest.ravel())
            print(str(foldScore))
            avg += foldScore / NumFolds

            coef_dataset[:, foldCount] = clf.coef_

        # Pair the score with the columns that earned it. This runs before
        # the drop and before the exit test so the last round counts too.
        if avg > avgLast:
            print("Saving Copy " + str(avgLast) + " " + str(avg))
            avgLast = avg
            columnsHighScore = evaluatedColumns

        coefs = coef_dataset.mean(1)
        # Ascending magnitudes: the weakest coefficients go first.
        sorted_coefs = sorted(map(abs, coefs))
        print(coefs)
        print("len coefs: " + str(len(sorted_coefs)))
        if len(sorted_coefs) <= thresholdRank:
            break

        threshold = sorted_coefs[thresholdRank]

        print(str(len(columns)))
        print(trainBase.shape)

        toDrop = []

        # var11 and id are needed from the reduced files later, so they are
        # never dropped. Iterating from the right keeps the remaining
        # indices valid while entries are deleted from `columns`.
        for index in range(len(coefs) - 1, -1, -1):
            name = columns[index]
            if (abs(coefs[index]) <= threshold and name != "var11"
                    and name != "id"):
                print("Drop: " + str(index) + " " + name + " " +
                      str(coefs[index]))
                toDrop.append(index)
                del columns[index]

        print("start drop")
        trainBase = np.delete(trainBase, toDrop, axis=1)
        print("End drop")

        print("Threshold: " + str(threshold))
        print("------------------------Average: " + str(avg))
        print(columnsHighScore)
        print(str(len(columns)))
        print(trainBase.shape)

        # Column count here is the count left after this round's drop.
        featuresRemaining.append(len(columns))
        avgScore.append(avg)

    gc.collect()
    trainBase = pd.read_csv('../preprocessdata/pre_departition_train.csv')
    trainBase = trainBase.loc[:, columnsHighScore]
    trainBase.to_csv("../models/" + str(clf)[:5] + "_train.csv", index=False)

    gc.collect()
    test = pd.read_csv('../preprocessdata/pre_departition_test.csv')
    test = test.loc[:, columnsHighScore]
    test.to_csv("../models/" + str(clf)[:5] + "_test.csv", index=False)

    print(columnsHighScore)
    print(featuresRemaining)
    print(avgScore)
Ejemplo n.º 5
0
def run_stack(SEED):
    """Sweep ElasticNet ``alpha`` and report the best-scoring value.

    For 30 log-spaced alphas between 1e-8 and 10**-0.5, an ``ElasticNet`` is
    fitted on ``../models/Lasso_train.csv`` and scored with the normalized
    weighted Gini (weights from column ``var11``). The best score and the
    alpha that produced it are printed at the end.

    Only the first of the five folds is evaluated for each alpha; the
    early ``break`` is kept from the original (presumably to keep the sweep
    fast - confirm before removing). The reported average and the mean
    coefficients are taken over the folds actually run, not over
    ``NumFolds``.

    Args:
        SEED: Unused; kept so existing callers keep working.

    Returns:
        None. Results are printed.
    """
    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')

    trainBase = pd.read_csv('../models/Lasso_train.csv')
    # Weights for the weighted Gini metric; a plain array so that fold
    # indices can slice it positionally.
    trainBaseWeight = np.asarray(trainBase['var11'])

    columns = trainBase.columns

    print(trainBase.columns)

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    gc.collect()

    bestAvg = 0
    bestAlpha = 0
    NumFolds = 5

    print("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)

    gc.collect()

    # best alpha is 0.00069956421567126271
    for a in np.logspace(
            -8, -.5, 30):  # best values seem to be slightly greater than 0.

        clf = ElasticNet(alpha=a,
                         l1_ratio=1 / 10,
                         fit_intercept=True,
                         normalize=False,
                         precompute='auto',
                         max_iter=10000,
                         copy_X=True,
                         tol=1 / 10000,
                         warm_start=False,
                         positive=False)

        print(clf)

        coef_dataset = np.zeros((len(columns), NumFolds))
        foldScores = []

        Folds = cross_validation.KFold(lenTrainBase,
                                       n_folds=NumFolds,
                                       indices=True)

        for foldCount, (train_index, test_index) in enumerate(Folds):

            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = targetBase[train_index].reshape(-1, 1)
            train = trainBase[train_index]

            targetTest = targetBase[test_index].reshape(-1, 1)
            trainTest = trainBase[test_index]
            weightTest = trainBaseWeight[test_index]

            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            # Score once and reuse it for both the log line and the mean.
            foldScore = score.normalized_weighted_gini(targetTest.ravel(),
                                                       predicted.ravel(),
                                                       weightTest.ravel())
            print(str(foldScore))
            foldScores.append(foldScore)

            coef_dataset[:, foldCount] = clf.coef_

            # Single-fold evaluation per alpha (see docstring).
            break

        foldsRun = len(foldScores)
        # Normalise by the folds that ran so the figure is a real mean
        # rather than one fold's score divided by NumFolds.
        avg = sum(foldScores) / foldsRun if foldsRun else 0

        # Ignore the never-filled fold columns when averaging coefficients.
        coefs = coef_dataset[:, :foldsRun].mean(1)
        print("len coefs: " + str(len(coefs)))

        coefsAboveZero = [i for i in coefs if i > 0.0]
        print(str(len(coefsAboveZero)))

        print("------------------------Average: " + str(avg))

        if avg > bestAvg:
            bestAvg = avg
            # Previously commented out, which left the report stuck at 0.
            bestAlpha = a

    print("bestAvg: " + str(bestAvg))
    print("bestAlpha: " + str(bestAlpha))
Ejemplo n.º 6
0
def run_stack(SEED):
    """Sweep Lasso ``alpha`` with 5-fold CV and report the best value.

    For 30 log-spaced alphas between 1e-6 and 10**-0.5, a ``Lasso`` is
    fitted on ``../models/Lasso_train.csv`` and scored per fold with the
    normalized weighted Gini (weights from column ``var11``). The mean score,
    the mean coefficients and the count of strictly positive mean
    coefficients are printed for each alpha. The best mean score and its
    alpha are printed at the end.

    Args:
        SEED: Unused; kept so existing callers keep working.

    Returns:
        None. Results are printed.
    """
    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')

    trainBase = pd.read_csv('../models/Lasso_train.csv')
    # Weights for the weighted Gini metric; a plain array so that fold
    # indices can slice it positionally.
    trainBaseWeight = np.asarray(trainBase['var11'])

    columns = trainBase.columns

    print(trainBase.columns)

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    gc.collect()

    bestAvg = 0
    bestAlpha = 0
    NumFolds = 5

    print("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)

    gc.collect()

    # best alpha is 0.00040
    for a in np.logspace(
            -6, -.5, 30):  # best values seem to be slightly greater than 0.

        clf = Lasso(alpha=a)
        print(clf)
        avg = 0

        coef_dataset = np.zeros((len(columns), NumFolds))

        Folds = cross_validation.KFold(lenTrainBase,
                                       n_folds=NumFolds,
                                       indices=True)

        for foldCount, (train_index, test_index) in enumerate(Folds):

            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = targetBase[train_index].reshape(-1, 1)
            train = trainBase[train_index]

            targetTest = targetBase[test_index].reshape(-1, 1)
            trainTest = trainBase[test_index]
            weightTest = trainBaseWeight[test_index]

            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            # Score once and reuse it for both the log line and the mean.
            foldScore = score.normalized_weighted_gini(targetTest.ravel(),
                                                       predicted.ravel(),
                                                       weightTest.ravel())
            print(str(foldScore))
            avg += foldScore / NumFolds

            coef_dataset[:, foldCount] = clf.coef_

        coefs = coef_dataset.mean(1)
        print(coefs)
        print("len coefs: " + str(len(coefs)))

        # Counts strictly positive mean coefficients only; negative ones
        # are not included.
        coefsAboveZero = [i for i in coefs if i > 0.0]
        print(str(len(coefsAboveZero)))

        print("------------------------Average: " + str(avg))

        if avg > bestAvg:
            bestAvg = avg
            bestAlpha = a

    print("bestAvg: " + str(bestAvg))
    print("bestAlpha: " + str(bestAlpha))
Ejemplo n.º 7
0
def run_stack(SEED):
    """Blend stacked model predictions with a final regressor and write submissions.

    Every file in ``../predictions`` whose name starts with ``Stack`` and whose
    third ``_``-separated name part, read as a float, exceeds ``lossThreshold``
    becomes one feature column.  The train column is read from the matching
    ``Target_<file>`` and the test column from ``<file>``.  The feature matrix
    is standardised (scaler fitted on train and test together), then each
    estimator in ``clfs`` is evaluated with K-fold cross-validation.

    Side effects: writes ``temp/dataset_blend_train.txt`` and
    ``temp/dataset_blend_test.txt``, one ``Blend_*`` and one ``Target_Blend_*``
    CSV per estimator under ``../submission``, and appends a line to
    ``../log/RunLogBlend.csv``.

    Args:
        SEED: Not used by this function.

    Returns:
        Tuple ``(dataset_blend_train, dataset_blend_test)``.  The first holds
        the out-of-fold predictions for the training rows, the second the test
        predictions averaged over the folds; both have one column per
        estimator in ``clfs``.
    """
    model = "Lasso"
    lossThreshold = 0.38

    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBaseOrig = pd.read_csv('../models/' + model + '_train.csv')
    testOrig = pd.read_csv('../models/' + model + '_test.csv')

    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    # Plain array so that the fold indices select weights by position.
    weightBase = np.array(trainBaseOrig['var11'])

    trainBaseID = trainBaseOrig['id']
    testID = testOrig['id']

    NumFolds = 5

    # Keep only the stack files whose score field clears the threshold.
    stackFiles = []
    for filename in os.listdir("../predictions"):
        parts = filename.split("_")
        if filename.startswith("Stack") and float(parts[2]) > lossThreshold:
            stackFiles.append(filename)

    trainBase = np.zeros((len(trainBaseOrig), len(stackFiles)))
    test = np.zeros((len(testOrig), len(stackFiles)))

    print("Loading Data")
    for fileNum, stackFile in enumerate(stackFiles):
        print(stackFile)
        # Each file starts with a header line, hence skipFirstLine.
        trn = csv_io.read_data("../predictions/Target_" + stackFile, split=",", skipFirstLine=True)
        for row, datum in enumerate(trn):
            # Field 1 is the prediction (field 0 is presumably the id -- confirm).
            trainBase[row, fileNum] = datum[1]

        tst = csv_io.read_data("../predictions/" + stackFile, split=",", skipFirstLine=True)
        for row, datum in enumerate(tst):
            test[row, fileNum] = datum[1]

    np.savetxt('temp/dataset_blend_train.txt', trainBase)
    np.savetxt('temp/dataset_blend_test.txt', test)
    print("Num file processed: " + " " + str(len(stackFiles))  + " " +  "Threshold: " + str(lossThreshold))

    print("Starting Scale")

    # The scaler is fitted on train and test stacked together.
    allVals = np.vstack((trainBase, test))
    scl = StandardScaler(copy=True, with_mean=True, with_std=True)
    scl.fit(allVals)

    trainBase = scl.transform(trainBase)
    test = scl.transform(test)

    print("Starting Blend")

    clfs = [
        Lasso(alpha=0.000016681005372000593),
    ]

    print ("Data size: " + str(len(trainBase)) + " " + str(len(test)))
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    print("Begin Training")

    lenTrainBase = len(trainBase)
    lenTest = len(test)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0

        # One column of test predictions per fold; averaged after the loop.
        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)

        for foldCount, (train_index, test_index) in enumerate(Folds):

            print()
            print ("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            # Integer-array indexing replaces the per-row Python loops.
            train = trainBase[train_index]
            target = targetBase[train_index].reshape(-1, 1)

            trainTest = trainBase[test_index]
            targetTest = targetBase[test_index]
            weightTest = weightBase[test_index]

            clf.fit(train, target)
            predicted = clf.predict(trainTest)
            print(predicted)
            # NOTE: an estimator returning 2-D predictions (the original
            # comment names Ridge) needs predicted[:, 0] here and below.
            dataset_blend_train[test_index, ExecutionIndex] = predicted

            # Score the held-out fold once and reuse the value.
            foldScore = score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())
            print(str(foldScore))
            avg += foldScore/NumFolds

            dataset_blend_test_set[:, foldCount] = clf.predict(test)

        dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        # Shared file-name suffix: timestamp, CV score, first 12 chars of the estimator repr.
        suffix = now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv"

        submission = pd.DataFrame(np.zeros((len(testID), 2)), columns=['id', 'target'])
        submission['target'] = dataset_blend_test[:,ExecutionIndex]
        submission['id'] = testID
        submission.to_csv("../submission/Blend_" + suffix, index = False)

        submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)), columns=['id', 'target'])
        submission['target'] = dataset_blend_train[:,ExecutionIndex]
        submission['id'] = trainBaseID
        submission.to_csv("../submission/Target_Blend_" + suffix, index = False)

        csv_io.write_delimited_file("../log/RunLogBlend.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG." , str(avg), str(clf), "Folds:", str(NumFolds), "Model", "Blend", "Stacks: ", stackFiles], filemode="a",delimiter=",")

        print ("------------------------Average: " + str(avg))

    return dataset_blend_train, dataset_blend_test
Ejemplo n.º 8
0
def run_pre_single_feature(SEED):
    """Score individual training columns by fitting a model on one column at a time.

    For each estimator in ``clfs`` and each examined column of
    ``../data/pre_shuffled_train.csv``, runs K-fold cross-validation using that
    single column as the only feature and records the average normalized
    weighted Gini (weights come from column ``var11``).  The per-column
    averages are written to ``../featureanalysis/single_feaure_<clf>.csv``.

    Args:
        SEED: Not used by this function.

    Returns:
        None.
    """
    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../data/pre_shuffled_train.csv')
    # Copy the weights into a plain array so fold indices select by position.
    weightBase = np.array(trainBase['var11'])

    columns = trainBase.columns

    print(trainBase.columns)

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    NumFolds = 5

    print("Begin Training")

    lenTrainBase = len(trainBase)

    gc.collect()

    columnScores = {}

    # Counts examined columns across ALL estimators (it is never reset).
    columnCount = 0

    clfs = [
        SVR(kernel='rbf', degree=3, gamma=0.0, coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, probability=False, cache_size=200, verbose=False, max_iter=-1, random_state=None),
    ]

    for ExecutionIndex, clf in enumerate(clfs):

        print(clf)
        for column in range(0, trainBase.shape[1]):

            avg = 0

            Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)

            for train_index, test_index in Folds:

                # A single feature column, shaped (n_samples, 1) for the estimator.
                train = trainBase[train_index, column].reshape(-1, 1)
                trainTest = trainBase[test_index, column].reshape(-1, 1)

                target = targetBase[train_index]
                targetTest = targetBase[test_index]
                weightTest = weightBase[test_index]

                # Pass a 1-D target: SVR expects y of shape (n_samples,).
                clf.fit(train, target.ravel())
                prob = clf.predict(trainTest)

                # Score the held-out fold once and reuse the value.
                foldScore = score.normalized_weighted_gini(targetTest.ravel(), prob.ravel(), weightTest.ravel())
                print(str(foldScore))
                avg += foldScore/NumFolds

            print (str(columns[column]) + " Average Score: " + str(avg))
            columnScores[str(columns[column])] = avg

            # NOTE(review): stops after three columns in total -- looks like a
            # leftover debugging limit; confirm before relying on the output.
            columnCount = columnCount + 1
            if columnCount > 2:
                break

        submission = pd.Series(columnScores)
        submission.to_csv("../featureanalysis/single_feaure_" + str(clf)[:5] + ".csv")
Ejemplo n.º 9
0
def run_stack(SEED):
    """Search the Lasso ``alpha`` that best blends the stacked predictions.

    Every file in ``../predictions`` whose name starts with ``Stack`` and whose
    third ``_``-separated name part, read as a float, exceeds ``lossThreshold``
    becomes one feature column (train values from ``Target_<file>``, test
    values from ``<file>``).  The features are standardised with a scaler
    fitted on train and test together.  For each of 10 log-spaced alphas a
    Lasso is cross-validated and its average normalized weighted Gini
    (weights from column ``var11``) is printed along with the fold-averaged
    coefficients.  The best average and its alpha are printed at the end.

    Side effects: writes ``temp/dataset_blend_train.txt`` and
    ``temp/dataset_blend_test.txt``.

    Args:
        SEED: Not used by this function.

    Returns:
        None.
    """
    model = "Lasso"
    lossThreshold = 0.3

    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBaseOrig = pd.read_csv('../models/' + model + '_train.csv')
    testOrig = pd.read_csv('../models/' + model + '_test.csv')

    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    # Plain array so that the fold indices select weights by position.
    weightBase = np.array(trainBaseOrig['var11'])

    NumFolds = 5

    bestAvg = 0
    bestAlpha = 0

    # Keep only the stack files whose score field clears the threshold.
    stackFiles = []
    for filename in os.listdir("../predictions"):
        parts = filename.split("_")
        if filename.startswith("Stack") and float(parts[2]) > lossThreshold:
            stackFiles.append(filename)

    trainBase = np.zeros((len(trainBaseOrig), len(stackFiles)))
    test = np.zeros((len(testOrig), len(stackFiles)))

    print("Loading Data")
    for fileNum, stackFile in enumerate(stackFiles):
        print(stackFile)
        # Each file starts with a header line, hence skipFirstLine.
        trn = csv_io.read_data(
            "../predictions/Target_" + stackFile, split=",",
            skipFirstLine=True)
        for row, datum in enumerate(trn):
            # Field 1 is the prediction (field 0 is presumably the id -- confirm).
            trainBase[row, fileNum] = datum[1]

        tst = csv_io.read_data(
            "../predictions/" + stackFile, split=",",
            skipFirstLine=True)
        for row, datum in enumerate(tst):
            test[row, fileNum] = datum[1]

    np.savetxt('temp/dataset_blend_train.txt', trainBase)
    np.savetxt('temp/dataset_blend_test.txt', test)
    print("Num file processed: " + " " + str(len(stackFiles)) + " " +
          "Threshold: " + str(lossThreshold))

    print("Starting Scale")

    # The scaler is fitted on train and test stacked together.
    allVals = np.vstack((trainBase, test))
    scl = StandardScaler(copy=True, with_mean=True, with_std=True)
    scl.fit(allVals)

    trainBase = scl.transform(trainBase)
    test = scl.transform(test)

    print("Starting Blend")

    print("Begin Training")

    lenTrainBase = len(trainBase)

    gc.collect()

    for a in np.logspace(-6, -.5, 10):

        clf = Lasso(alpha=a)
        print(clf)
        avg = 0

        # One column of fitted coefficients per fold.
        coef_dataset = np.zeros((len(stackFiles), NumFolds))

        Folds = cross_validation.KFold(lenTrainBase,
                                       n_folds=NumFolds,
                                       indices=True)

        for foldCount, (train_index, test_index) in enumerate(Folds):

            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            # Integer-array indexing replaces the per-row Python loops.
            train = trainBase[train_index]
            target = targetBase[train_index].reshape(-1, 1)

            trainTest = trainBase[test_index]
            targetTest = targetBase[test_index]
            weightTest = weightBase[test_index]

            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            # Score the held-out fold once and reuse the value.
            foldScore = score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(), weightTest.ravel())
            print(str(foldScore))
            avg += foldScore / NumFolds

            coef_dataset[:, foldCount] = clf.coef_

        coefs = coef_dataset.mean(1)
        print(coefs)
        print("len coefs: " + str(len(coefs)))

        # Number of features with a positive fold-averaged coefficient.
        print(str(sum(1 for coef in coefs if coef > 0.0)))

        print("------------------------Average: " + str(avg))

        if avg > bestAvg:
            bestAvg = avg
            bestAlpha = a

    print("bestAvg: " + str(bestAvg))
    print("bestAlpha: " + str(bestAlpha))
Ejemplo n.º 10
0
def run_stack(SEED):
    """Search the Lasso ``alpha`` that best blends the stacked predictions.

    Every file in ``../predictions`` whose name starts with ``Stack`` and whose
    third ``_``-separated name part, read as a float, exceeds ``lossThreshold``
    becomes one feature column (train values from ``Target_<file>``, test
    values from ``<file>``).  The features are standardised with a scaler
    fitted on train and test together.  For each of 10 log-spaced alphas a
    Lasso is cross-validated and its average normalized weighted Gini
    (weights from column ``var11``) is printed along with the fold-averaged
    coefficients.  The best average and its alpha are printed at the end.

    Side effects: writes ``temp/dataset_blend_train.txt`` and
    ``temp/dataset_blend_test.txt``.

    Args:
        SEED: Not used by this function.

    Returns:
        None.
    """
    model = "Lasso"
    lossThreshold = 0.3

    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBaseOrig = pd.read_csv('../models/' + model + '_train.csv')
    testOrig = pd.read_csv('../models/' + model + '_test.csv')

    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    # Plain array so that the fold indices select weights by position.
    weightBase = np.array(trainBaseOrig['var11'])

    NumFolds = 5

    bestAvg = 0
    bestAlpha = 0

    # Keep only the stack files whose score field clears the threshold.
    stackFiles = []
    for filename in os.listdir("../predictions"):
        parts = filename.split("_")
        if filename.startswith("Stack") and float(parts[2]) > lossThreshold:
            stackFiles.append(filename)

    trainBase = np.zeros((len(trainBaseOrig), len(stackFiles)))
    test = np.zeros((len(testOrig), len(stackFiles)))

    print("Loading Data")
    for fileNum, stackFile in enumerate(stackFiles):
        print(stackFile)
        # Each file starts with a header line, hence skipFirstLine.
        trn = csv_io.read_data("../predictions/Target_" + stackFile, split=",", skipFirstLine=True)
        for row, datum in enumerate(trn):
            # Field 1 is the prediction (field 0 is presumably the id -- confirm).
            trainBase[row, fileNum] = datum[1]

        tst = csv_io.read_data("../predictions/" + stackFile, split=",", skipFirstLine=True)
        for row, datum in enumerate(tst):
            test[row, fileNum] = datum[1]

    np.savetxt('temp/dataset_blend_train.txt', trainBase)
    np.savetxt('temp/dataset_blend_test.txt', test)
    print("Num file processed: " + " " + str(len(stackFiles))  + " " +  "Threshold: " + str(lossThreshold))

    print("Starting Scale")

    # The scaler is fitted on train and test stacked together.
    allVals = np.vstack((trainBase, test))
    scl = StandardScaler(copy=True, with_mean=True, with_std=True)
    scl.fit(allVals)

    trainBase = scl.transform(trainBase)
    test = scl.transform(test)

    print("Starting Blend")

    print("Begin Training")

    lenTrainBase = len(trainBase)

    gc.collect()

    for a in np.logspace(-6, -.5, 10):

        clf = Lasso(alpha=a)
        print(clf)
        avg = 0

        # One column of fitted coefficients per fold.
        coef_dataset = np.zeros((len(stackFiles), NumFolds))

        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)

        for foldCount, (train_index, test_index) in enumerate(Folds):

            print()
            print ("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            # Integer-array indexing replaces the per-row Python loops.
            train = trainBase[train_index]
            target = targetBase[train_index].reshape(-1, 1)

            trainTest = trainBase[test_index]
            targetTest = targetBase[test_index]
            weightTest = weightBase[test_index]

            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            # Score the held-out fold once and reuse the value.
            foldScore = score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())
            print(str(foldScore))
            avg += foldScore/NumFolds

            coef_dataset[:, foldCount] = clf.coef_

        coefs = coef_dataset.mean(1)
        print(coefs)
        print("len coefs: " + str(len(coefs)))

        # Number of features with a positive fold-averaged coefficient.
        print(str(sum(1 for coef in coefs if coef > 0.0)))

        print ("------------------------Average: " + str(avg))

        if avg > bestAvg:
            bestAvg = avg
            bestAlpha = a

    print("bestAvg: " + str(bestAvg))
    print("bestAlpha: " + str(bestAlpha))
Ejemplo n.º 11
0
def run_stack(SEED):
    """Grid-search ``C`` and ``gamma`` for an RBF-kernel SVR.

    Loads the features from ``../models/Lasso_train.csv`` and the target from
    ``../data/pre_shuffled_target.csv``, then for every ``(c, g)`` pair fits
    ``SVR(C=10**c, gamma=10**g)`` and prints the normalized weighted Gini
    (weights from column ``var11``) on the held-out part of the first fold.

    Args:
        SEED: Not used by this function.

    Returns:
        None.
    """
    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../models/Lasso_train.csv')
    # Copy the weights into a plain array so fold indices select by position.
    weightBase = np.array(trainBase['var11'])

    print(trainBase.columns)

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    gc.collect()

    NumFolds = 5

    print("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)

    gc.collect()

    # Exponents (base 10) tried for C and gamma, in this order.
    CC = [6, 5, 7, 4, 8, 3, 9, 2, 10, 1]
    GG = [-6, -7, -5, -8, -4, -9, -3, -10, -2, -1]

    for c in CC:
        for g in GG:

            clf = SVR(kernel='rbf',
                      degree=3,
                      gamma=10**g,
                      coef0=0.0,
                      tol=0.001,
                      C=10**c,
                      epsilon=0.1,
                      shrinking=True,
                      probability=False,
                      cache_size=200,
                      verbose=False,
                      max_iter=-1,
                      random_state=None)
            print(clf)
            print(str(c) + " " + str(g))
            avg = 0

            Folds = cross_validation.KFold(lenTrainBase,
                                           n_folds=NumFolds,
                                           indices=True)

            for foldCount, (train_index, test_index) in enumerate(Folds):

                print()
                print("Iteration: " + str(foldCount))

                now = datetime.datetime.now()
                print(now.strftime("%Y/%m/%d %H:%M:%S"))

                # Integer-array indexing replaces the per-row Python loops.
                train = trainBase[train_index]
                target = targetBase[train_index]

                trainTest = trainBase[test_index]
                targetTest = targetBase[test_index]
                weightTest = weightBase[test_index]

                clf.fit(train, target.ravel())
                predicted = clf.predict(trainTest)

                # Score the held-out fold once and reuse the value.
                foldScore = score.normalized_weighted_gini(
                    targetTest.ravel(), predicted.ravel(), weightTest.ravel())
                print(str(foldScore))
                avg += foldScore / NumFolds

                # No coefficient bookkeeping here: SVR exposes ``coef_`` only
                # for a linear kernel, and reading it with kernel='rbf' raises.

                # NOTE(review): only the first fold is evaluated, so ``avg`` is
                # that fold's score divided by NumFolds -- confirm this is the
                # intended shortcut.
                break

            print("------------------------Average: " + str(avg))
Ejemplo n.º 12
0
def run_stack(SEED):
    """Search the Lasso ``alpha`` with the best cross-validated score.

    Loads the features from ``../models/Lasso_train.csv`` and the target from
    ``../preprocessdata/pre_shuffled_target.csv``.  For each of 30 log-spaced
    alphas a Lasso is cross-validated; the per-fold and average normalized
    weighted Gini (weights from column ``var11``) and the fold-averaged
    coefficients are printed.  The best average and its alpha are printed at
    the end.

    Args:
        SEED: Not used by this function.

    Returns:
        None.
    """
    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../models/Lasso_train.csv')
    # Copy the weights into a plain array so fold indices select by position.
    weightBase = np.array(trainBase['var11'])

    columns = trainBase.columns

    print(trainBase.columns)

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    gc.collect()

    bestAvg = 0
    bestAlpha = 0
    NumFolds = 5

    print("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)

    gc.collect()

    # A previous run found the best alpha to be 0.00040.
    for a in np.logspace(-6, -.5, 30):

        clf = Lasso(alpha=a)
        print(clf)
        avg = 0

        # One column of fitted coefficients per fold.
        coef_dataset = np.zeros((len(columns), NumFolds))

        Folds = cross_validation.KFold(lenTrainBase,
                                       n_folds=NumFolds,
                                       indices=True)

        for foldCount, (train_index, test_index) in enumerate(Folds):

            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            # Integer-array indexing replaces the per-row Python loops.
            train = trainBase[train_index]
            target = targetBase[train_index].reshape(-1, 1)

            trainTest = trainBase[test_index]
            targetTest = targetBase[test_index]
            weightTest = weightBase[test_index]

            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            # Score the held-out fold once and reuse the value.
            foldScore = score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(), weightTest.ravel())
            print(str(foldScore))
            avg += foldScore / NumFolds

            coef_dataset[:, foldCount] = clf.coef_

        coefs = coef_dataset.mean(1)
        print(coefs)
        print("len coefs: " + str(len(coefs)))

        # Number of features with a positive fold-averaged coefficient.
        print(str(sum(1 for coef in coefs if coef > 0.0)))

        print("------------------------Average: " + str(avg))

        if avg > bestAvg:
            bestAvg = avg
            bestAlpha = a

    print("bestAvg: " + str(bestAvg))
    print("bestAlpha: " + str(bestAlpha))
Ejemplo n.º 13
0
def run_stack(SEED):
    """Search the ElasticNet ``alpha`` with the best first-fold score.

    Loads the features from ``../models/Lasso_train.csv`` and the target from
    ``../data/pre_shuffled_target.csv``.  For each of 30 log-spaced alphas an
    ``ElasticNet(l1_ratio=0.1)`` is fitted on the first fold's training split
    and scored on its held-out split with the normalized weighted Gini
    (weights from column ``var11``).  The best score and the alpha that
    produced it are printed at the end.

    Args:
        SEED: Not used by this function.

    Returns:
        None.
    """
    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../models/Lasso_train.csv')
    # Copy the weights into a plain array so fold indices select by position.
    weightBase = np.array(trainBase['var11'])

    columns = trainBase.columns

    print(trainBase.columns)

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    gc.collect()

    bestAvg = 0
    bestAlpha = 0
    NumFolds = 5

    print ("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)

    gc.collect()

    # A previous run found the best alpha to be 0.00069956421567126271.
    for a in np.logspace(-8, -.5, 30):

        # Float literals instead of 1/10 and 1/10000: those evaluate to 0
        # under Python 2 integer division.
        clf = ElasticNet(alpha=a, l1_ratio=0.1, fit_intercept=True, normalize=False, precompute='auto', max_iter=10000, copy_X=True, tol=0.0001, warm_start=False, positive=False)

        print(clf)
        avg = 0

        # One column of fitted coefficients per fold.
        coef_dataset = np.zeros((len(columns), NumFolds))

        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)

        for foldCount, (train_index, test_index) in enumerate(Folds):

            print()
            print ("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            # Integer-array indexing replaces the per-row Python loops.
            train = trainBase[train_index]
            target = targetBase[train_index].reshape(-1, 1)

            trainTest = trainBase[test_index]
            targetTest = targetBase[test_index]
            weightTest = weightBase[test_index]

            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            # Score the held-out fold once and reuse the value.
            foldScore = score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())
            print(str(foldScore))
            avg += foldScore/NumFolds

            coef_dataset[:, foldCount] = clf.coef_

            # NOTE(review): only the first fold is evaluated, so ``avg`` is
            # that fold's score divided by NumFolds and the remaining columns
            # of coef_dataset stay zero -- confirm this shortcut is intended.
            break

        coefs = coef_dataset.mean(1)
        print("len coefs: " + str(len(coefs)))

        # Number of features with a positive fold-averaged coefficient.
        print(str(sum(1 for coef in coefs if coef > 0.0)))

        print ("------------------------Average: " + str(avg))

        if avg > bestAvg:
            bestAvg = avg
            # Record the alpha too; otherwise the final report always shows 0.
            bestAlpha = a

    print("bestAvg: " + str(bestAvg))
    print("bestAlpha: " + str(bestAlpha))
Ejemplo n.º 14
0
def run_stack(SEED):
    """Grid-search an RBF-kernel SVR over C and gamma and print fold scores.

    Reads the targets from ``../data/pre_shuffled_target.csv`` and the
    features from ``../models/Lasso_train.csv`` (column ``var11`` supplies
    the scoring weights).  For every exponent pair ``(c, g)`` it fits
    ``SVR(C=10**c, gamma=10**g)`` on the training part of a K-fold split
    and prints the normalized weighted Gini score on the held-out part.

    NOTE: the fold loop ends with ``break``, so only the first fold of each
    split is evaluated; the printed "Average" is therefore that single
    score divided by ``NumFolds``.  Values remain comparable across grid
    points because every point is scaled the same way.

    Args:
        SEED: Accepted for interface compatibility; the grid search below
            does not read it.
    """
    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../models/Lasso_train.csv')
    trainBaseWeight = trainBase['var11']

    columns = trainBase.columns
    columnsHighScore = trainBase.columns

    print(trainBase.columns)

    # Plain ndarrays (NaN -> 0) so folds can be selected by integer-array
    # indexing instead of per-row Python loops.
    weightValues = np.asarray(trainBaseWeight)
    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    # The DataFrame behind trainBase is unreferenced now; reclaim it early.
    gc.collect()

    avg = 0
    avgLast = avg
    NumFolds = 5

    print ("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)

    # Exponents for C = 10**c and gamma = 10**g, in the order they are tried.
    CC = [6,5,7,4,8,3,9,2,10,1]
    GG = [-6,-7,-5,-8,-4,-9,-3,-10,-2,-1]

    for c in CC:
        for g in GG:

            clf = SVR(kernel='rbf', degree=3, gamma=10**g, coef0=0.0,
                      tol=0.001, C=10**c, epsilon=0.1, shrinking=True,
                      probability=False, cache_size=200, verbose=False,
                      max_iter=-1, random_state=None)
            print(clf)
            print(str(c) + " " + str(g))
            avg = 0

            # One column of per-feature coefficients per fold.  Stays all
            # zero for non-linear kernels, which expose no coefficients.
            coef_dataset = np.zeros((len(columns),NumFolds))

            foldCount = 0

            Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)

            for train_index, test_index in Folds:

                print()
                print ("Iteration: " + str(foldCount))

                now = datetime.datetime.now()
                print(now.strftime("%Y/%m/%d %H:%M:%S"))

                train = trainBase[train_index]
                target = targetBase[train_index].reshape(-1, 1)

                trainTest = trainBase[test_index]
                targetTest = targetBase[test_index].reshape(-1, 1)
                weightTest = weightValues[test_index].reshape(-1, 1)

                # The fit is unweighted; weights are only used for scoring.
                clf.fit(train, target.ravel())
                predicted = clf.predict(trainTest)

                # Score once and reuse it for both the log line and the sum.
                foldScore = score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())
                print(str(foldScore))
                avg += foldScore/NumFolds

                # SVR only provides coef_ for a linear kernel; reading it on
                # an RBF model raises, which used to abort the whole search.
                if clf.kernel == 'linear':
                    coef_dataset[:, foldCount] = clf.coef_

                foldCount = foldCount + 1

                # Deliberately evaluate just the first fold (see docstring).
                break

            coefs = coef_dataset.mean(1)
            print(coefs)
            sorted_coefs = sorted(coefs)
            print("len coefs: " + str(len(sorted_coefs)))

            print ("------------------------Average: " + str(avg))