コード例 #1
0
ファイル: test_xgboost.py プロジェクト: jbRegli/Higgs
# Build the test submission from the predicted probabilities.
#
# Names read from the surrounding script: test_RM_s, predProba_Test,
# predLabel_Test, best_ams, plus the preTreatment / submission modules.

# Concatenate data:
IDTest = preTreatment.concatenate_vectors(test_RM_s[0])
IDTest = IDTest.astype(np.int64)

# Rank the prediction: reduce it to one ranking score per event.
if len(predProba_Test.shape) == 2:
    if predProba_Test.shape[1] != 5:
        # Continuing with a zero-filled score vector would silently write a
        # meaningless submission, so fail loudly instead of only printing.
        raise ValueError(
            "Expected 5 probability columns per event, got an array of "
            "shape %s" % (predProba_Test.shape,))
    # Score = highest probability among columns 1, 2 and 3.
    # NOTE(review): the slice 1:4 leaves column 4 out - confirm that this is
    # intended for the 5-class model.
    # The cast keeps the float64 dtype the score vector always had.
    predProbaRank_Test = np.max(predProba_Test[:, 1:4],
                                axis=1).astype(np.float64)
else:
    # Not a 2-D array: the predictions are used directly as ranking scores.
    predProbaRank_Test = predProba_Test

yTestProbaRanked = submission.rank_signals(predProbaRank_Test)


# Create the submission file (the name embeds the first 6 characters of the
# string form of best_ams):
submission_name = "submssion_xgb_5c_SPT_r120_ams" + str(best_ams)[0:6]

print("Generating a submsission file named %s" % submission_name)

sub = submission.print_submission(IDTest, yTestProbaRanked, predLabel_Test,
                                  submission_name)

# Finish...
print("finish!!!")



コード例 #2
0
ファイル: main.py プロジェクト: jbRegli/Higgs
def main():
    """Run several base methods, merge them and write a submission file.

    Steps, in order:
      1. extract the train / validation / test sets with ``tokenizer``;
      2. print the signal ratio of the train and validation subsets;
      3. run every base method through ``analyse.analyse`` and store each
         result in ``dMethods`` (a dict of dicts keyed by method name);
      4. build the merged predictor with
         ``onTopClassifier.SL_classification``;
      5. print the expected AMS of every base method on the validation set;
      6. predict the test set with the merged predictor and write the
         submission through ``submission.print_submission``.

    Returns:
        The value returned by ``submission.print_submission``.
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split = True
    normalize = True
    noise_var = 0.

    # Import the training data:
    print("Extracting the data sets...")
    # time.time() is used because time.clock() no longer exists on
    # Python >= 3.8; the value is only used for the log line below.
    start = time.time()
    train_s, valid_s, test_s = tokenizer.extract_data(
        split=split,
        normalize=normalize,
        noise_variance=noise_var,
        #n_classes = "multiclass",
        n_classes="binary",
        train_size=200000,
        train_size2=0,
        valid_size=50000)
    stop = time.time()
    print("Extraction time: %i s" % (stop - start))

    print(train_s[4])

    print(" ")
    print(" ")

    ######################
    ### PRE-TREATMENT ####
    ######################
    print("------------------------- Pre-treatment --------------------------")
    ### Average number of signal per subset:
    print("Train subsets signal average:")
    preTreatment.ratio_sig_per_dataset(train_s[2])
    print(" ")
    print("Valid subsets signal average:")
    preTreatment.ratio_sig_per_dataset(valid_s[2])

    print(" ")
    print(" ")

    ############
    # ANALYSES #
    ############

    # (key in dMethods, method name given to analyse.analyse, kwargs).
    # The runs happen in this order.  'svm' is deliberately left out (it was
    # disabled); to enable it add ('svm', 'svm', {}).
    base_methods = [
        ('naiveBayes', 'naiveBayes', {}),
        ('kNeighbors', 'kNeighbors', {'n_neighbors': 50}),
        ('lda', 'lda', {}),
        ('qda', 'qda', {}),
        ('adaBoost', 'adaBoost', {'base_estimators': None,
                                  'n_estimators': 50,
                                  'learning_rate': 1.,
                                  'algorithm': 'SAMME.R',
                                  'random_state': None}),
        ('randomForest', 'randomForest', {'n_trees': 10}),
        ('randomForest2', 'randomForest', {'n_trees': 100}),
        ('adaBoost2', 'adaBoost', {'base_estimators': None,
                                   'n_estimators': 100,
                                   'learning_rate': .5,
                                   'algorithm': 'SAMME.R',
                                   'random_state': None}),
    ]

    # Dictionary that will contain all the data for each method. In the end
    # we have a dict of dicts.
    dMethods = {}
    for key, method_name, kwargs in base_methods:
        dMethods[key] = analyse.analyse(train_s, valid_s, method_name, kwargs)

    print(" ")

    ##################
    # POST-TREATMENT #
    ##################
    print("------------------------ Merged predictor -----------------------")

    # Keys of dMethods to leave out of the merged predictor,
    # e.g. ['randomForest2', 'randomForest'].
    ignore = []

    # Only dSl is used further down (for the test prediction).
    final_prediction_s, dSl = onTopClassifier.SL_classification(
        dMethods, valid_s, train_s, method='svm', ignore=ignore)

    for method in dMethods:
        results = dMethods[method]
        yProba_s = results['yProba_s']
        yPredicted_s = results['yPredicted_s']

        # NOTE(review): the thresholded labels are not used below - every
        # score is computed from yPredicted_s.  The call is kept because it
        # is not visible here whether it modifies its arguments; confirm.
        postTreatment.proba_treshold(yPredicted_s, yProba_s, 0.5)

        # Numerical score:
        if isinstance(yPredicted_s, list):
            for i, predicted in enumerate(yPredicted_s):
                sum_s, sum_b = submission.get_numerical_score(predicted,
                                                              valid_s[2][i])
                print("Subset %i: %i elements - sum_s[%i] = %i - sum_b[%i] = %i"
                      % (i, predicted.shape[0], i, sum_s, i, sum_b))

        # Get s and b for each group (s_s, b_s) and the final final_s and
        # final_b:
        final_s, final_b, s_s, b_s = submission.get_s_b_8(yPredicted_s,
                                                          valid_s[2],
                                                          valid_s[3])

        # Balance the s and b.
        # NOTE(review): the factor assumes 25000 validation events while
        # valid_size=50000 is requested above - confirm which one is right.
        balance = 250000 // 25000
        final_s = final_s * balance
        final_b = final_b * balance
        # AMS final:
        AMS = hbc.AMS(final_s, final_b)
        print("Expected AMS score for %s : %f" % (method, AMS))
        # AMS by group
        AMS_s = []
        for i, (s, b) in enumerate(zip(s_s, b_s)):
            # Float numerator: with two ints Python 2 would truncate the
            # ratio and bias the per-group balance.
            group_balance = 250000. / yPredicted_s[i].shape[0]
            score = hbc.AMS(s * group_balance, b * group_balance)
            AMS_s.append(score)
            print("Expected AMS score for %s :  for group %i is : %f"
                  % (method, i, score))
        print(" ")

    ##############
    # SUBMISSION #
    ##############
    print("-------------------------- Submission ---------------------------")

    # Prediction on the test set with the merged predictor.
    # TODO: when a single method is used for the submission instead, check
    # that its name has the right form (create a list of method names).
    test_prediction_s, test_proba_s = onTopClassifier.get_SL_test_prediction(
        dMethods, dSl, test_s[1])

    print("Test subsets signal average:")
    preTreatment.ratio_sig_per_dataset(test_prediction_s)
    print(" ")

    if isinstance(test_prediction_s, list):
        test_prediction_s = np.concatenate(test_prediction_s)
        test_proba_s = np.concatenate(test_proba_s)
        ID = np.concatenate(test_s[0])
    else:
        ID = test_s[0]

    # Computed for both branches: it used to be assigned only for list
    # predictions, leaving RankOrder undefined otherwise.
    # Assumes test_proba_s is already a single array when the predictions
    # are not a list - TODO confirm.
    RankOrder = onTopClassifier.rank_signals(test_proba_s)

    # Create a submission file:
    sub = submission.print_submission(ID, RankOrder, test_prediction_s)

    return sub
コード例 #3
0
ファイル: maintest.py プロジェクト: jbRegli/Higgs
def main():
    """Run the random-forest pipeline and write a submission file.

    Steps, in order:
      1. extract the train / validation / test sets with ``tokenizer``;
      2. print the signal ratio of the train and validation subsets;
      3. run the 'randomForest' method through ``analyse.analyse``;
      4. re-threshold the validation labels of every subset on the signal
         probability and print the expected AMS (overall and per group);
      5. predict the test set and write the submission through
         ``submission.print_submission``.

    Returns:
        The value returned by ``submission.print_submission``.
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split = True
    normalize = True
    noise_var = 0.
    ratio_train = 0.9

    # Import the training data:
    print("Extracting the data sets...")
    # time.time() is used because time.clock() no longer exists on
    # Python >= 3.8; the value is only used for the log line below.
    start = time.time()
    train_s, valid_s, test_s = tokenizer.extract_data(
        split=split,
        normalize=normalize,
        noise_variance=noise_var,
        ratio_train=ratio_train)
    stop = time.time()
    print("Extraction time: %i s" % (stop - start))

    print(" ")
    print(" ")

    ######################
    ### PRE-TREATMENT ####
    ######################
    print("------------------------- Pre-treatment --------------------------")
    ### Average number of signal per subset:
    print("Train subsets signal average:")
    preTreatment.ratio_sig_per_dataset(train_s[2])
    print(" ")
    print("Valid subsets signal average:")
    preTreatment.ratio_sig_per_dataset(valid_s[2])

    print(" ")
    print(" ")

    ############
    # ANALYSES #
    ############

    # Dictionary that will contain all the data for each method (a dict of
    # dicts).  Only the random forest is run here.
    dMethods = {}
    # RANDOM FOREST:
    kwargs_rdf = {'n_trees': 50}
    dMethods['randomForest'] = analyse.analyse(train_s, valid_s,
                                               'randomForest', kwargs_rdf)

    print(" ")

    ##################
    # POST-TREATMENT #
    ##################
    print("post treatment")
    yProba_s = dMethods['randomForest']['yProba_s']
    yPredicted_s = dMethods['randomForest']['yPredicted_s']

    # Position of the cut inside the predicted signals once they are ranked
    # by decreasing signal probability.
    cut_fraction = 0.45

    # Every subset is processed (the count used to be hard-coded to 8).
    # Assumes each label vector is a 1-D numpy array and each probability
    # table is (n_events, n_classes) with the signal in column 1 - TODO
    # confirm.
    for predicted, proba in zip(yPredicted_s, yProba_s):
        proba_signal = np.asarray(proba)[:, 1]

        ranked = sorted(proba_signal[predicted == 1], reverse=True)
        if not ranked:
            # Nothing predicted as signal: there is no probability to cut
            # on (indexing the empty ranking used to raise IndexError).
            continue
        prob_limit = ranked[int(len(ranked) * cut_fraction)]

        # Relabel in place: below the cut -> 0, everything else -> 1.
        predicted[:] = np.where(proba_signal < prob_limit, 0, 1)

    # Numerical score:
    if isinstance(yPredicted_s, list):
        for i, predicted in enumerate(yPredicted_s):
            sum_s, sum_b = submission.get_numerical_score(predicted,
                                                          valid_s[2][i])
            print("Subset %i: %i elements - sum_s[%i] = %i - sum_b[%i] = %i"
                  % (i, predicted.shape[0], i, sum_s, i, sum_b))

    # Get s and b for each group (s_s, b_s) and the final final_s and
    # final_b:
    final_s, final_b, s_s, b_s = submission.get_s_b_8(yPredicted_s,
                                                      valid_s[2],
                                                      valid_s[3])

    # Balance the s and b.
    # NOTE(review): 25000 is hard-coded; presumably the validation size
    # implied by ratio_train - confirm.
    balance = 250000 // 25000
    final_s = final_s * balance
    final_b = final_b * balance
    # AMS final:
    AMS = hbc.AMS(final_s, final_b)
    print("Expected AMS score for randomforest : %f" % AMS)
    # AMS by group
    AMS_s = []
    for i, (s, b) in enumerate(zip(s_s, b_s)):
        # Float numerator: with two ints Python 2 would truncate the ratio
        # and bias the per-group balance.
        group_balance = 250000. / yPredicted_s[i].shape[0]
        score = hbc.AMS(s * group_balance, b * group_balance)
        AMS_s.append(score)
        print("Expected AMS score for randomforest :  for group %i is : %f"
              % (i, score))
    print(" ")

    ##############
    # SUBMISSION #
    ##############
    print("-------------------------- Submission ---------------------------")

    # Prediction on the test set:
    # TODO: check that the name of the method used for the submission has
    # the right form (create a list of method names).

    # FIXME(review): dSl is never assigned in this function, so unless it
    # exists as a module-level global the call below raises NameError.  The
    # single-method alternative that was left commented out is kept here as
    # the likely replacement:
    #method = "randomForest"
    #test_prediction_s, test_proba_s = eval(method).get_test_prediction(
    #                                            dMethods[method]['predictor_s'],
    #                                            test_s[1])
    test_prediction_s, test_proba_s = postTreatment.get_SL_test_prediction(
        dMethods, dSl, test_s[1])

    print("Test subsets signal average:")
    preTreatment.ratio_sig_per_dataset(test_prediction_s)
    print(" ")

    if isinstance(test_prediction_s, list):
        test_prediction_s = np.concatenate(test_prediction_s)
        test_proba_s = np.concatenate(test_proba_s)
        ID = np.concatenate(test_s[0])
    else:
        ID = test_s[0]

    # Computed for both branches: it used to be assigned only for list
    # predictions, leaving RankOrder undefined otherwise.
    # Assumes test_proba_s is already a single array when the predictions
    # are not a list - TODO confirm.
    RankOrder = postTreatment.rank_signals(test_proba_s)

    # Create a submission file:
    sub = submission.print_submission(ID, RankOrder, test_prediction_s)

    return sub