Example #1
def best_ratio_combinaison(yProba_s, yValidation_s, weightsValidation_s, ratio_s):
    """
    returns the best ratio combinaison with the ratios specified in ratio_s for each
    group
    ratio_s : List of the list of the ratios to test for each group
    the size of each list should not exceed 4 for computationnal time issues
    """
    best_ratio_comb = [0.,0.,0.,0.,0.,0.,0.,0.]
    AMS_max = 0.
    """
    ratio_1_s = [0.06, 0.08,0.10,0.12]
    ratio_2_s = [0.15,0.16,0.17,0.18]
    ratio_3_s = [0.36,0.38,0.40,0.42]
    ratio_4_s = [0.16,0.18,0.2,0.22]
    ratio_5_s = [0.007,0.008,0.009,0.01]
    ratio_6_s = [0.003,0.004,0.005,0.006]
    ratio_7_s = [0.003,0.004,0.005,0.006]
    ratio_8_s = [0.007,0.008,0.009,0.01]
    """
    g_combinaisons = itertools.product(ratio_s[0], ratio_s[1],
                                       ratio_s[2], ratio_s[3],
                                       ratio_s[4], ratio_s[5],
                                       ratio_s[6], ratio_s[7])
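    # NOTE: with 4 candidate ratios per group, this grid already contains
    # 4**8 = 65536 combinations, hence the size limit mentioned in the
    # docstring.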

    # If we work with multi-class probabilities (5 classes), convert them to binary:
    if len(yProba_s[0].shape) == 2 and yProba_s[0].shape[1] == 5:
        for i, subset in enumerate(yProba_s):
            yProba_s[i] = preTreatment.multiclass2binary(subset)

    compteur = 0

    for combinaison in g_combinaisons:
        # if compteur % 10000 == 0:
        #     print "number of iterations : %i" % compteur
        compteur += 1

        L = list(combinaison)

        yPredicted_s, yPredicted_conca = get_yPredicted_ratio_8(yProba_s, L)

        finals, finalb, s_s, b_s = submission.get_s_b(yPredicted_s,
                                                      yValidation_s,
                                                      weightsValidation_s)

        AMS = hbc.AMS(finals, finalb)
        if AMS > AMS_max:
            AMS_max = AMS
            best_ratio_comb = L

    return AMS_max, best_ratio_comb
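
A minimal usage sketch for this function (the names yProbaValid_s, yValid_s and
weightsValid_s are assumed to be the per-group probability, label and weight
lists already built by the caller; the candidate grids simply reuse the values
shown in the docstring):

# Hypothetical per-group candidate grids, 4 values each to keep the
# 4**8 search tractable (taken from the commented-out lists above):
ratio_s = [[0.06, 0.08, 0.10, 0.12],
           [0.15, 0.16, 0.17, 0.18],
           [0.36, 0.38, 0.40, 0.42],
           [0.16, 0.18, 0.20, 0.22],
           [0.007, 0.008, 0.009, 0.01],
           [0.003, 0.004, 0.005, 0.006],
           [0.003, 0.004, 0.005, 0.006],
           [0.007, 0.008, 0.009, 0.01]]

# yProbaValid_s, yValid_s, weightsValid_s: assumed per-group lists of
# predicted probabilities, true labels and event weights.
best_ams, best_ratios = best_ratio_combinaison(yProbaValid_s, yValid_s,
                                               weightsValid_s, ratio_s)
print "best AMS = %f - ratios : %s" % (best_ams, ', '.join(map(str, best_ratios)))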
Example #2
        print "Train2 - best ratio : %s - best ams : %f" \
                %(', '.join(map(str,best_ratio)), best_ams_train2)
        print(" ")


        print "Making predictions on the validation set..."
        # Prediction of the validation set 2:
        predProba_Valid2_s = xgBoost.predict_proba(predictor_s, valid_RM_s_2[1])

        # Thresholding the predictions:
        predProba_Valid2 = preTreatment.concatenate_vectors(predProba_Valid2_s)
        predLabel5_Valid2 = tresholding.get_yPredicted_ratio(predProba_Valid2,
                                                             best_ratio)

        # Binarize the prediction:
        predLabel_Valid2 = preTreatment.multiclass2binary(predLabel5_Valid2)

        # Concatenate data:
        yValid2 = preTreatment.concatenate_vectors(valid_RM_s_2[2])
        weightsValidation = preTreatment.concatenate_vectors(valid_RM_s_2[3])

        # Estimate the AMS:
        s, b = submission.get_s_b(predLabel_Valid2, yValid2, weightsValidation)
        # Rescale s and b to the full 250,000-event set (float division to
        # avoid Python 2 integer truncation):
        s *= 250000. / predLabel_Valid2.shape[0]
        b *= 250000. / predLabel_Valid2.shape[0]
        ams = hbc.AMS(s, b)

        print "Valid_RM_2 - ratio : %f - best ams : %f" %(best_ratio, ams)
        print(" ")

        # Saving the model if it's better:
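
For reference, the implementation of hbc.AMS is not shown in these snippets; the
quantity it computes is presumably the approximate median significance from the
HiggsML challenge, which (with the usual regularization term b_reg = 10) can be
written as a small standalone function:

import math

def ams_sketch(s, b, b_reg=10.):
    # Approximate median significance (AMS) as defined in the HiggsML
    # challenge; s and b are the weighted signal and background sums
    # returned by submission.get_s_b above.
    return math.sqrt(2. * ((s + b + b_reg) * math.log(1. + s / (b + b_reg)) - s))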
Example #3
def train(max_depth, n_rounds):

    ###############
    ### IMPORT ####
    ###############
    # Import parameters:
    split = True
    normalize = True
    noise_var = 0.
    train_size = 200000
    train_size2 = 25000
    valid_size = 25000
    remove_999 = False

    # Import the training data:
    print("Extracting the data sets...")
    start = time.clock()
    train_s, train2_s, valid_s, test_s = tokenizer.extract_data(
                                             split=split,
                                             normalize=normalize,
                                             remove_999=remove_999,
                                             noise_variance=noise_var,
                                             n_classes="multiclass",
                                             train_size=train_size,
                                             train_size2=train_size2,
                                             valid_size=valid_size)
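    # As used below, each of these tuples appears to hold, per group:
    # [0] event IDs, [1] features, [2] labels, [3] event weights.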

    
    # RANDOM FOREST (disabled):
    # kwargs_grad = {}
    # kwargs_rdf = {'n_estimators': 100}
    # predictor_s = randomForest.train_classifier(train_s[1], train_s[2], kwargs_rdf)

    print "Training on the train set..."

    # XGBOOST
    kwargs_xgb = {'bst_parameters':
                      {'booster_type': 0,
                       # 'objective': 'binary:logitraw',
                       'objective': 'multi:softprob', 'num_class': 5,
                       'bst:eta': 0.1,      # learning rate: the smaller, the more conservative
                       'bst:subsample': 1,  # values < 1 help prevent overfitting
                       'bst:max_depth': max_depth, 'eval_metric': 'auc', 'silent': 1,
                       'nthread': 8},
                  'n_rounds': n_rounds}

    predictor_s = xgBoost.train_classifier(train_s[1], train_s[2], train_s[3], 550000, kwargs_xgb)
    
    #TEST / SUBMISSION
    """
    yProbaTest_s = []
    yProbaTestBinary_s = []

    print "Classifying the test set..."
    for i in range(8):
        yProbaTest = xgBoost.predict_proba(predictor_s[i], test_s[1][i])
        yProbaTest_s.append(yProbaTest)
    print "Making the binary proba vector..."
    for i in range(8):
        yProbaTestBinary_s.append(np.zeros(yProbaTest_s[i].shape[0]))
    for i in range(8):
        for j in range(yProbaTest_s[i].shape[0]):
            yProbaTestBinary_s[i][j] = 1 - yProbaTest_s[i][j][0]

    print "Concatenating the vectors..."
    yProbaTestBinary = preTreatment.concatenate_vectors(yProbaTestBinary_s)
    IDs = preTreatment.concatenate_vectors(test_s[0])


    yProbaTestBinaryRanked = submission.rank_signals(yProbaTestBinary)
    
    yPredictedTest = tresholding.get_yPredicted_ratio(yProbaTestBinary, 0.15)

    s = submission.print_submission(IDs, yProbaTestBinaryRanked, yPredictedTest, "newAMSmesure") 

    
    """
    # TRAIN AND VALID
    
    yPredictedTrain2_s = []
    yProbaTrain2_s = []
    yProbaTrain2Binary_s = []
    yPredictedValid_s = []
    yProbaValid_s = []
    yProbaValidBinary_s = []

    print "Classifying the train2 set..."
    for i in range(8):
        yProbaTrain2 = xgBoost.predict_proba(predictor_s[i], train2_s[1][i])
        yProbaTrain2_s.append(yProbaTrain2)
    print "Classifying the valid set..."
    for i in range(8):
        yProbaValid = xgBoost.predict_proba(predictor_s[i], valid_s[1][i])
        yProbaValid_s.append(yProbaValid)

    print "Making the binary proba vector..."
    for i in range(8):
        yProbaTrain2Binary_s.append(np.zeros(yProbaTrain2_s[i].shape[0]))
        yProbaValidBinary_s.append(np.zeros(yProbaValid_s[i].shape[0]))
    for i in range(8):
        for j in range(yProbaTrain2_s[i].shape[0]):
            yProbaTrain2Binary_s[i][j] = 1 - yProbaTrain2_s[i][j][0]
        for j in range(yProbaValid_s[i].shape[0]):
            yProbaValidBinary_s[i][j] = 1 - yProbaValid_s[i][j][0]

    print "Concatenating the vectors..."
    yProbaTrain2Binary = preTreatment.concatenate_vectors(yProbaTrain2Binary_s)
    yProbaValidBinary = preTreatment.concatenate_vectors(yProbaValidBinary_s)
    yTrain2 = preTreatment.concatenate_vectors(train2_s[2])
    yValid = preTreatment.concatenate_vectors(valid_s[2])
    weightsTrain2 = preTreatment.concatenate_vectors(train2_s[3])
    weightsValid = preTreatment.concatenate_vectors(valid_s[3])

    print "Putting all the real labels to 1"
    yTrain2 = preTreatment.multiclass2binary(yTrain2)
    yValid = preTreatment.multiclass2binary(yValid)

    print "Getting the best ratios..."
    best_ams_train2_global, best_ratio_global = tresholding.best_ratio(yProbaTrain2Binary, yTrain2, weightsTrain2)
    #best_ams_train2_combinaison, best_ratio_combinaison = tresholding.best_ratio_combinaison_global(yProbaTrain2Binary_s, train2_s[2], train2_s[3], 1)

    yPredictedValid = tresholding.get_yPredicted_ratio(yProbaValidBinary, 0.15)
    yPredictedValid_best_ratio_global = tresholding.get_yPredicted_ratio(yProbaValidBinary, best_ratio_global)
    #yPredictedValid_best_ratio_combinaison_s, yPredictedValid_best_ratio_combinaison = tresholding.get_yPredicted_ratio_8(yProbaTrain2Binary_s, best_ratio_combinaison)

    #Let's compute the predicted AMS
    s, b = submission.get_s_b(yPredictedValid, yValid, weightsValid)
    AMS = hbc.AMS(s,b)
    #s_best_ratio_combinaison, b_best_ratio_combinaison = submission.get_s_b(yPredictedValid_best_ratio_combinaison, yValid, weightsValid)
    #AMS_best_ratio_combinaison = hbc.AMS(s_best_ratio_combinaison, b_best_ratio_combinaison)
    s_best_ratio_global, b_best_ratio_global = submission.get_s_b(yPredictedValid_best_ratio_global, yValid, weightsValid)
    AMS_best_ratio_global = hbc.AMS(s_best_ratio_global, b_best_ratio_global)

    print "AMS 0.15 = %f" %AMS
    print " "
    #print "AMS best ratio combi= %f" %AMS_best_ratio_combinaison
    #print "best AMS train2 ratio combinaison= %f" %best_ams_train2_combinaison
    #print "best ratio combinaison train 2 = %s" %str(best_ratio_combinaison)
    print " "
    print "best AMS valid ratio global= %f" %AMS_best_ratio_global
    print "best AMS train2 ratio global= %f" %best_ams_train2_global
    print "best ratio global train2 = %f" %best_ratio_global
 

    return AMS
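
The ratio-based thresholding used throughout (tresholding.get_yPredicted_ratio)
is not defined in these examples. Judging from how it is called, a minimal
sketch, assuming it simply labels the top `ratio` fraction of events (ranked by
predicted signal probability) as signal, might look like:

import numpy as np

def get_yPredicted_ratio_sketch(yProba, ratio):
    # Flag the `ratio` fraction of events with the highest predicted signal
    # probability as signal (1) and everything else as background (0).
    n_signal = int(ratio * yProba.shape[0])
    order = np.argsort(yProba)[::-1]   # indices from highest to lowest proba
    yPredicted = np.zeros(yProba.shape[0], dtype=int)
    yPredicted[order[:n_signal]] = 1
    return yPredicted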