def best_ratio_combinaison(yProba_s, yValidation_s, weightsValidation_s, ratio_s):
    """
    Exhaustively search the per-group ratios and keep the best-scoring one.

    Every combination made of one ratio per group (Cartesian product of
    ratio_s[0] ... ratio_s[7]) is applied with get_yPredicted_ratio_8,
    scored with submission.get_s_b followed by hbc.AMS, and the combination
    with the highest AMS is returned.

    yProba_s : list of the predicted probabilities, one entry per group.
               If the first entry is 2-D with 5 columns, every entry is
               converted with preTreatment.multiclass2binary first. The
               caller's list is left untouched.
    yValidation_s : labels, forwarded to submission.get_s_b
    weightsValidation_s : weights, forwarded to submission.get_s_b
    ratio_s : List of the list of the ratios to test for each group
              (8 groups). The size of each list should not exceed 4 for
              computational time issues (4 ratios per group already means
              4**8 = 65536 combinations).

    returns : (AMS_max, best_ratio_comb). When no combination scores an
              AMS strictly above 0, this is (0., [0.] * 8).
    """
    # The search is tied to 8 groups because get_yPredicted_ratio_8 is used.
    n_groups = 8
    best_ratio_comb = [0.] * n_groups
    AMS_max = 0.

    # Example of ratio lists that were tried:
    #   ratio_1_s = [0.06, 0.08, 0.10, 0.12]
    #   ratio_2_s = [0.15, 0.16, 0.17, 0.18]
    #   ratio_3_s = [0.36, 0.38, 0.40, 0.42]
    #   ratio_4_s = [0.16, 0.18, 0.2, 0.22]
    #   ratio_5_s = [0.007, 0.008, 0.009, 0.01]
    #   ratio_6_s = [0.003, 0.004, 0.005, 0.006]
    #   ratio_7_s = [0.003, 0.004, 0.005, 0.006]
    #   ratio_8_s = [0.007, 0.008, 0.009, 0.01]

    # if we work with multi-class:
    # build a new list instead of assigning into yProba_s, so the caller's
    # probabilities are not silently replaced by their binary version.
    first_shape = yProba_s[0].shape
    if len(first_shape) == 2 and first_shape[1] == 5:
        yProba_s = [preTreatment.multiclass2binary(subset)
                    for subset in yProba_s]

    # Indexing ratio_s[0..7] explicitly keeps the original contract:
    # extra entries are ignored and a too-short ratio_s raises IndexError.
    g_combinaisons = itertools.product(
        *[ratio_s[group] for group in range(n_groups)])

    for combinaison in g_combinaisons:
        ratios = list(combinaison)

        yPredicted_s, _ = get_yPredicted_ratio_8(yProba_s, ratios)

        finals, finalb, _, _ = submission.get_s_b(yPredicted_s,
                                                  yValidation_s,
                                                  weightsValidation_s)

        AMS = hbc.AMS(finals, finalb)
        if AMS > AMS_max:
            AMS_max = AMS
            best_ratio_comb = ratios

    return AMS_max, best_ratio_comb
# Fragment: report the ratio selected on train2, then score it on the
# second validation set (valid_RM_s_2). best_ratio, best_ams_train2,
# predictor_s and valid_RM_s_2 are defined by code outside this excerpt.
# NOTE(review): best_ratio is joined as a sequence here but formatted
# with %f in the last print of this fragment - confirm its actual type.
print "Train2 - best ratio : %s - best ams : %f" \
        %(', '.join(map(str,best_ratio)), best_ams_train2)
print(" ")

print "Making predictions on the validation set..."
# Prediction of the validation set 2:
predProba_Valid2_s = xgBoost.predict_proba(predictor_s, valid_RM_s_2[1])
# Thresholding the predictions:
predProba_Valid2 = preTreatment.concatenate_vectors(predProba_Valid2_s)
predLabel5_Valid2 = tresholding.get_yPredicted_ratio(predProba_Valid2, best_ratio)
# Binarize the prediction:
predLabel_Valid2 = preTreatment.multiclass2binary(predLabel5_Valid2)
# Concatenate data:
yValid2 = preTreatment.concatenate_vectors(valid_RM_s_2[2])
weightsValidation = preTreatment.concatenate_vectors(valid_RM_s_2[3])
# Estimation the AMS:
s, b = submission.get_s_b(predLabel_Valid2, yValid2, weightsValidation)
# Rescale s and b by 250000 / (number of predicted events).
# NOTE(review): under Python 2 this is an integer division, so the factor
# is truncated unless the number of events divides 250000 - confirm intended.
s *= 250000/predLabel_Valid2.shape[0]
b *= 250000/predLabel_Valid2.shape[0]
ams = hbc.AMS(s,b)
print "Valid_RM_2 - ratio : %f - best ams : %f" %(best_ratio, ams)
print(" ")
# Saving the model if it's better:
def train(max_depth, n_rounds):
    """
    Train the xgboost multi-class predictors and report the AMS on valid.

    Steps:
      1. extract the train / train2 / valid / test sets
         (tokenizer.extract_data)
      2. train the predictors on the train set (xgBoost.train_classifier)
      3. predict the class probabilities of train2 and valid, subset by
         subset (8 subsets, predictor_s[i] is applied to subset i)
      4. turn them into a binary score: 1 - (column 0 of the probabilities)
      5. look for the best global ratio on train2 (tresholding.best_ratio)
      6. print the AMS on valid for the fixed 0.15 ratio and for the best
         global ratio

    max_depth : value used for the 'bst:max_depth' xgboost parameter
    n_rounds : value used for the 'n_rounds' training argument

    returns : the AMS computed on the valid set with the fixed 0.15 ratio
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split = True
    normalize = True
    noise_var = 0.
    train_size = 200000
    train_size2 = 25000
    valid_size = 25000
    remove_999 = False

    # Number of subsets the data is split into (one predictor per subset).
    n_groups = 8

    # Import the training data:
    print("Extracting the data sets...")
    train_s, train2_s, valid_s, test_s = tokenizer.extract_data(
        split=split,
        normalize=normalize,
        remove_999=remove_999,
        noise_variance=noise_var,
        n_classes="multiclass",
        train_size=train_size,
        train_size2=train_size2,
        valid_size=valid_size)

    print("Training on the train set ...")

    # XGBOOST
    kwargs_xgb = {
        'bst_parameters': {
            'booster_type': 0,
            'objective': 'multi:softprob',
            'num_class': 5,
            'bst:eta': 0.1,  # the bigger the more conservative
            'bst:subsample': 1,  # prevent over fitting if <1
            'bst:max_depth': max_depth,
            'eval_metric': 'auc',
            'silent': 1,
            'nthread': 8,
        },
        'n_rounds': n_rounds,
    }

    # NOTE(review): the meaning of the positional 550000 is defined by
    # xgBoost.train_classifier - confirm against that function.
    predictor_s = xgBoost.train_classifier(train_s[1], train_s[2], train_s[3],
                                           550000, kwargs_xgb)

    # TEST / SUBMISSION (disabled): the same pipeline was applied to test_s,
    # i.e. predict, take 1 - column 0, rank with submission.rank_signals,
    # threshold at 0.15 and write with submission.print_submission.

    # TRAIN AND VALID
    print("Classifying the train2 set...")
    yProbaTrain2_s = [xgBoost.predict_proba(predictor_s[i], train2_s[1][i])
                      for i in range(n_groups)]

    print("Classifying the valid set...")
    yProbaValid_s = [xgBoost.predict_proba(predictor_s[i], valid_s[1][i])
                     for i in range(n_groups)]

    print("Making the binary proba vector...")
    # Binary score = 1 - column 0 (presumably the background class - TODO
    # confirm). Done as one array operation per subset; the float64 cast
    # matches the np.zeros buffers that used to be filled element by element.
    yProbaTrain2Binary_s = [np.asarray(1 - proba[:, 0], dtype=np.float64)
                            for proba in yProbaTrain2_s]
    yProbaValidBinary_s = [np.asarray(1 - proba[:, 0], dtype=np.float64)
                           for proba in yProbaValid_s]

    print("Concatenating the vectors...")
    yProbaTrain2Binary = preTreatment.concatenate_vectors(yProbaTrain2Binary_s)
    yProbaValidBinary = preTreatment.concatenate_vectors(yProbaValidBinary_s)
    yTrain2 = preTreatment.concatenate_vectors(train2_s[2])
    yValid = preTreatment.concatenate_vectors(valid_s[2])
    weightsTrain2 = preTreatment.concatenate_vectors(train2_s[3])
    weightsValid = preTreatment.concatenate_vectors(valid_s[3])

    print("Putting all the real labels to 1")
    yTrain2 = preTreatment.multiclass2binary(yTrain2)
    yValid = preTreatment.multiclass2binary(yValid)

    print("Getting the best ratios...")
    # The ratio is chosen on train2 and only evaluated on valid.
    # (A per-subset variant based on best_ratio_combinaison_global existed
    # here but is disabled.)
    best_ams_train2_global, best_ratio_global = tresholding.best_ratio(
        yProbaTrain2Binary, yTrain2, weightsTrain2)

    yPredictedValid = tresholding.get_yPredicted_ratio(yProbaValidBinary, 0.15)
    yPredictedValid_best_ratio_global = tresholding.get_yPredicted_ratio(
        yProbaValidBinary, best_ratio_global)

    # Let's compute the predicted AMS
    s, b = submission.get_s_b(yPredictedValid, yValid, weightsValid)
    AMS = hbc.AMS(s, b)

    s_best_ratio_global, b_best_ratio_global = submission.get_s_b(
        yPredictedValid_best_ratio_global, yValid, weightsValid)
    AMS_best_ratio_global = hbc.AMS(s_best_ratio_global, b_best_ratio_global)

    print("AMS 0.15 = %f" % AMS)
    print(" ")
    print(" ")
    print("best AMS valid ratio global= %f" % AMS_best_ratio_global)
    print("best AMS train2 ratio global= %f" % best_ams_train2_global)
    print("best ratio global train2 = %f" % best_ratio_global)

    return AMS