def train(max_depth, n_rounds):
    """Train a multiclass xgBoost predictor and return its validation AMS.

    The data sets are extracted with ``tokenizer.extract_data``, a predictor
    is trained on the train set with ``xgBoost.train_classifier``, and the
    train2 and valid sets are classified subset by subset.  For every event
    the "binary" probability is ``1 - proba[:, 0]`` (presumably class 0 is
    the background class -- confirm against ``tokenizer``).  The best global
    ratio is searched on train2 and applied to the validation set; the AMS
    for a fixed ratio of 0.15 and for that best ratio are printed.

    Parameters
    ----------
    max_depth
        Forwarded to xgBoost as the ``bst:max_depth`` booster parameter.
    n_rounds
        Forwarded to xgBoost as ``n_rounds``.

    Returns
    -------
    The validation AMS obtained with the fixed ratio 0.15.  The AMS of the
    best global ratio is printed but not returned.
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split = True
    normalize = True
    noise_var = 0.
    train_size = 200000
    train_size2 = 25000
    valid_size = 25000
    remove_999 = False

    # Import the training data.
    # NOTE: test_s is extracted but not used here; this function only
    # evaluates on the train2 and valid sets.
    print("Extracting the data sets...")
    train_s, train2_s, valid_s, test_s = tokenizer.extract_data(
        split=split,
        normalize=normalize,
        remove_999=remove_999,
        noise_variance=noise_var,
        n_classes="multiclass",
        train_size=train_size,
        train_size2=train_size2,
        valid_size=valid_size)

    print("Training on the train set ...")

    # XGBOOST
    kwargs_xgb = {
        'bst_parameters': {
            'booster_type': 0,
            # 'objective': 'binary:logitraw',
            'objective': 'multi:softprob',
            'num_class': 5,
            'bst:eta': 0.1,  # the bigger the more conservative
            'bst:subsample': 1,  # prevent over fitting if <1
            'bst:max_depth': max_depth,
            'eval_metric': 'auc',
            'silent': 1,
            'nthread': 8,
        },
        'n_rounds': n_rounds,
    }

    predictor_s = xgBoost.train_classifier(train_s[1], train_s[2],
                                           train_s[3], 550000, kwargs_xgb)

    # TRAIN AND VALID
    # The number of subsets is taken from the data instead of being
    # hard-coded, so the loops follow whatever the tokenizer produced.
    print("Classifying the train2 set...")
    yProbaTrain2_s = [
        xgBoost.predict_proba(predictor_s[i], train2_s[1][i])
        for i in range(len(train2_s[1]))
    ]

    print("Classifying the valid set...")
    yProbaValid_s = [
        xgBoost.predict_proba(predictor_s[i], valid_s[1][i])
        for i in range(len(valid_s[1]))
    ]

    # Vectorised form of "1 - proba[j][0]" for every row j; the cast keeps
    # the float64 dtype that the former np.zeros buffers had.
    print("Making the binary proba vector...")
    yProbaTrain2Binary_s = [(1 - proba[:, 0]).astype(np.float64)
                            for proba in yProbaTrain2_s]
    yProbaValidBinary_s = [(1 - proba[:, 0]).astype(np.float64)
                           for proba in yProbaValid_s]

    print("Concatenating the vectors...")
    yProbaTrain2Binary = preTreatment.concatenate_vectors(yProbaTrain2Binary_s)
    yProbaValidBinary = preTreatment.concatenate_vectors(yProbaValidBinary_s)
    yTrain2 = preTreatment.concatenate_vectors(train2_s[2])
    yValid = preTreatment.concatenate_vectors(valid_s[2])
    weightsTrain2 = preTreatment.concatenate_vectors(train2_s[3])
    weightsValid = preTreatment.concatenate_vectors(valid_s[3])

    print("Putting all the real labels to 1")
    yTrain2 = preTreatment.multiclass2binary(yTrain2)
    yValid = preTreatment.multiclass2binary(yValid)

    # The ratio is tuned on train2 only, then applied unchanged to valid.
    print("Getting the best ratios...")
    best_ams_train2_global, best_ratio_global = tresholding.best_ratio(
        yProbaTrain2Binary, yTrain2, weightsTrain2)

    yPredictedValid = tresholding.get_yPredicted_ratio(yProbaValidBinary, 0.15)
    yPredictedValid_best_ratio_global = tresholding.get_yPredicted_ratio(
        yProbaValidBinary, best_ratio_global)

    # Let's compute the predicted AMS
    s, b = submission.get_s_b(yPredictedValid, yValid, weightsValid)
    AMS = hbc.AMS(s, b)

    s_best_ratio_global, b_best_ratio_global = submission.get_s_b(
        yPredictedValid_best_ratio_global, yValid, weightsValid)
    AMS_best_ratio_global = hbc.AMS(s_best_ratio_global, b_best_ratio_global)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("AMS 0.15 = %f" % AMS)
    print(" ")
    print(" ")
    print("best AMS valid ratio global= %f" % AMS_best_ratio_global)
    print("best AMS train2 ratio global= %f" % best_ams_train2_global)
    print("best ratio global train2 = %f" % best_ratio_global)

    return AMS
yTrain2, weightsTrain2) print "Train2 - best ratio : %s - best ams : %f" \ %(', '.join(map(str,best_ratio)), best_ams_train2) print(" ") print "Making predictions on the validation set..." # Prediction of the validation set 2: predProba_Valid2_s = xgBoost.predict_proba(predictor_s, valid_RM_s_2[1]) # Thresholding the predictions: predProba_Valid2 = preTreatment.concatenate_vectors(predProba_Valid2_s) predLabel5_Valid2 = tresholding.get_yPredicted_ratio(predProba_Valid2, best_ratio) # Binarize the prediction: predLabel_Valid2 = preTreatment.multiclass2binary(predLabel5_Valid2) # Concatenate data: yValid2 = preTreatment.concatenate_vectors(valid_RM_s_2[2]) weightsValidation = preTreatment.concatenate_vectors(valid_RM_s_2[3]) # Estimation the AMS: s, b = submission.get_s_b(predLabel_Valid2, yValid2, weightsValidation) s *= 250000/predLabel_Valid2.shape[0] b *= 250000/predLabel_Valid2.shape[0] ams = hbc.AMS(s,b) print "Valid_RM_2 - ratio : %f - best ams : %f" %(best_ratio, ams)
def _signal_proba(proba):
    """Return ``1 - proba[:, 0]`` as a float64 vector (probability of 's')."""
    return (1 - proba[:, 0]).astype(np.float64)


def _binarize_labels_inplace(labels_s):
    """If `labels_s` is a list of arrays, set every label >= 1 to 1 in place."""
    if isinstance(labels_s, list):
        for labels in labels_s:
            labels[labels >= 1] = 1


def _concatenate_if_list(vectors):
    """Concatenate a list of vectors; return anything else unchanged."""
    if isinstance(vectors, list):
        return preTreatment.concatenate_vectors(vectors)
    return vectors


def analyse(train_s, train2_s, valid_s, method_name, kwargs=None):
    """Train a method, tune its thresholds on train2 and score it on valid.

    Parameters
    ----------
    train_s
        Training set for the classifier(s).  Index 1 is passed as features,
        index 2 as labels; index 3 holds the weights.
    train2_s
        Training set for the meta-parameters (e.g. the best threshold), with
        the same layout.
    valid_s
        Validation set, with the same layout.
    method_name : str
        Name of the method (e.g. "naiveBayes").  It is evaluated to obtain
        the object providing ``train_classifier``, ``predict_proba`` and
        ``get_classification_error``.
    kwargs : dict, optional
        Parameters of the method, forwarded to ``train_classifier``.
        Defaults to an empty dict.

    None of the sets may be empty.

    Side effects: when ``valid_s[2]`` / ``train2_s[2]`` are lists, their
    label arrays are binarized in place (every label >= 1 becomes 1).
    The weight arrays of the caller are never modified.

    Returns
    -------
    dict
        The classifier(s), the probability vectors, the predictions, the
        tuned threshold/ratio values, the AMS on train2 and valid and the
        classification scores.  The ``*_combinaison`` entries are present
        only when ``train_s[2]`` is a list.
    """
    if kwargs is None:
        kwargs = {}

    # NOTE(review): `method_name` is executed as Python code.  It must never
    # come from untrusted input; a lookup table of known methods would be
    # safer.  It is evaluated once here instead of at every use.
    method = eval(method_name)

    # Prediction on the validation set:
    print("------------------- Analyse: %s -----------------------"
          % method_name)

    classifier_s = method.train_classifier(train_s[1], train_s[2], kwargs)

    yProbaTrain2_s = method.predict_proba(classifier_s, train2_s[1])
    yProbaValid_s = method.predict_proba(classifier_s, valid_s[1])

    # Merge the several 's' classes of the validation and train2 labels into
    # one single 's' class (only done when the labels come as a list).
    _binarize_labels_inplace(valid_s[2])
    _binarize_labels_inplace(train2_s[2])

    # Vectors of probabilities of being 's'.
    if isinstance(yProbaTrain2_s, list):
        yProbaTrain2Binary_s = [_signal_proba(p) for p in yProbaTrain2_s]
    else:
        yProbaTrain2Binary_s = _signal_proba(yProbaTrain2_s)

    if isinstance(yProbaValid_s, list):
        yProbaValidBinary_s = [_signal_proba(p) for p in yProbaValid_s]
    else:
        yProbaValidBinary_s = _signal_proba(yProbaValid_s)

    # If we work with lists, let's get the concatenated vectors.
    # TRAIN SET
    weightsTrain_conca = _concatenate_if_list(train_s[3])
    # VALID SET
    yValid_conca = _concatenate_if_list(valid_s[2])
    weightsValid_conca = _concatenate_if_list(valid_s[3])
    yProbaValidBinary_conca = _concatenate_if_list(yProbaValidBinary_s)
    yProbaValid_conca = _concatenate_if_list(yProbaValid_s)
    # TRAIN2 SET
    yTrain2_conca = _concatenate_if_list(train2_s[2])
    weightsTrain2_conca = _concatenate_if_list(train2_s[3])
    yProbaTrain2Binary_conca = _concatenate_if_list(yProbaTrain2Binary_s)
    yProbaTrain2_conca = _concatenate_if_list(yProbaTrain2_s)

    # Rebalance the weights so that the train2 and valid weights each sum to
    # the total weight of the three sets.  The factors are computed BEFORE
    # any rescaling: computing them from already-rescaled vectors yields a
    # factor of 1 and leaves the per-group weights unbalanced.
    sumWeightsTotal = (np.sum(weightsTrain_conca)
                       + np.sum(weightsTrain2_conca)
                       + np.sum(weightsValid_conca))
    train2_factor = sumWeightsTotal / np.sum(weightsTrain2_conca)
    valid_factor = sumWeightsTotal / np.sum(weightsValid_conca)

    # Rescaled copies are used rather than in-place updates so the caller's
    # weights are untouched and repeated calls do not compound the scaling.
    weightsTrain2_conca = weightsTrain2_conca * train2_factor
    weightsValid_conca = weightsValid_conca * valid_factor
    if isinstance(train2_s[3], list):
        weightsTrain2_s = [w * train2_factor for w in train2_s[3]]
    else:
        weightsTrain2_s = weightsTrain2_conca

    # Let's get the best global threshold on the train2 set
    AMS_treshold_train2, best_treshold_global = tresholding.best_treshold(
        yProbaTrain2Binary_conca, yTrain2_conca, weightsTrain2_conca)
    yPredictedValid_conca_treshold = tresholding.get_yPredicted_treshold(
        yProbaValidBinary_conca, best_treshold_global)

    # Let's get the best global ratio on the train2 set
    AMS_ratio_global_train2, best_ratio_global = tresholding.best_ratio(
        yProbaTrain2Binary_conca, yTrain2_conca, weightsTrain2_conca)
    yPredictedValid_conca_ratio_global = tresholding.get_yPredicted_ratio(
        yProbaValidBinary_conca, best_ratio_global)

    # The per-group ratio combination only makes sense with grouped data.
    use_combinaison = isinstance(train_s[2], list)

    if use_combinaison:
        AMS_ratio_combinaison_train2, best_ratio_combinaison = \
            tresholding.best_ratio_combinaison_global(
                yProbaTrain2Binary_s, train2_s[2], weightsTrain2_s, 30)
        _, yPredictedValid_conca_ratio_combinaison = \
            tresholding.get_yPredicted_ratio_8(
                yProbaValidBinary_s, best_ratio_combinaison)

    # Let's compute the final s and b for each method
    s_treshold, b_treshold = submission.get_s_b(
        yPredictedValid_conca_treshold, yValid_conca, weightsValid_conca)
    s_ratio_global, b_ratio_global = submission.get_s_b(
        yPredictedValid_conca_ratio_global, yValid_conca, weightsValid_conca)
    if use_combinaison:
        s_ratio_combinaison, b_ratio_combinaison = submission.get_s_b(
            yPredictedValid_conca_ratio_combinaison, yValid_conca,
            weightsValid_conca)

    # AMS final:
    AMS_treshold_valid = hbc.AMS(s_treshold, b_treshold)
    AMS_ratio_global_valid = hbc.AMS(s_ratio_global, b_ratio_global)
    if use_combinaison:
        AMS_ratio_combinaison_valid = hbc.AMS(s_ratio_combinaison,
                                              b_ratio_combinaison)

    # Classification error:
    classif_succ_treshold = method.get_classification_error(
        yPredictedValid_conca_treshold, yValid_conca, normalize=True)
    classif_succ_ratio_global = method.get_classification_error(
        yPredictedValid_conca_ratio_global, yValid_conca, normalize=True)
    if use_combinaison:
        # Guarded: the combination predictions only exist for grouped data.
        classif_succ_ratio_combinaison = method.get_classification_error(
            yPredictedValid_conca_ratio_combinaison, yValid_conca,
            normalize=True)

    d = {
        'classifier_s': classifier_s,
        'yPredictedValid_conca_treshold': yPredictedValid_conca_treshold,
        'yPredictedValid_conca_ratio_global':
            yPredictedValid_conca_ratio_global,
        'yProbaTrain2_s': yProbaTrain2_s,
        'yProbaTrain2Binary_s': yProbaTrain2Binary_s,
        'yProbaTrain2_conca': yProbaTrain2_conca,
        'yProbaTrain2Binary_conca': yProbaTrain2Binary_conca,
        'yProbaValid_s': yProbaValid_s,
        'yProbaValidBinary_s': yProbaValidBinary_s,
        'yProbaValid_conca': yProbaValid_conca,
        'yProbaValidBinary_conca': yProbaValidBinary_conca,
        'AMS_treshold_train2': AMS_treshold_train2,
        'AMS_ratio_global_train2': AMS_ratio_global_train2,
        'AMS_treshold_valid': AMS_treshold_valid,
        'AMS_ratio_global_valid': AMS_ratio_global_valid,
        'best_treshold_global': best_treshold_global,
        'best_ratio_global': best_ratio_global,
        'classif_succ_treshold': classif_succ_treshold,
        'classif_succ_ratio_global': classif_succ_ratio_global,
        'method': method_name,
        'parameters': kwargs,
    }

    if use_combinaison:
        d['yPredictedValid_conca_ratio_combinaison'] = \
            yPredictedValid_conca_ratio_combinaison
        d['AMS_ratio_combinaison_train2'] = AMS_ratio_combinaison_train2
        # NOTE(review): these two values have always been stored as 1-tuples
        # because of stray trailing commas.  The shape is kept so existing
        # callers keep working; unwrap here once the callers are checked.
        d['AMS_ratio_combinaison_valid'] = (AMS_ratio_combinaison_valid,)
        d['best_ratio_combinaison'] = (best_ratio_combinaison,)
        d['classif_succ_ratio_combinaison'] = classif_succ_ratio_combinaison

    return d