def model_test_data_evaluation(test_data_file, var_list_filename, model_file, output_dir, output_suffix):
    """Score a test data set with a pickled model and evaluate its performance.

    NOTE(review): another function with this same name is defined later in
    this file; when the module is loaded the later definition replaces this
    one. Confirm which version callers are meant to get.

    Args:
        test_data_file: Test data file, passed to ``load_data_fast``.
        var_list_filename: Variable list file, passed to ``load_data_fast``.
        model_file: Path of the pickled model. The unpickled object must
            provide ``predict_proba``.
        output_dir: Passed through to ``performance_eval_test``.
        output_suffix: Passed through to ``performance_eval_test``.

    Returns:
        The ``(ks, auc, lorenz_curve_capt_rate)`` triple produced by
        ``performance_eval_test``.
    """
    # ---- Load test data ----
    print('Loading test data starts ...')
    t0 = time.time()
    target_name = 'target'
    X, y = load_data_fast(test_data_file, var_list_filename, target_name)
    # One pre-formatted string keeps the message identical whether print is
    # a statement (Python 2) or a function (Python 3).
    print("Loading test data done, taking  %s secs" % (time.time() - t0))

    # ---- Load model ----
    print('Loading model ...')
    # NOTE: unpickling can execute arbitrary code; model_file must come from
    # a trusted source. The context manager closes the handle promptly.
    with open(model_file, 'rb') as model_fh:
        model = pickle.load(model_fh)

    # ---- Score test data ----
    # Column 1 of predict_proba is used as the score (presumably the
    # probability of the target == 1 class -- confirm against the model).
    p_pred = model.predict_proba(X)[:, 1]

    # ---- Performance evaluation ----
    print('Evalutate model performance ...')
    ks, auc, lorenz_curve_capt_rate = performance_eval_test(y, p_pred, output_dir, output_suffix)

    return ks, auc, lorenz_curve_capt_rate
def model_test_data_evaluation_comp_ruletag(test_data_file, var_list_filename, model_file, output_dir, output_suffix, good_downsample_rate):
    """Evaluate a pickled model on test data and compare it with the rule tag.

    Ad hoc comparison of model scores against the ``manual_review`` rule
    tag. Besides the standard performance evaluation, it writes a per-row
    CSV and computes catch / hit / refer rates for the rule and for a score
    threshold chosen to give the same catch rate as the rule.

    Args:
        test_data_file: Test data file, passed to
            ``load_data_with_key_tag_fast``.
        var_list_filename: Variable list file, passed to
            ``load_data_with_key_tag_fast``.
        model_file: Path of the pickled model. The unpickled object must
            provide ``predict_proba``.
        output_dir: Prefix for output paths. It is concatenated directly
            with the file name, so include a trailing path separator.
        output_suffix: Suffix used in the output file name and passed to
            ``performance_eval_test``.
        good_downsample_rate: Rows with ``y == 0`` are weighted by
            ``1 / good_downsample_rate``; rows with ``y == 1`` get weight 1.
            Presumably the fraction of non-fraud rows kept when the data was
            sampled -- confirm with the data preparation step.

    Returns:
        ``(ks, auc, lorenz_curve_capt_rate, rule_model_rates)`` where
        ``rule_model_rates`` is the list ``[catch_rate_rule, hit_rate_rule,
        refer_rate_rule, catch_rate_score, hit_rate_score, refer_rate_score,
        score_threshold]``.
    """
    # ---- Load test data ----
    print('Loading test data starts ...')
    t0 = time.time()
    target_name = 'target'
    key_name = 'payment_request_id'
    tag_name = 'manual_review'
    X, y, key, tag = load_data_with_key_tag_fast(
        test_data_file, var_list_filename, target_name, key_name, tag_name)
    # One pre-formatted string keeps the message identical whether print is
    # a statement (Python 2) or a function (Python 3).
    print("Loading test data done, taking  %s secs" % (time.time() - t0))

    # ---- Load model ----
    print('Loading model ...')
    # NOTE: unpickling can execute arbitrary code; model_file must come from
    # a trusted source. The context manager closes the handle promptly.
    with open(model_file, 'rb') as model_fh:
        model = pickle.load(model_fh)

    # ---- Score test data ----
    # Column 1 of predict_proba is used as the score (presumably the
    # probability of the target == 1 class -- confirm against the model).
    p_pred = model.predict_proba(X)[:, 1]

    # ---- Performance evaluation ----
    print('Evalutate model performance ...')
    ks, auc, lorenz_curve_capt_rate = performance_eval_test(y, p_pred, output_dir, output_suffix)

    # ---- Compare catch_rate, hit_rate, refer_rate between model and rule ----
    # Per-row weight: 1 for y == 1, 1/good_downsample_rate for y == 0.
    # 1.0 forces true division; under Python 2 an integer rate would
    # otherwise truncate 1/rate to 0 and zero out every y == 0 row.
    # Assumes y and tag are numeric arrays supporting element-wise math.
    scale_factor = (1 - y) * (1.0 / good_downsample_rate) + y

    # Dump per-row results. The header names all five columns written per
    # row, and the with-block guarantees the file is flushed and closed.
    score_path = output_dir + "score_ruletag_" + output_suffix + ".csv"
    with open(score_path, 'w') as score_file:
        rule_cmp_outfile = csv.writer(score_file)
        rule_cmp_outfile.writerow(
            ['payment_request_id', 'fraud_tag', 'score', 'manual_review_tag', 'scale_factor'])
        rule_cmp_outfile.writerows(zip(key, y, p_pred, tag, scale_factor))

    # Weighted totals shared by the rule and score rates.
    weighted_fraud = sum(y * scale_factor)
    weighted_total = sum(scale_factor)

    # Rates of the rule tag.
    fraud_caught_by_rule = sum(y * tag * scale_factor)
    referred_by_rule = sum(tag * scale_factor)
    catch_rate_rule = fraud_caught_by_rule / weighted_fraud   # fraud found by rule tag / total fraud
    hit_rate_rule = fraud_caught_by_rule / referred_by_rule   # fraud found by rule tag / total referred by rule
    refer_rate_rule = referred_by_rule / weighted_total       # total referred by rule / all payments

    # Pick the score threshold that yields the same catch rate as the rule:
    # the (1 - catch_rate) percentile of the scores of fraud rows.
    score_fraud_pmt = p_pred[y == 1]
    score_threshold = percentile(score_fraud_pmt, (1 - catch_rate_rule) * 100)
    score_referred = p_pred >= score_threshold

    # Rates of the score at that threshold.
    fraud_caught_by_score = sum(y * score_referred * scale_factor)
    referred_by_score = sum(score_referred * scale_factor)
    catch_rate_score = fraud_caught_by_score / weighted_fraud    # fraud found by score / total fraud
    hit_rate_score = fraud_caught_by_score / referred_by_score   # fraud found by score / total referred by score
    refer_rate_score = referred_by_score / weighted_total        # total referred by score / all payments

    rule_model_rates = [catch_rate_rule, hit_rate_rule, refer_rate_rule,
                        catch_rate_score, hit_rate_score, refer_rate_score,
                        score_threshold]
    print(['catch_rate_rule', 'hit_rate_rule', 'refer_rate_rule',
           'catch_rate_score', 'hit_rate_score', 'refer_rate_score',
           'score_threshold'])
    print(rule_model_rates)

    return ks, auc, lorenz_curve_capt_rate, rule_model_rates
def model_test_data_evaluation(test_data_file, var_list_filename, model_file, result_dir, output_suffix):
    """Score test data with a pickled model, save the scores, and evaluate.

    NOTE(review): an earlier function with this same name appears in this
    file; this later definition is the one in effect once the module is
    loaded. Unlike the earlier one, it returns nothing.

    Args:
        test_data_file: Test data file, passed to ``load_data_with_key``.
        var_list_filename: Variable list file, passed to
            ``load_data_with_key``.
        model_file: Path of the pickled model. The unpickled object must
            provide ``predict_proba``.
        result_dir: Prefix for output paths. It is concatenated directly
            with the file name, so include a trailing path separator.
        output_suffix: Appended to ``"score_"`` to form the score file name
            (no extension is added) and passed to ``performance_eval_test``.

    Returns:
        None. Results are written to the score file and by
        ``performance_eval_test``.
    """
    # ---- Load test data ----
    print('Loading test data starts ...')
    t0 = time.time()
    target_name = 'target'
    key_name = 'payment_request_id'
    X, y, key = load_data_with_key(test_data_file, var_list_filename, target_name, key_name)
    # One pre-formatted string keeps the message identical whether print is
    # a statement (Python 2) or a function (Python 3).
    print("Loading test data done, taking  %s secs" % (time.time() - t0))

    # ---- Load model ----
    print('Loading model ...')
    # NOTE: unpickling can execute arbitrary code; model_file must come from
    # a trusted source. The context manager closes the handle promptly.
    with open(model_file, 'rb') as model_fh:
        model = pickle.load(model_fh)

    # ---- Score test data ----
    # Column 1 of predict_proba is used as the score (presumably the
    # probability of the target == 1 class -- confirm against the model).
    p_pred = model.predict_proba(X)[:, 1]

    # ---- Output test data score results ----
    # The with-block closes the file even if a row fails to serialize.
    with open(result_dir + "score_" + output_suffix, "w") as score_file:
        score_csv = csv.writer(score_file)
        score_csv.writerow(["payment_request_id", "score"])
        score_csv.writerows(zip(key, p_pred))

    # ---- Performance evaluation ----
    print('Evalutate model performance ...')
    performance_eval_test(y, p_pred, result_dir, output_suffix)