def model_test_data_evaluation(test_data_file, var_list_filename, model_file, output_dir, output_suffix, good_downsample_rate):
    """Load a pickled model, score a test data set, and evaluate performance.

    Parameters
    ----------
    test_data_file : str
        Path to the test data file consumed by ``load_data_fast``.
    var_list_filename : str
        Path to the predictor-variable list file.
    model_file : str
        Path to the pickled (binary) model.
    output_dir : str
        Directory where ``performance_eval_test_downsample`` writes output.
    output_suffix : str
        Suffix used to tag the evaluation output files.
    good_downsample_rate : float
        Downsample rate applied to the good (non-target) records; used by
        the evaluator to re-weight metrics back to population level.

    Returns
    -------
    tuple
        ``(ks, auc, lorenz_curve_capt_rate)`` as produced by
        ``performance_eval_test_downsample``.
    """
    # ---- Load test data ----
    print('Loading test data starts ...')
    t0 = time.time()
    target_name = 'target'
    X, y = load_data_fast(test_data_file, var_list_filename, target_name)
    print('Loading test data done, taking ' + str(time.time() - t0) + ' secs')

    # ---- Load model ----
    # NOTE(review): pickle can execute arbitrary code on load; only load
    # model files from trusted sources.
    # Use a context manager so the file handle is closed (the original
    # pickle.load(open(...)) leaked the handle).
    print('Loading model ...')
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

    # ---- Score test data ----
    # Only the positive-class probability (column 1) is needed; the hard
    # class prediction computed by the original was unused and is dropped.
    p_pred = model.predict_proba(X)[:, 1]

    # ---- Evaluate performance on the test set ----
    print('Evalutate model performance ...')
    ks, auc, lorenz_curve_capt_rate = performance_eval_test_downsample(
        y, p_pred, output_dir, output_suffix, good_downsample_rate)
    return ks, auc, lorenz_curve_capt_rate
def model_test_data_evaluation_comp_ruletag(test_data_file, var_list_filename, model_file, output_dir, output_suffix, good_downsample_rate):
    """Evaluate a pickled model on test data and compare it against a
    manual-review rule tag.

    Ad hoc analysis: besides the usual KS/AUC/Lorenz evaluation, this finds
    the score threshold that matches the rule's fraud catch rate and reports
    catch/hit/refer rates for both the rule and the model at that threshold.

    Parameters
    ----------
    test_data_file : str
        Path to the test data file consumed by ``load_data_with_key_tag_fast``.
    var_list_filename : str
        Path to the predictor-variable list file.
    model_file : str
        Path to the pickled (binary) model.
    output_dir : str
        Directory where ``performance_eval_test_downsample`` writes output.
    output_suffix : str
        Suffix used to tag the evaluation output files.
    good_downsample_rate : float
        Downsample rate applied to the good (non-target) records.

    Returns
    -------
    tuple
        ``(ks, auc, lorenz_curve_capt_rate, rule_model_rates)`` where
        ``rule_model_rates`` is ``[catch_rate_rule, hit_rate_rule,
        refer_rate_rule, catch_rate_score, hit_rate_score,
        refer_rate_score, score_threshold]``.
    """
    # ---- Load test data, including the payment key and the rule tag ----
    print('Loading test data starts ...')
    t0 = time.time()
    target_name = 'target'
    key_name = 'payment_request_id'
    tag_name = 'manual_review'
    X, y, key, tag = load_data_with_key_tag_fast(
        test_data_file, var_list_filename, target_name, key_name, tag_name)
    print('Loading test data done, taking ' + str(time.time() - t0) + ' secs')

    # ---- Load model ----
    # NOTE(review): pickle can execute arbitrary code on load; only load
    # model files from trusted sources.  Context manager closes the handle
    # (the original pickle.load(open(...)) leaked it).
    print('Loading model ...')
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

    # ---- Score test data: positive-class probability only ----
    # The hard class prediction computed by the original was unused.
    p_pred = model.predict_proba(X)[:, 1]

    # ---- Overall performance evaluation ----
    print('Evalutate model performance ...')
    ks, auc, lorenz_curve_capt_rate = performance_eval_test_downsample(
        y, p_pred, output_dir, output_suffix, good_downsample_rate)

    # ---- Compare catch/hit/refer rates between model and rule ----
    # Goods were downsampled at good_downsample_rate, so weight each good
    # record by 1/good_downsample_rate (bads keep weight 1) to recover
    # population-level rates.  1.0 guards against Python 2 integer
    # division when the rate is passed as an int.
    scale_factor = (1 - y) * (1.0 / good_downsample_rate) + y

    # Rates achieved by the existing manual-review rule.
    catch_rate_rule = sum(y * tag * scale_factor) / sum(y * scale_factor)    # fraud caught by rule / total fraud
    hit_rate_rule = sum(y * tag * scale_factor) / sum(tag * scale_factor)    # fraud caught by rule / total referred by rule
    refer_rate_rule = sum(tag * scale_factor) / sum(scale_factor)            # referred by rule / total volume

    # Find the score threshold that reproduces the rule's catch rate on
    # the fraud payments, then measure the model's rates at that cutoff.
    score_fraud_pmt = p_pred[y == 1]
    score_threshold = percentile(score_fraud_pmt, (1 - catch_rate_rule) * 100)
    score_referred = p_pred >= score_threshold
    catch_rate_score = sum(y * score_referred * scale_factor) / sum(y * scale_factor)              # fraud caught by score / total fraud
    hit_rate_score = sum(y * score_referred * scale_factor) / sum(score_referred * scale_factor)   # fraud caught by score / total referred by score
    refer_rate_score = sum(score_referred * scale_factor) / sum(scale_factor)                      # referred by score / total volume

    rule_model_rates = [catch_rate_rule, hit_rate_rule, refer_rate_rule,
                        catch_rate_score, hit_rate_score, refer_rate_score,
                        score_threshold]
    print(['catch_rate_rule', 'hit_rate_rule', 'refer_rate_rule',
           'catch_rate_score', 'hit_rate_score', 'refer_rate_score',
           'score_threshold'])
    print(rule_model_rates)
    return ks, auc, lorenz_curve_capt_rate, rule_model_rates
# NOTE(review): this REDEFINES model_test_data_evaluation_comp_ruletag and
# therefore shadows the earlier definition that also computes the
# rule-vs-model comparison rates.  Callers importing this module get THIS
# version, which only returns (ks, auc, lorenz_curve_capt_rate).  If both
# behaviors are needed, one of the two definitions should be renamed.
def model_test_data_evaluation_comp_ruletag(test_data_file, var_list_filename, model_file, output_dir, output_suffix, good_downsample_rate):
    """Load a pickled model, score a test data set (with key and rule tag),
    and evaluate performance.

    Despite the ``comp_ruletag`` name, this version performs no rule
    comparison: it loads ``payment_request_id`` and ``manual_review``
    alongside the features but only runs the standard evaluation.

    Returns
    -------
    tuple
        ``(ks, auc, lorenz_curve_capt_rate)`` as produced by
        ``performance_eval_test_downsample``.
    """
    # ---- Load test data (with payment key and manual-review tag) ----
    print('Loading test data starts ...')
    t0 = time.time()
    target_name = 'target'
    key_name = 'payment_request_id'
    tag_name = 'manual_review'
    X, y, key, tag = load_data_with_key_tag_fast(
        test_data_file, var_list_filename, target_name, key_name, tag_name)
    print('Loading test data done, taking ' + str(time.time() - t0) + ' secs')

    # ---- Load model ----
    # Context manager closes the handle (the original
    # pickle.load(open(...)) leaked it).  pickle is unsafe on untrusted
    # files; only load trusted models.
    print('Loading model ...')
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

    # ---- Score test data: positive-class probability only ----
    # The hard class prediction computed by the original was unused.
    p_pred = model.predict_proba(X)[:, 1]

    # ---- Evaluate performance on the test set ----
    print('Evalutate model performance ...')
    ks, auc, lorenz_curve_capt_rate = performance_eval_test_downsample(
        y, p_pred, output_dir, output_suffix, good_downsample_rate)
    return ks, auc, lorenz_curve_capt_rate
# ---- Script fragment: train a random forest and evaluate on train/test ----
# NOTE(review): `df`, `var_list`, `output_dir` (and the imports for
# pd / RandomForestClassifier / performance_eval_test_downsample) are
# defined earlier in the file, outside this view.  The fragment also
# appears to be cut off after output_suffix="test" — presumably a test
# evaluation call follows; confirm against the full file.
field_types=df.dtypes  # dtypes of the training frame (apparently for inspection; not used below)
X=df[var_list]
y=df['target']
# fit
# max_depth=None grows trees fully; fixed random_state for reproducibility;
# n_jobs=-1 uses all cores.
model=RandomForestClassifier(max_depth=None, n_estimators=200, max_features="auto",random_state=0,n_jobs=-1)
model.fit(X,y)
# pred
y_pred = model.predict(X)
p_pred = model.predict_proba(X)
p_pred = p_pred[:,1]  # keep only the positive-class probability
# performance train
# good_downsample_rate=1: no re-weighting — training data is scored as-is.
output_suffix="train"
ks, auc, lorenz_curve_capt_rate = performance_eval_test_downsample(y,p_pred,output_dir,output_suffix,good_downsample_rate=1)
###################### test ######################
# load test data
dft = pd.read_csv('../data/data2_imp_woe.csv.gz',compression="gzip")
Xt=dft[var_list]
yt=dft['target']
# pred test
yt_pred = model.predict(Xt)
pt_pred = model.predict_proba(Xt)
pt_pred = pt_pred[:,1]  # positive-class probability for the test set
# performance test
output_suffix="test"