import time
import pickle

# load_data_fast and performance_eval_test_downsample are helpers defined
# elsewhere in this repo.

def model_test_data_evaluation(test_data_file, var_list_filename, model_file, output_dir, output_suffix, good_downsample_rate):
    
    #################### Load Model and Evaluate Performance ##################
    ############################### Test Data #################################
    
    # Load Test Data
    print('Loading test data starts ...')
    t0 = time.time()
    target_name = 'target'
    X, y = load_data_fast(test_data_file, var_list_filename, target_name)
    print('Loading test data done, taking', time.time() - t0, 'secs')
    
    # Load Model
    print('Loading model ...')
    t0 = time.time()
    with open(model_file, 'rb') as f:  # close the handle after unpickling
        model = pickle.load(f)
    print('Loading model done, taking', time.time() - t0, 'secs')
    
    # Predict Test Data
    y_pred = model.predict(X)        # hard class labels (not used below)
    p_pred = model.predict_proba(X)  # class probabilities, shape (n, 2)
    p_pred = p_pred[:, 1]            # keep P(target == 1)

    # Performance Evaluation: Test
    print('Evaluating model performance ...')
    ks, auc, lorenz_curve_capt_rate = performance_eval_test_downsample(y, p_pred, output_dir, output_suffix, good_downsample_rate)
    
    return ks, auc, lorenz_curve_capt_rate
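
# A minimal usage sketch. Every path, the suffix, and the downsample rate
# below are hypothetical placeholders, not values from the original repo:
if __name__ == '__main__':
    ks, auc, capt_rate = model_test_data_evaluation(
        test_data_file='test_data.csv.gz',
        var_list_filename='var_list.txt',
        model_file='rf_model.pkl',
        output_dir='./output/',
        output_suffix='holdout',
        good_downsample_rate=0.1)
    print('KS:', ks, 'AUC:', auc)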
Example 2
import time
import pickle
import csv  # used only by the optional dump block below
import numpy as np

def model_test_data_evaluation_comp_ruletag(test_data_file, var_list_filename, model_file, output_dir, output_suffix, good_downsample_rate):
    
    #################### Load Model and Evaluate Performance ##################
    ############################### Test Data #################################
    # Ad hoc code: compare the model's results with the manual-review rule
    
    # Load Test Data
    print('Loading test data starts ...')
    t0 = time.time()
    target_name = 'target'
    key_name = 'payment_request_id'
    tag_name = 'manual_review'
    X, y, key, tag = load_data_with_key_tag_fast(test_data_file, var_list_filename, target_name, key_name, tag_name)
    print('Loading test data done, taking', time.time() - t0, 'secs')
    
    # Load Model
    print('Loading model ...')
    t0 = time.time()
    with open(model_file, 'rb') as f:
        model = pickle.load(f)
    print('Loading model done, taking', time.time() - t0, 'secs')
    
    # Predict Test Data
    y_pred = model.predict(X)        # hard class labels (not used below)
    p_pred = model.predict_proba(X)  # class probabilities, shape (n, 2)
    p_pred = p_pred[:, 1]            # keep P(target == 1)

    # Performance Evaluation: Test
    print('Evaluating model performance ...')
    ks, auc, lorenz_curve_capt_rate = performance_eval_test_downsample(y, p_pred, output_dir, output_suffix, good_downsample_rate)
    
    ############ compare catch_rate, hit_rate, refer_rate between model and rule ############
    # Goods were downsampled at good_downsample_rate, so each sampled good
    # (y == 0) represents 1/good_downsample_rate payments in the full
    # population; frauds (y == 1) keep weight 1.
    scale_factor = (1 - y) * (1.0 / good_downsample_rate) + y
    '''
    # Optional per-payment dump of score vs. rule tag:
    rule_cmp_outfile = csv.writer(open(output_dir + "score_ruletag_" + output_suffix + ".csv", 'w'))
    rule_cmp_outfile.writerow(['payment_request_id', 'fraud_tag', 'score', 'manual_review_tag', 'scale_factor'])
    for i in range(len(p_pred)):
        rule_cmp_outfile.writerow([key[i], y[i], p_pred[i], tag[i], scale_factor[i]])
    '''
    
    # rates achieved by the manual-review rule
    catch_rate_rule = sum(y * tag * scale_factor) / sum(y * scale_factor)  # fraud caught by rule / total fraud
    hit_rate_rule = sum(y * tag * scale_factor) / sum(tag * scale_factor)  # fraud caught by rule / total referred by rule
    refer_rate_rule = sum(tag * scale_factor) / sum(scale_factor)          # referred by rule / total payments
    
    # find the score threshold that matches the rule's catch rate, then
    # compute the hit_rate and refer_rate the model achieves at that threshold
    score_fraud_pmt = p_pred[y == 1]
    score_threshold = np.percentile(score_fraud_pmt, (1 - catch_rate_rule) * 100)
    score_referred = p_pred >= score_threshold
    
    catch_rate_score = sum(y * score_referred * scale_factor) / sum(y * scale_factor)             # fraud caught by score / total fraud
    hit_rate_score = sum(y * score_referred * scale_factor) / sum(score_referred * scale_factor)  # fraud caught by score / total referred by score
    refer_rate_score = sum(score_referred * scale_factor) / sum(scale_factor)                     # referred by score / total payments
    
    rule_model_rates = [catch_rate_rule, hit_rate_rule, refer_rate_rule, catch_rate_score, hit_rate_score, refer_rate_score, score_threshold]
    print(['catch_rate_rule', 'hit_rate_rule', 'refer_rate_rule', 'catch_rate_score', 'hit_rate_score', 'refer_rate_score', 'score_threshold'])
    print(rule_model_rates)
    
    return ks, auc, lorenz_curve_capt_rate, rule_model_rates
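
# The threshold-matching step above, in isolation: to make the score catch the
# same fraction of fraud as the rule, take the (1 - catch_rate) percentile of
# the fraud payments' scores as the cutoff, so that fraction of fraud scores
# at or above it. A self-contained sketch on synthetic scores (all values
# hypothetical):
import numpy as np

rng = np.random.default_rng(0)
scores_fraud = rng.beta(5, 2, size=10000)  # fraud payments tend to score high
catch_rate_rule = 0.60
score_threshold = np.percentile(scores_fraud, (1 - catch_rate_rule) * 100)
print((scores_fraud >= score_threshold).mean())  # ~0.60 by construction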
Example 3
import time
import pickle

def model_test_data_evaluation_comp_ruletag(test_data_file, var_list_filename,
                                            model_file, output_dir,
                                            output_suffix,
                                            good_downsample_rate):

    #################### Load Model and Evaluate Performance ##################
    ############################### Test Data #################################
    # Ad hoc code: compare the model's results with the manual-review rule
    # (this variant loads key and tag but returns before using them)

    # Load Test Data
    print('Loading test data starts ...')
    t0 = time.time()
    target_name = 'target'
    key_name = 'payment_request_id'
    tag_name = 'manual_review'
    X, y, key, tag = load_data_with_key_tag_fast(test_data_file,
                                                 var_list_filename,
                                                 target_name, key_name,
                                                 tag_name)
    print "Loading test data done, taking ", time.time() - t0, "secs"

    # Load Model
    print('Loading model ...')
    t0 = time.time()
    with open(model_file, 'rb') as f:
        model = pickle.load(f)
    print('Loading model done, taking', time.time() - t0, 'secs')

    # Predict Test Data
    y_pred = model.predict(X)
    p_pred = model.predict_proba(X)
    p_pred = p_pred[:, 1]

    # Performance Evaluation: Test
    print('Evaluating model performance ...')
    ks, auc, lorenz_curve_capt_rate = performance_eval_test_downsample(
        y, p_pred, output_dir, output_suffix, good_downsample_rate)

    return ks, auc, lorenz_curve_capt_rate
Example 4
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# df, var_list, output_dir, and performance_eval_test_downsample are assumed
# to be defined earlier in the original script.
field_types = df.dtypes
X = df[var_list]
y = df['target']

# fit (max_features="sqrt" is what the legacy "auto" setting meant for classifiers)
model = RandomForestClassifier(max_depth=None, n_estimators=200, max_features="sqrt", random_state=0, n_jobs=-1)
model.fit(X, y)

# pred
y_pred = model.predict(X)
p_pred = model.predict_proba(X)
p_pred = p_pred[:, 1]

# performance train
output_suffix="train"
ks, auc, lorenz_curve_capt_rate = performance_eval_test_downsample(y,p_pred,output_dir,output_suffix,good_downsample_rate=1)


###################### test ######################
# load test data
dft = pd.read_csv('../data/data2_imp_woe.csv.gz', compression="gzip")
Xt = dft[var_list]
yt = dft['target']

# pred test
yt_pred = model.predict(Xt)
pt_pred = model.predict_proba(Xt)
pt_pred = pt_pred[:, 1]

# performance test
output_suffix = "test"
ks, auc, lorenz_curve_capt_rate = performance_eval_test_downsample(yt, pt_pred, output_dir, output_suffix, good_downsample_rate=1)  # assumed: test data shares the train downsample rate of 1