コード例 #1
0
def load_if_exists(STORED_PICKLE_FILE, filepaths_dict, *, use_cache=False):
    """Evaluate each model listed in ``filepaths_dict`` and pickle the results.

    For every ``label -> inner_dict`` entry, ``inner_dict['input_file']`` and
    ``inner_dict['model_file']`` are passed to ``get_test_performance``; the
    returned model is then scored with ``validate_best_model``.  The outputs
    are stored back on the entry under the keys ``'metrics_df'``,
    ``'metrics_results'``, ``'y_test'`` and ``'X_test'``, and the whole
    dictionary is pickled to ``STORED_PICKLE_FILE``.

    Args:
        STORED_PICKLE_FILE: Path of the pickle file to write (and, when
            ``use_cache`` is true, to read).
        filepaths_dict: Mapping of label to a dict holding at least the
            ``'input_file'`` and ``'model_file'`` keys.  Mutated in place.
        use_cache: When true and ``STORED_PICKLE_FILE`` exists, return the
            unpickled contents instead of recomputing.  Defaults to ``False``
            because the cache lookup was previously hard-disabled
            (``if False:``); the default therefore always recomputes.

    Returns:
        The unpickled object when the cache is used, otherwise
        ``filepaths_dict`` with the evaluation results added.
    """
    if use_cache and os.path.isfile(STORED_PICKLE_FILE):
        print("loading pickled file...")
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at pickle files produced by a trusted run.
        with open(STORED_PICKLE_FILE, 'rb') as metrics_file:
            # Return what was loaded; previously the loaded object was
            # assigned to an unrelated name and then discarded.
            return pickle.load(metrics_file)

    print("creating data...")
    for label, inner_dict in filepaths_dict.items():

        print(label)
        X_test, y_test, xgb_model = get_test_performance(
            inner_dict['input_file'], inner_dict['model_file'])
        metrics_results, metrics_df, _ = validate_best_model(
            xgb_model, X_test, y_test)

        # inner_dict is the same object as filepaths_dict[label], so the
        # results end up in the dictionary that is pickled below.
        inner_dict.update({
            'metrics_df': metrics_df,
            'metrics_results': metrics_results,
            'y_test': y_test,
            'X_test': X_test,
        })

    # Context manager guarantees the file is flushed and closed.
    with open(STORED_PICKLE_FILE, 'wb') as pickle_file:
        pickle.dump(filepaths_dict, pickle_file)
    print("pickled model.")

    return filepaths_dict
コード例 #2
0
# Load the cached "up to" results if the pickle already exists; otherwise
# evaluate every model listed in up_to_dataset_dict.
# UP TO
if os.path.isfile(UPTO_STORED_DATA_FILE):
    print("loading pickled file...")
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # pickle files produced by a trusted run.
    # The context manager closes the handle, which was previously left open.
    with open(UPTO_STORED_DATA_FILE, 'rb') as metrics_file:
        up_to_dataset_dict = pickle.load(metrics_file)
else:
    print("creating data...")
    # NOTE(review): this branch does not write UPTO_STORED_DATA_FILE within
    # this span, and store_results is not used here -- confirm against the
    # rest of the script.
    store_results = {}
    for label, inner_dict in up_to_dataset_dict.items():

        print(label)
        X_test, y_test, xgb_model = get_test_performance(
            inner_dict['input_file'], inner_dict['model_file'])
        metrics_results, metrics_df, model_params = validate_best_model(
            xgb_model, X_test, y_test)

        # inner_dict is up_to_dataset_dict[label]; attach the results to it.
        inner_dict['metrics_df'] = metrics_df
        inner_dict['metrics_results'] = metrics_results
        inner_dict['y_test'] = y_test
        inner_dict['X_test'] = X_test

# Map each dataset key to its 'up_to_<key>_before_delivery' name.
mod_names_up_to = {
    key: 'up_to_{}_before_delivery'.format(key)
    for key in up_to_dataset_dict
}

# In[ ]:

# SET UP FIGURE PARAMETERS ....
コード例 #3
0
###    PATHS
###
DATE = datetime.now().strftime('%Y-%m-%d')

ROOT_DATA_DIR = "/dors/capra_lab/users/abraha1/projects/PTB_phenotyping/results/ptb_predict_machine_learning/2019_09_06_2nd_ptb_icd_cpt/equal_sample_size"
OUTPUT_DIR = "/dors/capra_lab/users/abraha1/projects/PTB_phenotyping/scripts/rand_forest_ptb_classification/manuscript/second_delivery"

# For each cutoff, load the input file and model file, score the model, and
# keep the precision-recall values returned by get_pr_coord.
# NOTE: the loop variable is named num_weeks, but its value is substituted
# into the "{0}d" / "{0}_days" parts of the file names below.
up_to_dataset_dict = OrderedDict()
for num_weeks in ['0', '90', '365']:

    input_file = os.path.join(
        ROOT_DATA_DIR,
        'eq_up_to_{0}d_before_second_delivery/input_data_eq_samp_size_raw_counts_icd_cpt_up_to_{0}_days_before_second_delivery-2019-09-08.tsv'.format(num_weeks),
    )
    model_file = os.path.join(
        ROOT_DATA_DIR,
        'eq_up_to_{0}d_before_second_delivery/best_xgb_model_eq_samp_size_raw_counts_icd_cpt_up_to_{0}_days_before_second_delivery-2019-09-08.pickle'.format(num_weeks),
    )

    # The first two values returned by unpack_input_data are discarded.
    _, _, X_test, y_test, xgb_model, input_data = unpack_input_data(
        input_file, model_file)
    metrics_results, _, _ = validate_best_model(xgb_model, X_test, y_test)
    interp_rc, interp_pr, pr_auc, pos_prop = get_pr_coord(
        metrics_results, y_test)

    # Entries are keyed '<cutoff>_days', in the order the cutoffs are listed.
    up_to_dataset_dict['{}_days'.format(num_weeks)] = dict(
        interp_rc=interp_rc,
        interp_pr=interp_pr,
        pr_auc=pr_auc,
        pos_prop=pos_prop,
    )


###
###    FUNCTIONS
###

def get_auroc_coords(metric_results):

    # unpack data
    metrics_results = metric_results
    fpr = metrics_results['fpr']
コード例 #4
0
# Timepoints whose result directories are scanned.
timeseries = ['0_weeks', '13_weeks', '28_weeks', '35_weeks', '37_weeks']

# Per-timepoint results, keyed by the timepoint label.
roc_dict = {}
pr_dict = {}
f1_score = {}
for timepoint in timeseries:
    results_dir = os.path.join(ICDCPT_DIR, f'{timepoint}_notwins_timeseries_v1')
    # Take the first match of each pattern; [0] raises IndexError when
    # nothing matches.
    input_file = glob.glob(results_dir+"/input_data*.tsv")[0]
    model_file = glob.glob(results_dir+"/best_xgb_model*.pickle")[0]

    # load models and input files
    _, _, ehr_X_test, ehr_y_test, ehr_xgb_model, ehr_input_data = unpack_input_data(
        input_file, model_file)
    ehr_metrics_results, ehr_metrics_df, _ = validate_best_model(
        ehr_xgb_model, ehr_X_test, ehr_y_test)

    # ROC values returned by get_auroc_coords
    ehr_interp_fpr, ehr_interp_tpr, ehr_auc = get_auroc_coords(ehr_metrics_results)
    temp_roc_dict = {
        'interp_fpr': ehr_interp_fpr,
        'interp_tpr': ehr_interp_tpr,
        'auc': ehr_auc,
    }
    roc_dict[timepoint] = temp_roc_dict

    # copy the three scores out of the metrics results
    f1_score[timepoint] = {
        score_key: ehr_metrics_results[score_key]
        for score_key in ('f1_score', 'pr_score', 'rc_score')
    }

    # precision-recall values returned by get_pr_coord
    ehr_interp_rc, ehr_interp_pr, ehr_pr_auc, ehr_pos_prop = get_pr_coord(
        ehr_metrics_results, ehr_y_test)
    temp_pr_dict = {
        'interp_rc': ehr_interp_rc,
        'interp_pr': ehr_interp_pr,
        'pr_auc': ehr_pr_auc,
        'pos_prop': ehr_pos_prop,
    }
    pr_dict[timepoint] = temp_pr_dict

    # NOTE(review): this unconditional break stops the loop after the first
    # timepoint, so only '0_weeks' is processed -- looks like debugging
    # residue; confirm before removing.
    break

# bare expression: displays roc_dict when run as a notebook cell
roc_dict


コード例 #5
0
# Pickled models to compare: one stored under RF_DIR, one under CLIN_RISK_DIR.
ehr_model_file = os.path.join(
    RF_DIR, 'best_xgb_model_up_to_28_weeks_since_preg_start_icd9_cpt_count-2019-06-19.pickle')
riskfx_model_file = os.path.join(
    CLIN_RISK_DIR, 'best_xgb_model_up_to_28_weeks_since_preg_start_risk_fx-2020-03-24.pickle')

# Load each model together with its input file; the first two values
# returned by unpack_input_data are discarded.
_, _, ehr_X_test, ehr_y_test, ehr_xgb_model, ehr_input_data = unpack_input_data(
    ehr_input_file,
    ehr_model_file,
)
_, _, riskfac_X_test, riskfac_y_test, riskfac_xgb_model, riskfac_input_data = unpack_input_data(
    riskfx_input_file,
    riskfx_model_file,
)

# Score both models; the third value returned by validate_best_model is
# discarded.
ehr_metrics_results, ehr_metrics_df, _ = validate_best_model(
    ehr_xgb_model,
    ehr_X_test,
    ehr_y_test,
)
riskfac_metrics_results, riskfac_metrics_df, _ = validate_best_model(
    riskfac_xgb_model,
    riskfac_X_test,
    riskfac_y_test,
)

###
###    plot
###
# %%
# figure parameters
sns.set(
    style='whitegrid',
    font_scale=1.5,
    rc={'figure.figsize': (6, 6)},
)
sns.set_style({'axes.grid': True, 'axes.edgecolor': 'k', 'grid.color': '#e1e1e1'})
fsize = 14
コード例 #6
0
# Pickled models to compare: one stored under CSEC_DIR, one under VG_DIR.
cs_model_file = os.path.join(
    CSEC_DIR, 'best_xgb_model_csection_up_to_28_weeks_since_preg_start_icd9_cpt_count-2020-04-12.pickle')
vg_model_file = os.path.join(
    VG_DIR, 'best_xgb_model_vaginal_delivery_up_to_28_weeks_since_preg_start_icd9_cpt_count-2020-04-12.pickle')

# Load each model together with its input file; the first two values
# returned by unpack_input_data are discarded.
_, _, cs_X_test, cs_y_test, cs_xgb_model, cs_input_data = unpack_input_data(
    cs_input_file,
    cs_model_file,
)
_, _, vg_X_test, vg_y_test, vg_xgb_model, vg_input_data = unpack_input_data(
    vg_input_file,
    vg_model_file,
)

# Score both models; the third value returned by validate_best_model is
# discarded.
cs_metrics_results, cs_metrics_df, _ = validate_best_model(
    cs_xgb_model,
    cs_X_test,
    cs_y_test,
)
vg_metrics_results, vg_metrics_df, _ = validate_best_model(
    vg_xgb_model,
    vg_X_test,
    vg_y_test,
)

# %%
###
###    plot
###

# plot - PR
# mult scales both sides of the square 2.2 x 2.2 figure size.
mult = 1
sns.set(
    style='ticks',
    context='paper',
    font_scale=1.0,
    rc={'figure.figsize': (2.2 * mult,) * 2},
)
sns.set_style({