Example 1
def eval_classifier(method,
                    train_data,
                    train_class,
                    test_data,
                    test_class,
                    LM_params=get_ML_parameters(),
                    positive_roc_index=1):
    global have_written_params_to_file
    if have_written_params_to_file is False:
        logging.info("Run settings for models:")
        logging.info(str(LM_params))
        logging.info("First run method: " + str(method))
        have_written_params_to_file = True

    # set classifier method
    if method == 'svm':
        clf = SVC(random_state=0, probability=True, **LM_params['svm'])
    else:
        clf = set_up_classifier(method, 0, LM_params)

    clf = OneVsRestClassifier(clf)

    clf.fit(train_data, train_class)
    probas_ = clf.predict_proba(test_data)
    preds = clf.predict(test_data)
    fpr, tpr, thresholds = roc_curve(test_class,
                                     probas_[:, positive_roc_index],
                                     pos_label=positive_roc_index)
    roc_auc = auc(fpr, tpr)
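    # an AUC below 0.5 means the scores are anti-correlated with the labels;
    # taking max(auc, 1 - auc) keeps chance level (0.5) as the floor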
    roc_auc = max(roc_auc, 1 - roc_auc)

    return preds, roc_auc
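A minimal usage sketch (synthetic data, purely illustrative; assumes eval_classifier and its helpers such as get_ML_parameters are importable and that the parameter dict contains an 'svm' entry):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# build a small synthetic binary classification problem
X, y = make_classification(n_samples=200, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)
preds, roc_auc = eval_classifier('svm', X_tr, y_tr, X_te, y_te)
print("AUC: %.3f" % roc_auc)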
Example 2
def run_rfe_classifier(method,
                       train_data,
                       train_class,
                       test_data,
                       CV_=0,
                       fraction_feat_to_keep=0.1,
                       LM_params=get_ML_parameters()):
    global have_written_params_to_file
    if have_written_params_to_file is False:
        logging.info("Run settings for models:")
        logging.info(str(LM_params))
        have_written_params_to_file = True

    clf = set_up_classifier(method, CV_, LM_params)

    if CV_ < 1 and method != 'svm':
        clf = OneVsRestClassifier(clf)

    # fit and predict based on whether cross validation is used
    if (CV_ > 1):
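        # a float step in (0, 1) tells scikit-learn's RFE to drop that
        # fraction of the features per elimination round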
        step_elim = (1 - fraction_feat_to_keep) / CV_
        rfecv = RFE(estimator=clf,
                    step=step_elim,
                    n_features_to_select=max(1, int(fraction_feat_to_keep *
                                                    len(list(train_data)))))
        rfecv.fit(train_data, train_class)
        preds = rfecv.predict(test_data)
    else:
        clf.fit(train_data, train_class)
        preds = clf.predict(test_data)

    return preds
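For reference, scikit-learn's RFE accepts the fractional step that step_elim relies on above: a float in (0, 1) is interpreted as the fraction (rounded down) of features to remove at each iteration. A standalone illustration on synthetic data (LogisticRegression is used only for brevity):

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, n_features=50, random_state=0)
selector = RFE(LogisticRegression(max_iter=1000),
               step=0.3, n_features_to_select=5).fit(X, y)
print(selector.n_features_)  # 5 features remain after elimination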
Example 3
def rfe_classifier(method,
                   train_data,
                   train_class,
                   test_data,
                   CV_=3,
                   fraction_feat_to_keep=0.1,
                   LM_params=get_ML_parameters()):
    global have_written_params_to_file
    if have_written_params_to_file is False:
        logging.info("Run settings for models:")
        logging.info(str(LM_params))
        have_written_params_to_file = True

    clf = set_up_classifier(method, CV_, LM_params)

    # fit and predict based on whether cross validation is used
    if (CV_ > 1):
        step_elim = (1 - fraction_feat_to_keep) / CV_
        num_to_keep = int(fraction_feat_to_keep * len(list(train_data)))
        num_to_keep = max(num_to_keep, 1)
        rfecv = RFE(estimator=clf,
                    step=step_elim,
                    n_features_to_select=num_to_keep)

        rfecv.fit(train_data, train_class)
        preds = rfecv.predict(test_data)
        mask = list(rfecv.support_)
        # print("Number of features selected:", sum(mask))
        # print(rfecv.ranking_)
        features = train_data.columns
        features_selected = [
            features[i] for i in range(0, len(mask)) if mask[i]
        ]
        # print(features_selected)

    else:
        clf.fit(train_data, train_class)
        preds = clf.predict(test_data)
        # without feature elimination every feature is kept; defining these
        # here avoids a NameError in the return statement below
        features_selected = list(train_data.columns)
        mask = [True] * len(features_selected)

    return preds, features_selected, sum(mask)
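A hypothetical call (a DataFrame is needed because the function reads train_data.columns; the 'rf' method name assumes the project-specific set_up_classifier maps it to a random forest):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
cols = ['feat%d' % i for i in range(30)]
train_df = pd.DataFrame(rng.normal(size=(80, 30)), columns=cols)
test_df = pd.DataFrame(rng.normal(size=(20, 30)), columns=cols)
y = rng.integers(0, 2, size=80)
preds, kept, n_kept = rfe_classifier('rf', train_df, y, test_df,
                                     CV_=3, fraction_feat_to_keep=0.2)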
Example 4
def main(work_dir=None, model='rf', set_of_classes=(0, 1, 2, 3)):
    while work_dir is None or not Path(work_dir).exists():
        print("Unable to locate directory.")
        work_dir = input("Please enter working directory: ")

    # folder with features split by section
    work_dir = Path(work_dir)
    DATA_DIR = work_dir / 'section_fm'
    GOLD_FILE = work_dir / 'GOLD_multiclass.csv'

    ML_param_file = work_dir / 'data' / 'ML_model_settings' / 'ML_default_settings.json'
    if ML_param_file.exists():
        params = get_ML_parameters(use_default=False, dict_path=ML_param_file)
    else:
        params = get_ML_parameters(use_default=True)

    logging.info("Loading Data from: " + str(DATA_DIR))

    pathlist = Path(DATA_DIR).glob('*.csv')

    fm_by_section = {}
    lionc = []
    sections_written = defaultdict(bool)  # defaults to False

    for path in pathlist:
        section_name = path.stem
        lionc.append(section_name)
        fm_by_section[section_name] = pd.read_csv(path, index_col=0)
        fm_by_section[section_name].fillna(0, inplace=True)

    if len(lionc) < 1:
        logging.error("No files found at: " + str(DATA_DIR))
        exit()


    # load gold
    gold = pd.read_csv(GOLD_FILE, index_col=0)
    gold.fillna(0, inplace=True)
    # iterating a DataFrame yields column names; every non-split column is a task
    tasks = [x for x in gold if x not in ['test', 'train']]

    frac_features_for_running_f1 = 0.01

    # set the following to use either RFECV or RFE
    run_f1_with_rfecv = True
    logging.info("frac_features_for_running_f1: " + str(frac_features_for_running_f1) +
                 " with CV?: " + str(run_f1_with_rfecv))
    no_feature_elim = False  # if run_f1_with_rfecv is False, can try running without feature elimination
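    # three evaluation modes are used in the task loop below:
    #   run_f1_with_rfecv -> rfecv_classifier (RFECV with an RFE fallback)
    #   no_feature_elim   -> plain classifier, all features kept
    #   otherwise         -> rfe_classifier (fixed-fraction RFE)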

    logging.info("list of sections found:")
    logging.info(str(lionc))
    logging.info("model to run: " + str(model))

    rfecv_top_features = {}
    NUM_SECT_TO_COMBINE = len(lionc)  # using all sections yields a single combination
    sect_combinations = combinations(lionc, NUM_SECT_TO_COMBINE)

    for combo in sect_combinations:

        section_list = []
        for section in combo:
            section_list.append(fm_by_section[section])

        merged = combine_list_dfs(section_list)

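        # the cube-root transform dampens heavy-tailed feature counts before
        # column normalization (normalize_df_columns is project-specific code)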
        merged = normalize_df_columns(merged, 0, tf=lambda x: x ** (1 / 3))

        train, test, features = rearrange_for_testing(merged, gold)

        p_avg, r_avg, f1_avg, f1_macro_avg = 0, 0, 0, 0

        print("features:", len(features))
        output_label_line = '%s %8s %8s %8s %8s %8s %8s' % (
            "Morbidity Results", "P-micro", "P-macro",
            "R-micro", "R-macro", "F1-micro", "F1-macro")
        logging.info(output_label_line)

        for task in tasks:
            train, test, features = rearrange_for_testing(merged, gold, task, set_of_classes)
            # filter features if desired (here: drop two-character feature names)
            features = [f for f in features if len(f) != 2]
            # features = [f for f in features if f[-1] != 'n']

            if run_f1_with_rfecv:
                preds, feat_important, num_feat = rfecv_classifier(
                    model,
                    train_data=train[features],
                    train_class=train[task],
                    test_data=test[features],
                    CV_=3,
                    fraction_feat_to_keep=frac_features_for_running_f1,
                    LM_params=params,
                    save_model=True)
                rfecv_top_features[task] = feat_important
            elif no_feature_elim:
                clf = set_up_classifier(model, 0, LM_params=params)
                clf.fit(train[features], train[task])
                preds = clf.predict(test[features])
            else:
                preds, feat_important, num_feat = rfe_classifier(
                    model,
                    train_data=train[features],
                    train_class=train[task].astype(int),
                    test_data=test[features],
                    CV_=10,
                    fraction_feat_to_keep=frac_features_for_running_f1,
                    LM_params=params)

            results = CalculatePerformance.calculate_metrics(
                list(test[task]), list(preds), set_of_classes, output_type='values')
            f1 = results[4]
            f1_macro = results[5]

            f1_avg += f1 / len(tasks)
            f1_macro_avg += f1_macro / len(tasks)

            logging.info("task: " + str(task) + ' ' +
                         CalculatePerformance.calculate_metrics(
                             list(test[task]), list(preds),
                             set_of_classes, output_type='text').strip())

    file_name = work_dir / 'models' / 'top_features.json'
    save_to_json(rfecv_top_features, file_name)

    logging.info("Averages: f1: %.6f, f1_macro: %.6f" % (f1_avg, f1_macro_avg))
Example 5
def rfecv_classifier(method,
                     train_data,
                     train_class,
                     test_data,
                     CV_=3,
                     fraction_feat_to_keep=0.1,
                     LM_params=get_ML_parameters(),
                     save_model=False):
    n_orig_features = train_data.shape[1]  # number of feature columns
    max_ratio_diff = 1.2  # tolerate up to 20% more features than requested
    global have_written_params_to_file
    if have_written_params_to_file is False:
        logging.info("Run settings for models:")
        logging.info(str(LM_params))
        have_written_params_to_file = True
    # set classifier method

    clf = set_up_classifier(method, CV_, LM_params)

    # fit and predict based on whether cross validation is used
    if (CV_ > 1):
        step_elim = (1 - fraction_feat_to_keep) / CV_

        # Recursive feature elimination with Cross Validation
        # CV can fail if the class distribution is too imbalanced for
        # StratifiedKFold to split properly
        try:
            # shuffle=True is required by newer scikit-learn when random_state is set
            rfecv = RFECV(estimator=clf,
                          step=step_elim,
                          cv=StratifiedKFold(n_splits=CV_, shuffle=True,
                                             random_state=0),
                          scoring='accuracy')
            rfecv.fit(train_data, train_class)
            preds = rfecv.predict(test_data)

            current_fraction_features = rfecv.n_features_ / n_orig_features
            if (current_fraction_features * max_ratio_diff <
                    fraction_feat_to_keep):
                raise ValueError(
                    "Not enough features kept by RFECV defaulting to RFE")
        except ValueError:
            rfecv = RFE(estimator=clf,
                        step=step_elim,
                        n_features_to_select=max(1, int(fraction_feat_to_keep *
                                                        n_orig_features)))
            rfecv.fit(train_data, train_class)
            preds = rfecv.predict(test_data)

        mask = list(rfecv.support_)
        features = train_data.columns
        features_selected = [
            features[i] for i in range(0, len(mask)) if mask[i]
        ]

        # RFECV sometimes does not eliminate enough features; if we are more
        # than 20% over the target fraction, run plain RFE to remove the rest
        current_fraction_features = len(features_selected) / n_orig_features
        step_elim = (current_fraction_features - fraction_feat_to_keep) / CV_
        if (current_fraction_features >
                max_ratio_diff * fraction_feat_to_keep) and step_elim > 0:
            rfecv = RFE(estimator=clf,
                        step=step_elim,
                        n_features_to_select=max(1, int(fraction_feat_to_keep *
                                                        n_orig_features)))
            rfecv.fit(train_data[features_selected], train_class)
            preds = rfecv.predict(test_data[features_selected])
            mask = list(rfecv.support_)
            # the mask indexes the previously selected subset, not all columns
            features_selected = [
                features_selected[i] for i in range(0, len(mask)) if mask[i]
            ]

    else:
        clf.fit(train_data, train_class)
        preds = clf.predict(test_data)
        # without feature elimination every feature is kept; defining these
        # here avoids a NameError in the return statement below
        features_selected = list(train_data.columns)
        mask = [True] * len(features_selected)

    return preds, features_selected, sum(mask)
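A usage sketch mirroring the call in main() above (synthetic DataFrames; 'rf' again assumes the project-specific set_up_classifier provides a random forest):

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
cols = ['feat%d' % i for i in range(40)]
train_df = pd.DataFrame(rng.normal(size=(90, 40)), columns=cols)
test_df = pd.DataFrame(rng.normal(size=(30, 40)), columns=cols)
y = rng.integers(0, 2, size=90)
preds, top_feats, n_kept = rfecv_classifier('rf', train_df, y, test_df,
                                            CV_=3,
                                            fraction_feat_to_keep=0.1)
print(n_kept, top_feats[:5])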