def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    """Cross-validate the 'TSFS' feature selector with 10 stratified folds.

    The CSV at ``csv_path`` must contain a ``class`` column; every other
    column is treated as a feature.  For each fold, ``dfs`` ranks the
    features on the training split and ``all_clf_evaluator`` is called once
    per percentage in 0, 0.5, ..., 19.5 (0 means "use the full ranked list").
    The nested ``{fold: {algorithm: {percentage: scores}}}`` dict is handed
    to ``write_results``.

    Args:
        csv_path: Path of the input CSV file.
        dir_out: Output location forwarded to ``write_results``.
        random_state: Accepted for backward compatibility only.  The folds
            are not shuffled, so the value has no influence on the split.
        normalize: If true, apply ``zscore`` to every feature column.
    """
    df = pd.read_csv(csv_path)
    # Encode the class labels as 1-based integer codes.
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    # Other names that were used here at some point: 'MLP', 'SCA', 'DBN'.
    fs_alg_names = ['TSFS']
    # Percentages of the ranked list to keep: 0, 0.5, ..., 19.5.
    per_feats = [0.5 * i for i in range(int(20.0 / 0.5))]

    # random_state is deliberately not forwarded: it is ignored when
    # shuffle=False, and scikit-learn >= 0.24 raises ValueError when a
    # random_state is combined with shuffle=False.
    cv = StratifiedKFold(n_splits=10, shuffle=False)
    results = dict()
    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        print("Train Index: ", train_index, "\n")
        print("Test Index: ", test_index)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Merge the rankings returned by every selector into one mapping.
        ranked_feats = dict()
        for alg_name in fs_alg_names:
            new_vals = dfs(alg_name, X_train, y_train, feature_names)
            for key in new_vals.keys():
                print('alg_name1111: ', key)
                ranked_feats[key] = new_vals[key]

        # Evaluate every ranking at each requested percentage.
        fold_dict = dict()
        for alg_name in ranked_feats.keys():
            print('alg_name2222: ', alg_name)
            ranking = ranked_feats[alg_name]
            no_feats = len(ranking)
            alg_dict = dict()
            for per_feat in per_feats:
                if per_feat == 0:
                    selected_feats = ranking
                else:
                    # NOTE(review): for short rankings the truncated count
                    # can be 0, which yields an empty selection -- confirm
                    # that all_clf_evaluator copes with that.
                    selected_feats = ranking[:int(per_feat * no_feats / 100)]
                alg_dict[per_feat] = all_clf_evaluator(
                    X_train, X_test, y_train, y_test, feature_names,
                    selected_feats)

            fold_dict[alg_name] = alg_dict

        results[fold_idx] = fold_dict

    # Report once, after all folds are done (not once per fold).
    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')

    write_results(results, dir_out)
    print('finish writing result')
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    """Cross-validate classic feature selectors with 10 stratified folds.

    The CSV at ``csv_path`` must contain a ``class`` column; every other
    column is treated as a feature.  For each fold, ``non_dl_wrapper`` ranks
    the features on the training split for every selector except 'mRMR',
    which is computed on demand by ``mmr_wrapper`` for each requested size.
    ``all_clf_evaluator`` is called for each feature count in ``num_feats``
    (0 means "use the full ranked list"); counts larger than the number of
    available features are recorded as an empty dict.  The nested
    ``{fold: {algorithm: {count: scores}}}`` dict is handed to
    ``write_results``.

    Args:
        csv_path: Path of the input CSV file.
        dir_out: Output location forwarded to ``write_results``.
        random_state: Accepted for backward compatibility only.  The folds
            are not shuffled, so the value has no influence on the split.
        normalize: If true, apply ``zscore`` to every feature column.
    """
    df = pd.read_csv(csv_path)
    # Encode the class labels as 1-based integer codes.
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    # Round-trip through a one-column DataFrame.  For labels without missing
    # values the second factorize leaves the codes unchanged, but ``.values``
    # on the frame yields a 2-D (n_samples, 1) array instead of a 1-D one.
    y = pd.DataFrame(y, columns=['class'])
    y['class'] = pd.factorize(y['class'])[0] + 1
    y = y.values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    # Names mentioned but not enabled here: 'boruta', 'jackstraw',
    # 'stability_selection'.
    fs_alg_names = [
        'mRMR',
        'lasso',
        'elastic_net',
        'svm_rfe',
        'fisher',
        'chi2',
        'cfs',
        'info_gain',
        'relieff',
    ]
    # Feature counts to evaluate; 0 stands for the whole feature set.
    num_feats = [
        0, 30, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650,
        700, 750, 800, 850, 900, 950, 1000
    ]

    # random_state is deliberately not forwarded: it is ignored when
    # shuffle=False, and scikit-learn >= 0.24 raises ValueError when a
    # random_state is combined with shuffle=False.
    cv = StratifiedKFold(n_splits=10, shuffle=False)
    results = dict()
    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        print("Train Index: ", train_index, "\n")
        print("Test Index: ", test_index)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        ranked_feats = dict()
        for alg_name in fs_alg_names:
            if alg_name == 'mRMR':
                # Placeholder so that mRMR shows up in the evaluation loop;
                # its features are selected per requested size below.
                ranked_feats[alg_name] = []
                continue
            new_vals = non_dl_wrapper(alg_name, X_train, y_train,
                                      feature_names)
            for key in new_vals.keys():
                print('alg_name1111: ', key)
                ranked_feats[key] = new_vals[key]

        # Evaluate every ranking at each requested feature count.
        fold_dict = dict()
        for alg_name in ranked_feats.keys():
            print('alg_name2222: ', alg_name)
            is_mrmr = alg_name == 'mRMR'
            # mRMR has no precomputed ranking (its placeholder is empty), so
            # bound its request by the total feature count; otherwise every
            # non-zero size would be skipped for it.
            if is_mrmr:
                available = len(feature_names)
            else:
                available = len(ranked_feats[alg_name])

            alg_dict = dict()
            for num_feat in num_feats:
                if available < num_feat:
                    alg_dict[num_feat] = {}
                    continue
                if is_mrmr:
                    selected_feats = mmr_wrapper(X_train, y_train,
                                                 feature_names, num_feat)
                elif num_feat == 0:
                    selected_feats = ranked_feats[alg_name]
                else:
                    selected_feats = ranked_feats[alg_name][:num_feat]
                alg_dict[num_feat] = all_clf_evaluator(
                    X_train, X_test, y_train, y_test, feature_names,
                    selected_feats)

            fold_dict[alg_name] = alg_dict

        results[fold_idx] = fold_dict

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')

    write_results(results, dir_out)
    print('finish writing result')
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    """Cross-validate the 'MLP' and 'TSFS' selectors with 5 stratified folds.

    The CSV at ``csv_path`` must contain a ``class`` column; every other
    column is treated as a feature.  For each fold, ``dfs`` ranks the
    features on the training split and ``all_clf_evaluator`` is called once
    per percentage in 0, 1, ..., 19.  Percentage 0 evaluates the full feature
    set; any other percentage keeps the top
    ``int(percentage * n_features / 100) + 1`` ranked features.

    After all folds, ``get_smilarity_scores`` is computed per algorithm and
    percentage from the selections of all folds, and its entries are copied
    into the score dict of every fold.  The results and the selection made at
    the largest percentage (per algorithm and fold) go to ``write_results``.

    Args:
        csv_path: Path of the input CSV file.
        dir_out: Output location forwarded to ``write_results``.
        random_state: Accepted for backward compatibility only.  The folds
            are not shuffled, so the value has no influence on the split.
        normalize: If true, apply ``zscore`` to every feature column.
    """
    df = pd.read_csv(csv_path)
    # Encode the class labels as 1-based integer codes.
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = ['MLP', 'TSFS']
    # Percentages of the feature set to keep: 0.0, 1.0, ..., 19.0.
    per_feats = [i * 1.0 for i in range(20)]
    no_feats = len(feature_names)

    # random_state is deliberately not forwarded: it is ignored when
    # shuffle=False, and scikit-learn >= 0.24 raises ValueError when a
    # random_state is combined with shuffle=False.
    cv = StratifiedKFold(n_splits=5, shuffle=False)
    results = dict()
    # {algorithm: {percentage: [selection of fold 0, fold 1, ...]}}
    fold_returned_dict = dict()
    # {algorithm: {fold: selection at the largest percentage}}
    selected_feats_dict = dict()
    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        print(fold_idx, " Train Index: ", train_index, "\n")
        print(fold_idx, " Test Index: ", test_index)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Merge the rankings returned by every selector into one mapping.
        ranked_feats = dict()
        for alg_name in fs_alg_names:
            new_vals = dfs(alg_name, X_train, y_train, feature_names)
            for key in new_vals.keys():
                print('alg_name1111: ', key)
                ranked_feats[key] = new_vals[key]

        # Evaluate every ranking at each requested percentage.
        fold_dict = dict()
        for alg_name in ranked_feats.keys():
            print('alg_name2222: ', alg_name)
            alg_dict = dict()
            for per_feat in per_feats:
                if per_feat == 0:
                    selected_feats = feature_names
                else:
                    top_k = int(per_feat * no_feats / 100) + 1
                    selected_feats = ranked_feats[alg_name][:top_k]

                if per_feat == per_feats[-1]:
                    # Remember the largest selection of this fold only once.
                    selected_feats_dict.setdefault(alg_name, dict()).setdefault(
                        fold_idx, selected_feats)

                fold_returned_dict.setdefault(alg_name, dict()).setdefault(
                    per_feat, list()).append(selected_feats)

                alg_dict[per_feat] = all_clf_evaluator(
                    X_train, X_test, y_train, y_test, feature_names,
                    selected_feats)

            fold_dict[alg_name] = alg_dict

        results[fold_idx] = fold_dict

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')

    # Similarity of the selections across folds, per algorithm and percentage.
    every_fold_scores = dict()
    for alg_name, per_feat_selections in fold_returned_dict.items():
        alg_scores = every_fold_scores.setdefault(alg_name, dict())
        for per_feat, selections in per_feat_selections.items():
            alg_scores[per_feat] = get_smilarity_scores(
                selections, len(feature_names))

    # Every fold receives the same similarity scores.
    for fold_idx in results.keys():
        for alg_name, alg_scores in every_fold_scores.items():
            for per_feat, similarity in alg_scores.items():
                for key in similarity.keys():
                    results[fold_idx][alg_name][per_feat][key] = similarity[
                        key]

    write_results(results, dir_out, selected_feats_dict)
    print('finish writing result')
# Example #4
# 0
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    """Cross-validate classic selectors and HSIC lasso with 5 stratified folds.

    The CSV at ``csv_path`` must contain a ``class`` column; every other
    column is treated as a feature.  For each fold the training split is
    written to ``dir_out + str(fold_idx) + '.csv'`` (the input of
    ``hsic_sel``), ``non_dl_wrapper`` ranks the features for every selector
    except 'mRMR' and 'hsic_lasso', and ``all_clf_evaluator`` is called once
    per percentage in 0, 1, ..., 19 with the top
    ``int(percentage * n_features / 100) + 1`` features.

    After all folds, ``get_smilarity_scores`` is computed per algorithm and
    percentage from the selections of all folds, and its entries are copied
    into the score dict of every fold.  The results and the recorded
    selections go to ``write_results``.

    Args:
        csv_path: Path of the input CSV file.
        dir_out: Prefix of the per-fold training CSV files; also forwarded
            to ``write_results``.
        random_state: Accepted for backward compatibility only.  The folds
            are not shuffled, so the value has no influence on the split.
        normalize: If true, apply ``zscore`` to every feature column.
    """
    df = pd.read_csv(csv_path)
    # Encode the class labels as 1-based integer codes.
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    # Round-trip through a one-column DataFrame.  For labels without missing
    # values the second factorize leaves the codes unchanged, but ``.values``
    # on the frame yields a 2-D (n_samples, 1) array instead of a 1-D one.
    y = pd.DataFrame(y, columns=['class'])
    y['class'] = pd.factorize(y['class'])[0] + 1
    y = y.values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    # Names mentioned but not enabled here: 'cfs', 'info_gain', 'jackstraw',
    # 'stability_selection'.
    fs_alg_names = [
        'relieff', 'lasso', 'svm_rfe', 'elastic_net', 'hsic_lasso'
    ]
    # Selectors that pick features per requested size instead of returning
    # one ranking up front.
    on_demand_algs = ('mRMR', 'hsic_lasso')
    # Percentages of the feature set to keep: 0.0, 1.0, ..., 19.0.
    per_feats = [1.0 * i for i in range(20)]
    no_feats = len(feature_names)

    # random_state is deliberately not forwarded: it is ignored when
    # shuffle=False, and scikit-learn >= 0.24 raises ValueError when a
    # random_state is combined with shuffle=False.
    cv = StratifiedKFold(n_splits=5, shuffle=False)
    results = dict()
    # {algorithm: {percentage: [selection of fold 0, fold 1, ...]}}
    fold_returned_dict = dict()
    # {algorithm: {fold: recorded selection(s)}}
    selected_feats_dict = dict()
    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        print(fold_idx, "Train Index: ", train_index, "\n")
        print(fold_idx, "Test Index: ", test_index)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Save the training split to a file for HSIC lasso.
        df_x = pd.DataFrame(X_train, columns=feature_names)
        df_y = pd.DataFrame(y_train, columns=['class'])
        fold_df = pd.concat([df_x, df_y], axis=1)
        fold_train_path = dir_out + str(fold_idx) + '.csv'
        fold_df.to_csv(fold_train_path)

        ranked_feats = dict()
        for alg_name in fs_alg_names:
            if alg_name in on_demand_algs:
                continue
            new_vals = non_dl_wrapper(alg_name, X_train, y_train,
                                      feature_names)
            for key in new_vals.keys():
                print('alg_name1111: ', key)
                ranked_feats[key] = new_vals[key]

        # Evaluate every selector at each requested percentage.
        fold_dict = dict()
        for alg_name in fs_alg_names:
            print('alg_name2222: ', alg_name)
            alg_dict = dict()

            for per_feat in per_feats:
                # Always >= 1, so percentage 0 selects a single feature.
                num_feat = int(per_feat * no_feats / 100) + 1
                if alg_name == 'mRMR':
                    selected_feats = mmr_wrapper(X_train, y_train,
                                                 feature_names, num_feat)
                elif alg_name == 'hsic_lasso':
                    # Pass the requested feature count (the original
                    # referenced an undefined name here).
                    selected_feats = hsic_sel(fold_train_path, num_feat)
                else:
                    selected_feats = ranked_feats[alg_name][:num_feat]

                if alg_name in on_demand_algs:
                    # Record every selection, each preceded by a marker that
                    # names the percentage it belongs to.
                    recorded = selected_feats_dict.setdefault(
                        alg_name, dict()).setdefault(fold_idx, [])
                    recorded.append('PER_FEAT_' + str(per_feat))
                    recorded.extend(selected_feats)
                    print(selected_feats_dict)
                elif per_feat == per_feats[-1]:
                    # Ranked selectors: remember the largest selection only.
                    selected_feats_dict.setdefault(alg_name, dict()).setdefault(
                        fold_idx, selected_feats)

                fold_returned_dict.setdefault(alg_name, dict()).setdefault(
                    per_feat, list()).append(selected_feats)

                alg_dict[per_feat] = all_clf_evaluator(
                    X_train, X_test, y_train, y_test, feature_names,
                    selected_feats)

            fold_dict[alg_name] = alg_dict

        results[fold_idx] = fold_dict

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')

    # Similarity of the selections across folds, per algorithm and percentage.
    every_fold_scores = dict()
    for alg_name, per_feat_selections in fold_returned_dict.items():
        alg_scores = every_fold_scores.setdefault(alg_name, dict())
        for per_feat, selections in per_feat_selections.items():
            alg_scores[per_feat] = get_smilarity_scores(
                selections, len(feature_names))

    # Every fold receives the same similarity scores.
    for fold_idx in results.keys():
        for alg_name, alg_scores in every_fold_scores.items():
            for per_feat, similarity in alg_scores.items():
                for key in similarity.keys():
                    results[fold_idx][alg_name][per_feat][key] = similarity[
                        key]

    write_results(results, dir_out, selected_feats_dict)
    print('finish writing result')