Example #1
def DBN(X, y, feature_names):
    _, y = change_class_labels(y)
    train_X, val_X, train_y, val_y = train_test_split(X,
                                                      y,
                                                      test_size=0.20,
                                                      random_state=42)

    # train
    lambda1s = [0.005]  # numpy.arange(0.0700,-0.001,-0.001)

    for i in range(len(lambda1s)):
        classifier, training_time = dbn.train_model(train_set_x_org=train_X,
                                                    train_set_y_org=train_y,
                                                    valid_set_x_org=val_X,
                                                    valid_set_y_org=val_y)

        # first-layer weights, used as feature importance scores
        param0 = classifier.params[0].get_value()

        # pair each (rounded) weight with its feature name and sort by weight, descending
        results = sorted(zip(map(lambda x: round(x, 4), param0),
                             feature_names),
                         reverse=True)

    gc_collect()
    return [x[1] for x in results]
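
The ranked list returned by DBN (and by MLP below) can be plugged straight into a column-selection step. A minimal usage sketch; the DataFrame `df` and the cutoff `k` are assumptions for illustration, not part of the original code:

# Hypothetical usage of the ranking above; `df` and `k` are illustrative only.
ranked = DBN(X, y, feature_names)   # feature names, best-scoring first
k = 50
top_k = ranked[:k]                  # keep the k highest-weighted features
X_reduced = df[top_k].values        # select the matching columns
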
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    df = pd.read_csv(csv_path)
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = ['TSFS']  #['MLP', 'SCA', 'DBN']# ['TSFS']#
    # num_feats = [0, 30, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000] # 0 for the whole feature set
    per_feats = [0.5 * i for i in range(int(20.0 / 0.5))]

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)  # shuffle so random_state takes effect
    results = dict()
    fold_idx = 0
    for train_index, test_index in cv.split(X, y):
        print("Train Index: ", train_index, "\n")
        print("Test Index: ", test_index)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        fold_dict = dict()
        ranked_feats = dict()
        for alg_name in fs_alg_names:
            new_vals = dfs(alg_name, X_train, y_train, feature_names)
            for key in new_vals.keys():  # merge the two dictionaries
                print('alg_name1111: ', key)
                ranked_feats[key] = new_vals[key]

        # now evaluate the results according to the number of selected features
        for alg_name in ranked_feats.keys():
            print('alg_name2222: ', alg_name)
            alg_dict = dict()
            no_feats = len(ranked_feats[alg_name])
            for per_feat in per_feats:
                if per_feat == 0:
                    selected_feats = ranked_feats[alg_name]
                else:
                    selected_feats = ranked_feats[alg_name][:int(per_feat * no_feats / 100)]
                scores = all_clf_evaluator(X_train, X_test, y_train, y_test,
                                           feature_names, selected_feats)
                alg_dict[per_feat] = scores

            fold_dict[alg_name] = alg_dict

        results[fold_idx] = fold_dict
        fold_idx += 1  # update the fold index

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')

    write_results(results, dir_out)
    print('finish writing result')
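
For reference, the nested `results` dictionary assembled above is keyed fold -> algorithm -> percentage of features, with each leaf holding the score dictionary returned by `all_clf_evaluator`. A small illustrative sketch; the score keys and values below are made up:

# Assumed shape of `results` (values are placeholders, not real output).
results_example = {
    0: {                                      # fold index
        'TSFS': {                             # feature-selection algorithm
            0.5: {'acc': 0.91, 'f1': 0.90},   # hypothetical scores at 0.5% of features
            1.0: {'acc': 0.92, 'f1': 0.91},
        },
    },
}
for fold_idx, fold_dict in results_example.items():
    for alg_name, alg_dict in fold_dict.items():
        for per_feat, scores in alg_dict.items():
            print(fold_idx, alg_name, per_feat, scores)
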
Example #3
def MLP(X, y, feature_names):
    _, y = change_class_labels(y)
    train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.20, random_state=42)

    lambda1s = [0.01]

    for i in range(len(lambda1s)):
        classifier, training_time = mlp.train_model(train_set_x_org=train_X,
                                                    train_set_y_org=train_y,
                                                    valid_set_x_org=val_X,
                                                    valid_set_y_org=val_y)


        # first-layer weights, used as feature importance scores
        param0 = classifier.params[0].get_value()

        # list of (feat_weight, feat_name) pairs sorted by feat_weight in descending order
        results = sorted(zip(map(lambda x: round(x, 4), param0), feature_names), reverse=True)

    gc_collect()
    return [x[1] for x in results]
Example #4
def cross_validate(csv_path, dir_out, file_name, random_state=42):
    df = pd.read_csv(csv_path)
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = ['MLP']  #['TSFS','MLP']# ['TSFS']#
    # num_feats = [0, 30, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000] # 0 for the whole feature set
    # per_feats = [i * 1.0 for i in range(20)]
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)  # shuffle so random_state takes effect
    fold_idx = 0
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        train_df_X = pd.DataFrame(X_train, columns=feature_names)
        train_df_y = pd.DataFrame(y_train, columns=['class'])
        test_df_X = pd.DataFrame(X_test, columns=feature_names)
        test_df_y = pd.DataFrame(y_test, columns=['class'])
        train_df = pd.concat([train_df_X, train_df_y], axis=1)
        test_df = pd.concat([test_df_X, test_df_y], axis=1)
        train_fold_path = dir_out + file_name + '_train_' + str(fold_idx) + '.csv'
        test_fold_path = dir_out + file_name + '_test_' + str(fold_idx) + '.csv'
        train_df.to_csv(train_fold_path, index=False)
        test_df.to_csv(test_fold_path, index=False)
        fold_idx += 1

    print('finish writing result')
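
Each fold CSV written above contains the z-scored feature columns followed by a final 'class' column, so it can be reloaded later without extra bookkeeping. A small round-trip sketch; the path reuses the naming scheme above, and `dir_out` and `file_name` are assumed to be in scope:

import pandas as pd

# Hypothetical reload of one training fold written by cross_validate above.
fold_idx = 0
train_df = pd.read_csv(dir_out + file_name + '_train_' + str(fold_idx) + '.csv')
y_train = train_df.pop('class').values   # last column holds the labels
X_train = train_df.values                # remaining columns are the features
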
Example #5
by concatenating the rows of the original sample.

[2]. A txt file including the class labels.
Each row is a string (no white space allowed) giving the class label of the corresponding row in [1].

[3]. A txt file including the names of the features.
Each row is a string (no white space allowed) giving the feature name of the corresponding column in [1].
"""

data_dir = "/home/yifeng/YifengLi/Research/deep/extended_deep/v1_0/data/"
# train set
filename = data_dir + "GM12878_200bp_Data_3Cl_l2normalized_TrainSet.txt"
train_set_x_org = numpy.loadtxt(filename, delimiter='\t', dtype='float32')
filename = data_dir + "GM12878_200bp_Classes_3Cl_l2normalized_TrainSet.txt"
train_set_y_org = numpy.loadtxt(filename, delimiter='\t', dtype=object)
prev, train_set_y_org = cl.change_class_labels(train_set_y_org)
# valid set
filename = data_dir + "GM12878_200bp_Data_3Cl_l2normalized_ValidSet.txt"
valid_set_x_org = numpy.loadtxt(filename, delimiter='\t', dtype='float32')
filename = data_dir + "GM12878_200bp_Classes_3Cl_l2normalized_ValidSet.txt"
valid_set_y_org = numpy.loadtxt(filename, delimiter='\t', dtype=object)
prev, valid_set_y_org = cl.change_class_labels(valid_set_y_org)
# test set
filename = data_dir + "GM12878_200bp_Data_3Cl_l2normalized_TestSet.txt"
test_set_x_org = numpy.loadtxt(filename, delimiter='\t', dtype='float32')
filename = data_dir + "GM12878_200bp_Classes_3Cl_l2normalized_TestSet.txt"
test_set_y_org = numpy.loadtxt(filename, delimiter='\t', dtype=object)
prev, test_set_y_org = cl.change_class_labels(test_set_y_org)

filename = data_dir + "GM12878_Features_Unique.txt"
features = numpy.loadtxt(filename, delimiter='\t', dtype=object)
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    df = pd.read_csv(csv_path)
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values  # already 1..k integer labels; no need to re-factorize
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = [
        'mRMR',
        'lasso',
        'elastic_net',
        'svm_rfe',  # 'boruta',
        'fisher',
        'chi2',
        'cfs',
        'info_gain',
        'relieff'
    ]  #, 'cfs', 'info_gain', 'relieff', 'jackstraw', 'stability_selection'
    num_feats = [
        0, 30, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650,
        700, 750, 800, 850, 900, 950, 1000
    ]  # 0 for the whole feature set

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)  # shuffle so random_state takes effect
    results = dict()
    fold_idx = 0
    for train_index, test_index in cv.split(X, y):
        print("Train Index: ", train_index, "\n")
        print("Test Index: ", test_index)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        fold_dict = dict()
        ranked_feats = dict()
        for alg_name in fs_alg_names:
            if alg_name == 'mRMR':
                ranked_feats[alg_name] = []
                continue
            new_vals = non_dl_wrapper(alg_name, X_train, y_train,
                                      feature_names)
            for key in new_vals.keys():  # merge the two dictionaries
                print('alg_name1111: ', key)
                ranked_feats[key] = new_vals[key]

        # now evaluate the results according to the number of selected features
        for alg_name in ranked_feats.keys():
            print('alg_name2222: ', alg_name)
            alg_dict = dict()

            for num_feat in num_feats:
                if len(ranked_feats[alg_name]) < num_feat:
                    alg_dict[num_feat] = {}
                    continue
                if alg_name == 'mRMR':
                    selected_feats = mmr_wrapper(X_train, y_train,
                                                 feature_names, num_feat)
                else:
                    if num_feat == 0:
                        selected_feats = ranked_feats[alg_name]
                    else:
                        selected_feats = ranked_feats[alg_name][:num_feat]
                scores = all_clf_evaluator(X_train, X_test, y_train, y_test,
                                           feature_names, selected_feats)
                alg_dict[num_feat] = scores

            fold_dict[alg_name] = alg_dict

        results[fold_idx] = fold_dict
        fold_idx += 1  # update the fold index

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')

    write_results(results, dir_out)
    print('finish writing result')
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    df = pd.read_csv(csv_path)
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = ['MLP', 'TSFS']  # ['TSFS']#
    # num_feats = [0, 30, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000] # 0 for the whole feature set
    per_feats = [i * 1.0 for i in range(20)]
    # per_feats = [0]
    # per_feats = [1.0, 2.0]

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)  # shuffle so random_state takes effect
    results = dict()
    fold_idx = 0
    fold_returned_dict = dict()
    selected_feats_dict = dict()
    for train_index, test_index in cv.split(X, y):
        print(fold_idx, " Train Index: ", train_index, "\n")
        print(fold_idx, " Test Index: ", test_index)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        fold_dict = dict()
        ranked_feats = dict()
        for alg_name in fs_alg_names:
            new_vals = dfs(alg_name, X_train, y_train, feature_names)
            for key in new_vals.keys():  # merge the two dictionaries
                print('alg_name1111: ', key)
                ranked_feats[key] = new_vals[key]

        # now evaluate the results according to the number of selected features
        for alg_name in ranked_feats.keys():
            print('alg_name2222: ', alg_name)
            alg_dict = dict()
            no_feats = len(feature_names)
            for per_feat in per_feats:
                if per_feat == 0:
                    selected_feats = feature_names
                else:
                    selected_feats = ranked_feats[alg_name][:int(per_feat * no_feats / 100) + 1]

                if per_feat == per_feats[-1]:
                    if alg_name not in selected_feats_dict:
                        selected_feats_dict[alg_name] = dict()
                    if fold_idx not in selected_feats_dict[alg_name]:
                        selected_feats_dict[alg_name][fold_idx] = selected_feats

                if alg_name not in fold_returned_dict:
                    fold_returned_dict[alg_name] = dict()
                if per_feat not in fold_returned_dict[alg_name]:
                    fold_returned_dict[alg_name][per_feat] = list()
                fold_returned_dict[alg_name][per_feat].append(selected_feats)

                scores = all_clf_evaluator(X_train, X_test, y_train, y_test,
                                           feature_names, selected_feats)
                alg_dict[per_feat] = scores

            fold_dict[alg_name] = alg_dict

        results[fold_idx] = fold_dict
        fold_idx += 1  # update the fold index

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')

    # print('fold return dict: ', fold_returned_dict)
    # Update the stability indices in the results dict
    every_fold_scores = dict()
    for alg_name in fold_returned_dict.keys():
        if alg_name not in every_fold_scores:
            every_fold_scores[alg_name] = dict()
        for per_feat in fold_returned_dict[alg_name].keys():
            every_fold_scores[alg_name][per_feat] = get_smilarity_scores(
                fold_returned_dict[alg_name][per_feat], len(feature_names))

    # Update the returned results; every fold gets the same similarity scores
    for fold_idx in results.keys():
        for alg_name in every_fold_scores.keys():
            for per_feat in every_fold_scores[alg_name].keys():
                for key in every_fold_scores[alg_name][per_feat].keys():
                    results[fold_idx][alg_name][per_feat][key] = \
                        every_fold_scores[alg_name][per_feat][key]

    write_results(results, dir_out, selected_feats_dict)
    print('finish writing result')
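
`get_smilarity_scores` is not shown in this listing. A plausible sketch of the kind of fold-to-fold stability measure it could compute is the mean pairwise Jaccard similarity between the feature subsets selected in different folds; the function name, signature, and returned key below are assumptions, not the project's actual implementation:

from itertools import combinations

def jaccard_stability(fold_feature_lists, n_total_features):
    # Hypothetical stand-in for get_smilarity_scores: mean pairwise Jaccard
    # similarity of the feature subsets chosen in each fold.
    # n_total_features mirrors the original call signature but is unused here.
    sets = [set(feats) for feats in fold_feature_lists]
    if len(sets) < 2:
        return {'jaccard': 1.0}
    pair_scores = [
        len(a & b) / len(a | b) if (a | b) else 1.0
        for a, b in combinations(sets, 2)
    ]
    return {'jaccard': sum(pair_scores) / len(pair_scores)}
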
Example #8
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    df = pd.read_csv(csv_path)
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values  # already 1..k integer labels; no need to re-factorize
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = [
        'relieff', 'lasso', 'svm_rfe', 'elastic_net', 'hsic_lasso'
    ]  #, 'cfs', 'info_gain', 'relieff', 'jackstraw', 'stability_selection'
    # num_feats = [0, 30, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000] # 0 for the whole feature set
    per_feats = [1.0 * i for i in range(20)]

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)  # shuffle so random_state takes effect
    results = dict()
    fold_idx = 0
    fold_returned_dict = dict()
    selected_feats_dict = dict()
    for train_index, test_index in cv.split(X, y):
        print(fold_idx, "Train Index: ", train_index, "\n")
        print(fold_idx, "Test Index: ", test_index)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # save data to file for hsic lasso
        df_x = pd.DataFrame(X_train, columns=feature_names)
        df_y = pd.DataFrame(y_train, columns=['class'])
        fold_df = pd.concat([df_x, df_y], axis=1)
        fold_train_path = dir_out + str(fold_idx) + '.csv'
        fold_df.to_csv(fold_train_path)
        # end save data to file

        fold_dict = dict()
        ranked_feats = dict()
        for alg_name in fs_alg_names:
            if alg_name == 'mRMR' or alg_name == 'hsic_lasso':
                continue
            new_vals = non_dl_wrapper(alg_name, X_train, y_train,
                                      feature_names)
            for key in new_vals.keys():  # merge the two dictionaries
                print('alg_name1111: ', key)
                ranked_feats[key] = new_vals[key]

        # now evaluate the results according to the number of selected features
        for alg_name in fs_alg_names:
            print('alg_name2222: ', alg_name)
            alg_dict = dict()

            no_feats = len(feature_names)
            for per_feat in per_feats:
                num_feat = int(per_feat * no_feats / 100) + 1
                if alg_name == 'mRMR':
                    selected_feats = mmr_wrapper(X_train, y_train,
                                                 feature_names, num_feat)
                elif alg_name == 'hsic_lasso':
                    selected_feats = hsic_sel(fold_train_path, num_feat)
                else:
                    if num_feat == 0:
                        selected_feats = ranked_feats[alg_name]
                    else:
                        selected_feats = ranked_feats[alg_name][:num_feat]

                if alg_name not in ['mRMR', 'hsic_lasso']:
                    if per_feat == per_feats[-1]:
                        if alg_name not in selected_feats_dict:
                            selected_feats_dict[alg_name] = dict()
                        if fold_idx not in selected_feats_dict[alg_name]:
                            selected_feats_dict[alg_name][fold_idx] = selected_feats
                else:
                    if alg_name not in selected_feats_dict:
                        selected_feats_dict[alg_name] = dict()
                    if fold_idx not in selected_feats_dict[alg_name]:
                        selected_feats_dict[alg_name][fold_idx] = []
                    selected_feats_dict[alg_name][fold_idx].append(
                        'PER_FEAT_' + str(per_feat))
                    selected_feats_dict[alg_name][fold_idx].extend(
                        selected_feats)
                    print(selected_feats_dict)

                if alg_name not in fold_returned_dict:
                    fold_returned_dict[alg_name] = dict()
                if per_feat not in fold_returned_dict[alg_name]:
                    fold_returned_dict[alg_name][per_feat] = list()
                fold_returned_dict[alg_name][per_feat].append(selected_feats)

                scores = all_clf_evaluator(X_train, X_test, y_train, y_test,
                                           feature_names, selected_feats)
                alg_dict[per_feat] = scores

            fold_dict[alg_name] = alg_dict

        results[fold_idx] = fold_dict
        fold_idx += 1  # update the fold index

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')

    every_fold_scores = dict()
    for alg_name in fold_returned_dict.keys():
        if alg_name not in every_fold_scores:
            every_fold_scores[alg_name] = dict()
        for per_feat in fold_returned_dict[alg_name].keys():
            every_fold_scores[alg_name][per_feat] = get_smilarity_scores(
                fold_returned_dict[alg_name][per_feat], len(feature_names))

    # Update the returned results; every fold gets the same similarity scores
    for fold_idx in results.keys():
        for alg_name in every_fold_scores.keys():
            for per_feat in every_fold_scores[alg_name].keys():
                for key in every_fold_scores[alg_name][per_feat].keys():
                    results[fold_idx][alg_name][per_feat][key] = \
                        every_fold_scores[alg_name][per_feat][key]

    write_results(results, dir_out, selected_feats_dict)
    print('finish writing result')
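
A minimal way to drive the routine above; the paths are placeholders, not files from the original project:

# Hypothetical invocation with placeholder paths.
if __name__ == '__main__':
    cross_validate('data/my_dataset.csv', 'results/', random_state=42, normalize=True)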