def DBN(X, y, feature_names):
    """Rank feature names using the first parameter of a trained DBN.

    The labels are remapped with ``change_class_labels``, the data is split
    80/20 into training and validation parts (fixed seed 42), and a model is
    fitted with ``dbn.train_model``.  The values of ``classifier.params[0]``
    are rounded to four decimals and paired with ``feature_names``.

    Args:
        X: Sample matrix handed to ``train_test_split``.
        y: Class labels, one per sample.
        feature_names: Names paired positionally with the entries of
            ``classifier.params[0]`` (assumes one scalar weight per
            feature -- TODO confirm against ``dbn.train_model``).

    Returns:
        list: Feature names ordered by descending rounded weight; equal
        weights are ordered by descending feature name.
    """
    _, y = change_class_labels(y)
    train_X, val_X, train_y, val_y = train_test_split(
        X, y, test_size=0.20, random_state=42)

    # train
    lambda1s = [0.005]  # numpy.arange(0.0700,-0.001,-0.001)
    # One training run per entry; the entry itself is not passed to the model.
    for _ in lambda1s:
        classifier, training_time = dbn.train_model(
            train_set_x_org=train_X,
            train_set_y_org=train_y,
            valid_set_x_org=val_X,
            valid_set_y_org=val_y)

    # the scores/ weights
    weights = classifier.params[0].get_value()
    # (weight, name) pairs sorted in place, largest weight first.
    scored = [(round(weight, 4), name)
              for weight, name in zip(weights, feature_names)]
    scored.sort(reverse=True)
    gc_collect()
    return [name for _, name in scored]
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    """Run 10-fold stratified cross-validation of the 'TSFS' feature selector.

    The CSV must contain a ``class`` column; every other column is treated
    as a feature.  For each fold the selector is run through ``dfs`` on the
    training part, and the resulting ranking is evaluated with
    ``all_clf_evaluator`` for a series of feature percentages.  The collected
    scores are handed to ``write_results``.

    Args:
        csv_path: Path of the input CSV file.
        dir_out: Output location forwarded to ``write_results``.
        random_state: Kept for interface compatibility.  The folds are built
            without shuffling, so the seed has no effect, and passing it to
            ``StratifiedKFold`` together with ``shuffle=False`` raises a
            ``ValueError`` on scikit-learn >= 0.24.
        normalize: When true, every feature column is z-scored before use.

    Returns:
        None.
    """
    df = pd.read_csv(csv_path)
    # Encode the class labels as integer codes starting at 1.
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = ['TSFS']  # other options: 'MLP', 'SCA', 'DBN'
    # Percentages of the ranking to keep: 0, 0.5, ..., 19.5.
    # 0 stands for the whole ranked feature set.
    per_feats = [0.5 * i for i in range(int(20.0 / 0.5))]
    # No random_state here: it is meaningless without shuffling and is
    # rejected by recent scikit-learn releases.
    cv = StratifiedKFold(n_splits=10, shuffle=False)

    results = dict()
    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        print("Train Index: ", train_index, "\n")
        print("Test Index: ", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        fold_dict = dict()
        ranked_feats = dict()
        for alg_name in fs_alg_names:
            new_vals = dfs(alg_name, X_train, y_train, feature_names)
            for key in new_vals.keys():  # join two dictionary
                print('alg_name1111: ', key)
                ranked_feats[key] = new_vals[key]

        # now evaluate the results according to the number of selected features
        for alg_name, ranking in ranked_feats.items():
            print('alg_name2222: ', alg_name)
            alg_dict = dict()
            no_feats = len(ranking)
            for per_feat in per_feats:
                if per_feat == 0:
                    selected_feats = ranking
                else:
                    # NOTE(review): for small percentages the truncated count
                    # can be 0, which yields an empty selection -- confirm
                    # that all_clf_evaluator copes with that.
                    selected_feats = ranking[:int(per_feat * no_feats / 100)]
                scores = all_clf_evaluator(X_train, X_test, y_train, y_test,
                                           feature_names, selected_feats)
                alg_dict[per_feat] = scores
            fold_dict[alg_name] = alg_dict
        results[fold_idx] = fold_dict

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')
    write_results(results, dir_out)
    print('finish writing result')
def MLP(X, y, feature_names):
    """Rank feature names using the first parameter of a trained MLP.

    The labels are remapped with ``change_class_labels``, the data is split
    80/20 into training and validation parts (fixed seed 42), and a model is
    fitted with ``mlp.train_model``.  The values of ``classifier.params[0]``
    are rounded to four decimals and paired with ``feature_names``.

    Args:
        X: Sample matrix handed to ``train_test_split``.
        y: Class labels, one per sample.
        feature_names: Names paired positionally with the entries of
            ``classifier.params[0]`` (assumes one scalar weight per
            feature -- TODO confirm against ``mlp.train_model``).

    Returns:
        list: Feature names ordered by descending rounded weight; equal
        weights are ordered by descending feature name.
    """
    _, y = change_class_labels(y)
    train_X, val_X, train_y, val_y = train_test_split(
        X, y, test_size=0.20, random_state=42)

    lambda1s = [0.01]
    # One training run per entry; the entry itself is not passed to the model.
    for _ in lambda1s:
        classifier, training_time = mlp.train_model(
            train_set_x_org=train_X,
            train_set_y_org=train_y,
            valid_set_x_org=val_X,
            valid_set_y_org=val_y)

    # the scores/ weights
    weights = classifier.params[0].get_value()
    # Build (feat_weight, feat_name) pairs and order them by descending
    # feat_weight.
    scored = [(round(weight, 4), name)
              for weight, name in zip(weights, feature_names)]
    scored.sort(reverse=True)
    gc_collect()
    return [name for _, name in scored]
def cross_validate(csv_path, dir_out, file_name, random_state=42):
    """Split a data set into 5 stratified folds and write each fold to CSV.

    The CSV must contain a ``class`` column; every other column is treated
    as a feature and z-scored.  For every fold a training file
    ``<dir_out><file_name>_train_<k>.csv`` and a test file
    ``<dir_out><file_name>_test_<k>.csv`` are written, each holding the
    feature columns followed by a ``class`` column.

    Args:
        csv_path: Path of the input CSV file.
        dir_out: Prefix of the output paths.  It is concatenated directly
            with ``file_name``, so it must already end with a path separator.
        file_name: Base name used for the per-fold files.
        random_state: Kept for interface compatibility.  The folds are built
            without shuffling, so the seed has no effect, and passing it to
            ``StratifiedKFold`` together with ``shuffle=False`` raises a
            ``ValueError`` on scikit-learn >= 0.24.

    Returns:
        None.
    """
    df = pd.read_csv(csv_path)
    # Encode the class labels as integer codes starting at 1.
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    # No random_state here: it is meaningless without shuffling and is
    # rejected by recent scikit-learn releases.
    cv = StratifiedKFold(n_splits=5, shuffle=False)
    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Re-attach the labels as the last column of each fold's table.
        train_df = pd.concat([
            pd.DataFrame(X_train, columns=feature_names),
            pd.DataFrame(y_train, columns=['class']),
        ], axis=1)
        test_df = pd.concat([
            pd.DataFrame(X_test, columns=feature_names),
            pd.DataFrame(y_test, columns=['class']),
        ], axis=1)

        train_fold_path = dir_out + file_name + '_train_' + str(
            fold_idx) + '.csv'
        test_fold_path = dir_out + file_name + '_test_' + str(
            fold_idx) + '.csv'
        train_df.to_csv(train_fold_path, index=False)
        test_df.to_csv(test_fold_path, index=False)
    print('finish writing result')
by concatnating the rows of the original sample. [2]. A txt file including the class labels. Each row is a string (white space not allowed) as the class label of the corresponding row in [1]. [3]. A txt file including the name of features. Each row is a string (white space not allowed) as the feature name of the corresponding column in [1]. """ data_dir = "/home/yifeng/YifengLi/Research/deep/extended_deep/v1_0/data/" # train set filename = data_dir + "GM12878_200bp_Data_3Cl_l2normalized_TrainSet.txt" train_set_x_org = numpy.loadtxt(filename, delimiter='\t', dtype='float32') filename = data_dir + "GM12878_200bp_Classes_3Cl_l2normalized_TrainSet.txt" train_set_y_org = numpy.loadtxt(filename, delimiter='\t', dtype=object) prev, train_set_y_org = cl.change_class_labels(train_set_y_org) # valid set filename = data_dir + "GM12878_200bp_Data_3Cl_l2normalized_ValidSet.txt" valid_set_x_org = numpy.loadtxt(filename, delimiter='\t', dtype='float32') filename = data_dir + "GM12878_200bp_Classes_3Cl_l2normalized_ValidSet.txt" valid_set_y_org = numpy.loadtxt(filename, delimiter='\t', dtype=object) prev, valid_set_y_org = cl.change_class_labels(valid_set_y_org) # test set filename = data_dir + "GM12878_200bp_Data_3Cl_l2normalized_TestSet.txt" test_set_x_org = numpy.loadtxt(filename, delimiter='\t', dtype='float32') filename = data_dir + "GM12878_200bp_Classes_3Cl_l2normalized_TestSet.txt" test_set_y_org = numpy.loadtxt(filename, delimiter='\t', dtype=object) prev, test_set_y_org = cl.change_class_labels(test_set_y_org) filename = data_dir + "GM12878_Features_Unique.txt" features = numpy.loadtxt(filename, delimiter='\t', dtype=object)
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    """Run 10-fold stratified cross-validation of classical feature selectors.

    The CSV must contain a ``class`` column; every other column is treated
    as a feature.  For each fold every algorithm except mRMR is run through
    ``non_dl_wrapper`` on the training part to obtain a ranking; mRMR is
    invoked per requested feature count through ``mmr_wrapper``.  Each
    selection is scored with ``all_clf_evaluator`` and the collected scores
    are handed to ``write_results``.

    Args:
        csv_path: Path of the input CSV file.
        dir_out: Output location forwarded to ``write_results``.
        random_state: Kept for interface compatibility.  The folds are built
            without shuffling, so the seed has no effect, and passing it to
            ``StratifiedKFold`` together with ``shuffle=False`` raises a
            ``ValueError`` on scikit-learn >= 0.24.
        normalize: When true, every feature column is z-scored before use.

    Returns:
        None.
    """
    df = pd.read_csv(csv_path)
    # Encode the class labels as integer codes starting at 1.
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    # Round-trip through a one-column DataFrame; this leaves y as an
    # (n_samples, 1) array before it reaches change_class_labels.
    y = pd.DataFrame(y, columns=['class'])
    y['class'] = pd.factorize(y['class'])[0] + 1
    y = y.values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = [
        'mRMR',
        'lasso',
        'elastic_net',
        'svm_rfe',
        # 'boruta',
        'fisher',
        'chi2',
        'cfs',
        'info_gain',
        'relieff',
    ]  # further candidates: 'jackstraw', 'stability_selection'
    num_feats = [
        0, 30, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600,
        650, 700, 750, 800, 850, 900, 950, 1000
    ]  # 0 for the whole feature set
    # No random_state here: it is meaningless without shuffling and is
    # rejected by recent scikit-learn releases.
    cv = StratifiedKFold(n_splits=10, shuffle=False)

    results = dict()
    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        print("Train Index: ", train_index, "\n")
        print("Test Index: ", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        fold_dict = dict()
        ranked_feats = dict()
        for alg_name in fs_alg_names:
            if alg_name == 'mRMR':
                # Placeholder so mRMR takes part in the evaluation loop; its
                # features are selected per feature count further below.
                ranked_feats[alg_name] = []
                continue
            new_vals = non_dl_wrapper(alg_name, X_train, y_train,
                                      feature_names)
            for key in new_vals.keys():  # join two dictionary
                print('alg_name1111: ', key)
                ranked_feats[key] = new_vals[key]

        # now evaluate the results according to the number of selected features
        for alg_name, ranking in ranked_feats.items():
            print('alg_name2222: ', alg_name)
            is_mrmr = alg_name == 'mRMR'
            # mRMR only has the empty placeholder as its ranking, so bound
            # the request by the number of available features instead;
            # otherwise every positive feature count would be skipped.
            available = len(feature_names) if is_mrmr else len(ranking)
            alg_dict = dict()
            for num_feat in num_feats:
                if available < num_feat:
                    # Not enough features for this setting: record no scores.
                    alg_dict[num_feat] = {}
                    continue
                if is_mrmr:
                    selected_feats = mmr_wrapper(X_train, y_train,
                                                 feature_names, num_feat)
                elif num_feat == 0:
                    selected_feats = ranking
                else:
                    selected_feats = ranking[:num_feat]
                scores = all_clf_evaluator(X_train, X_test, y_train, y_test,
                                           feature_names, selected_feats)
                alg_dict[num_feat] = scores
            fold_dict[alg_name] = alg_dict
        results[fold_idx] = fold_dict

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')
    write_results(results, dir_out)
    print('finish writing result')
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    """Run 5-fold stratified cross-validation of the 'MLP' and 'TSFS' selectors.

    The CSV must contain a ``class`` column; every other column is treated
    as a feature.  For each fold the selectors are run through ``dfs`` on the
    training part and the rankings are scored with ``all_clf_evaluator`` for
    the percentages 0.0, 1.0, ..., 19.0.  After all folds, the selections
    made for each (algorithm, percentage) pair across the folds are passed
    to ``get_smilarity_scores`` and the returned entries are merged into
    every fold's score dictionary.  The results, together with the
    selection made at the last percentage of each fold, are handed to
    ``write_results``.

    Args:
        csv_path: Path of the input CSV file.
        dir_out: Output location forwarded to ``write_results``.
        random_state: Kept for interface compatibility.  The folds are built
            without shuffling, so the seed has no effect, and passing it to
            ``StratifiedKFold`` together with ``shuffle=False`` raises a
            ``ValueError`` on scikit-learn >= 0.24.
        normalize: When true, every feature column is z-scored before use.

    Returns:
        None.
    """
    df = pd.read_csv(csv_path)
    # Encode the class labels as integer codes starting at 1.
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = ['MLP', 'TSFS']
    # Percentages of the feature set to keep; 0 stands for all features.
    per_feats = [i * 1.0 for i in range(20)]
    no_feats = len(feature_names)
    # No random_state here: it is meaningless without shuffling and is
    # rejected by recent scikit-learn releases.
    cv = StratifiedKFold(n_splits=5, shuffle=False)

    results = dict()
    # alg_name -> per_feat -> list of selections, one entry per fold.
    fold_returned_dict = dict()
    # alg_name -> fold_idx -> selection made at the last percentage.
    selected_feats_dict = dict()
    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        print(fold_idx, " Train Index: ", train_index, "\n")
        print(fold_idx, " Test Index: ", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        fold_dict = dict()
        ranked_feats = dict()
        for alg_name in fs_alg_names:
            new_vals = dfs(alg_name, X_train, y_train, feature_names)
            for key in new_vals.keys():  # join two dictionary
                print('alg_name1111: ', key)
                ranked_feats[key] = new_vals[key]

        # now evaluate the results according to the number of selected features
        for alg_name, ranking in ranked_feats.items():
            print('alg_name2222: ', alg_name)
            alg_dict = dict()
            for per_feat in per_feats:
                if per_feat == 0:
                    selected_feats = feature_names
                else:
                    # "+ 1" guarantees at least one selected feature.
                    selected_feats = ranking[:int(per_feat * no_feats / 100) +
                                             1]

                if per_feat == per_feats[-1]:
                    # Remember the largest selection of this fold.
                    selected_feats_dict.setdefault(alg_name, dict()).setdefault(
                        fold_idx, selected_feats)

                fold_returned_dict.setdefault(alg_name, dict()).setdefault(
                    per_feat, list()).append(selected_feats)

                scores = all_clf_evaluator(X_train, X_test, y_train, y_test,
                                           feature_names, selected_feats)
                alg_dict[per_feat] = scores
            fold_dict[alg_name] = alg_dict
        results[fold_idx] = fold_dict

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')

    # Compute the cross-fold similarity entries once per
    # (algorithm, percentage) pair.
    every_fold_scores = dict()
    for alg_name, selections_by_per_feat in fold_returned_dict.items():
        every_fold_scores[alg_name] = {
            per_feat: get_smilarity_scores(selections, len(feature_names))
            for per_feat, selections in selections_by_per_feat.items()
        }

    # Every fold receives the same similarity entries.
    for fold_results in results.values():
        for alg_name, scores_by_per_feat in every_fold_scores.items():
            for per_feat, similarity in scores_by_per_feat.items():
                target = fold_results[alg_name][per_feat]
                for key in similarity.keys():
                    target[key] = similarity[key]

    write_results(results, dir_out, selected_feats_dict)
    print('finish writing result')
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    """Run 5-fold stratified cross-validation of classical feature selectors.

    The CSV must contain a ``class`` column; every other column is treated
    as a feature.  For each fold the training part is written to
    ``<dir_out><fold_idx>.csv`` (input for ``hsic_sel``), rankings are
    obtained through ``non_dl_wrapper``, and selections for the percentages
    0.0, 1.0, ..., 19.0 are scored with ``all_clf_evaluator``.  After all
    folds, the selections made for each (algorithm, percentage) pair across
    the folds are passed to ``get_smilarity_scores`` and the returned
    entries are merged into every fold's score dictionary before everything
    is handed to ``write_results``.

    Args:
        csv_path: Path of the input CSV file.
        dir_out: Output location.  It is concatenated directly with the fold
            index to build the per-fold CSV path, so it must already end
            with a path separator; it is also forwarded to ``write_results``.
        random_state: Kept for interface compatibility.  The folds are built
            without shuffling, so the seed has no effect, and passing it to
            ``StratifiedKFold`` together with ``shuffle=False`` raises a
            ``ValueError`` on scikit-learn >= 0.24.
        normalize: When true, every feature column is z-scored before use.

    Returns:
        None.
    """
    df = pd.read_csv(csv_path)
    # Encode the class labels as integer codes starting at 1.
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    # Round-trip through a one-column DataFrame; this leaves y as an
    # (n_samples, 1) array before it reaches change_class_labels.
    y = pd.DataFrame(y, columns=['class'])
    y['class'] = pd.factorize(y['class'])[0] + 1
    y = y.values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = [
        'relieff', 'lasso', 'svm_rfe', 'elastic_net', 'hsic_lasso'
    ]  # further candidates: 'cfs', 'info_gain', 'jackstraw', 'stability_selection'
    # Algorithms that select per feature count instead of ranking once.
    per_count_algs = ['mRMR', 'hsic_lasso']
    # Percentages of the feature set to keep.
    per_feats = [1.0 * i for i in range(20)]
    no_feats = len(feature_names)
    # No random_state here: it is meaningless without shuffling and is
    # rejected by recent scikit-learn releases.
    cv = StratifiedKFold(n_splits=5, shuffle=False)

    results = dict()
    # alg_name -> per_feat -> list of selections, one entry per fold.
    fold_returned_dict = dict()
    # alg_name -> fold_idx -> recorded selection(s), see below.
    selected_feats_dict = dict()
    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        print(fold_idx, "Train Index: ", train_index, "\n")
        print(fold_idx, "Test Index: ", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # save data to file for hsic lasso
        df_x = pd.DataFrame(X_train, columns=feature_names)
        df_y = pd.DataFrame(y_train, columns=['class'])
        fold_df = pd.concat([df_x, df_y], axis=1)
        fold_train_path = dir_out + str(fold_idx) + '.csv'
        fold_df.to_csv(fold_train_path)
        # end save data to file

        fold_dict = dict()
        ranked_feats = dict()
        for alg_name in fs_alg_names:
            if alg_name == 'mRMR' or alg_name == 'hsic_lasso':
                continue
            new_vals = non_dl_wrapper(alg_name, X_train, y_train,
                                      feature_names)
            for key in new_vals.keys():  # join two dictionary
                print('alg_name1111: ', key)
                ranked_feats[key] = new_vals[key]

        # now evaluate the results according to the number of selected features
        for alg_name in fs_alg_names:
            print('alg_name2222: ', alg_name)
            alg_dict = dict()
            for per_feat in per_feats:
                # "+ 1" guarantees at least one selected feature, so the
                # count is never 0.
                num_feat = int(per_feat * no_feats / 100) + 1
                if alg_name == 'mRMR':
                    selected_feats = mmr_wrapper(X_train, y_train,
                                                 feature_names, num_feat)
                elif alg_name == 'hsic_lasso':
                    # Pass the requested feature count; the previous code
                    # referenced the undefined name ``no_feat`` here.
                    selected_feats = hsic_sel(fold_train_path, num_feat)
                else:
                    selected_feats = ranked_feats[alg_name][:num_feat]

                if alg_name not in per_count_algs:
                    if per_feat == per_feats[-1]:
                        # Ranking-based selections are prefixes of each
                        # other, so only the largest one is recorded.
                        selected_feats_dict.setdefault(
                            alg_name, dict()).setdefault(fold_idx,
                                                         selected_feats)
                else:
                    # Per-count selectors: record every selection in one flat
                    # list, each preceded by a 'PER_FEAT_<p>' marker.
                    recorded = selected_feats_dict.setdefault(
                        alg_name, dict()).setdefault(fold_idx, [])
                    recorded.append('PER_FEAT_' + str(per_feat))
                    recorded.extend(selected_feats)
                    print(selected_feats_dict)

                fold_returned_dict.setdefault(alg_name, dict()).setdefault(
                    per_feat, list()).append(selected_feats)

                scores = all_clf_evaluator(X_train, X_test, y_train, y_test,
                                           feature_names, selected_feats)
                alg_dict[per_feat] = scores
            fold_dict[alg_name] = alg_dict
        results[fold_idx] = fold_dict

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')

    # Compute the cross-fold similarity entries once per
    # (algorithm, percentage) pair.
    every_fold_scores = dict()
    for alg_name, selections_by_per_feat in fold_returned_dict.items():
        every_fold_scores[alg_name] = {
            per_feat: get_smilarity_scores(selections, len(feature_names))
            for per_feat, selections in selections_by_per_feat.items()
        }

    # Every fold receives the same similarity entries.
    for fold_results in results.values():
        for alg_name, scores_by_per_feat in every_fold_scores.items():
            for per_feat, similarity in scores_by_per_feat.items():
                target = fold_results[alg_name][per_feat]
                for key in similarity.keys():
                    target[key] = similarity[key]

    write_results(results, dir_out, selected_feats_dict)
    print('finish writing result')