def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    """Cross-validate the 'TSFS' feature ranking over a grid of feature percentages.

    The CSV is loaded, its ``class`` column is popped off as the label vector
    and every remaining column is used as a feature. For each of 10 stratified
    folds the ranking is computed on the training split with ``dfs`` and the
    classifiers behind ``all_clf_evaluator`` are scored on growing prefixes of
    that ranking. The nested results ``{fold: {algorithm: {percentage:
    scores}}}`` are handed to ``write_results``.

    Args:
        csv_path: Path of the input CSV; it must contain a ``class`` column.
        dir_out: Output location forwarded unchanged to ``write_results``.
        random_state: Kept for interface compatibility. The folds are built
            without shuffling, so the value has no influence on the split.
        normalize: When true, ``zscore`` is applied to every feature column
            before the split.

    Returns:
        None. Results are persisted through ``write_results``.
    """
    df = pd.read_csv(csv_path)
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    # presumably remaps the labels to the encoding the evaluators expect;
    # only the second return value is used here.
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = ['TSFS']
    # Percentages of the ranking to keep: 0, 0.5, ..., 19.5.
    # 0 is special-cased below to mean "the whole ranking".
    per_feats = [0.5 * i for i in range(int(20.0 / 0.5))]

    # random_state is deliberately NOT forwarded: with shuffle=False it never
    # had any effect, and scikit-learn >= 0.24 raises ValueError when both
    # are combined. Leaving it out keeps exactly the same deterministic folds.
    cv = StratifiedKFold(n_splits=10, shuffle=False)

    results = {}
    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        print("Train Index: ", train_index, "\n")
        print("Test Index: ", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Collect every ranking produced on this fold's training data.
        ranked_feats = {}
        for alg_name in fs_alg_names:
            new_vals = dfs(alg_name, X_train, y_train, feature_names)
            for key, ranking in new_vals.items():
                print('alg_name1111: ', key)
                ranked_feats[key] = ranking

        # Evaluate each ranking for every requested percentage of features.
        fold_dict = {}
        for alg_name, ranking in ranked_feats.items():
            print('alg_name2222: ', alg_name)
            no_feats = len(ranking)
            alg_dict = {}
            for per_feat in per_feats:
                if per_feat == 0:
                    selected_feats = ranking
                else:
                    # NOTE(review): for short rankings this prefix can be
                    # empty (e.g. 0.5% of 100 features) - confirm that
                    # all_clf_evaluator copes with an empty selection.
                    selected_feats = ranking[:int(per_feat * no_feats / 100)]
                alg_dict[per_feat] = all_clf_evaluator(
                    X_train, X_test, y_train, y_test, feature_names,
                    selected_feats)
            fold_dict[alg_name] = alg_dict
        results[fold_idx] = fold_dict

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')
    write_results(results, dir_out)
    print('finish writing result')
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    """Cross-validate classical feature selectors over absolute feature counts.

    The CSV is loaded, its ``class`` column is popped off as the label vector
    and every remaining column is used as a feature. For each of 10 stratified
    folds the selectors are run on the training split (``non_dl_wrapper`` for
    ranking-based selectors, ``mmr_wrapper`` for 'mRMR') and the classifiers
    behind ``all_clf_evaluator`` are scored for every requested feature count.
    The nested results ``{fold: {algorithm: {num_feat: scores}}}`` are handed
    to ``write_results``. Feature counts larger than what an algorithm can
    supply are recorded as an empty dict.

    Args:
        csv_path: Path of the input CSV; it must contain a ``class`` column.
        dir_out: Output location forwarded unchanged to ``write_results``.
        random_state: Kept for interface compatibility. The folds are built
            without shuffling, so the value has no influence on the split.
        normalize: When true, ``zscore`` is applied to every feature column
            before the split.

    Returns:
        None. Results are persisted through ``write_results``.
    """
    df = pd.read_csv(csv_path)
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    # Round-trip through a one-column DataFrame: the labels are factorized a
    # second time and come back as a 2-D (n_samples, 1) array.
    y = pd.DataFrame(y, columns=['class'])
    y['class'] = pd.factorize(y['class'])[0] + 1
    y = y.values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    # presumably remaps the labels to the encoding the evaluators expect;
    # only the second return value is used here.
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    # NOTE(review): only 'boruta' is treated as disabled here; confirm that
    # the entries listed after it are meant to be active.
    fs_alg_names = [
        'mRMR',
        'lasso',
        'elastic_net',
        'svm_rfe',  # 'boruta',
        'fisher',
        'chi2',
        'cfs',
        'info_gain',
        'relieff',
    ]
    # 0 stands for the whole feature set.
    num_feats = [
        0, 30, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600,
        650, 700, 750, 800, 850, 900, 950, 1000,
    ]

    # random_state is deliberately NOT forwarded: with shuffle=False it never
    # had any effect, and scikit-learn >= 0.24 raises ValueError when both
    # are combined. Leaving it out keeps exactly the same deterministic folds.
    cv = StratifiedKFold(n_splits=10, shuffle=False)

    results = {}
    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        print("Train Index: ", train_index, "\n")
        print("Test Index: ", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Collect every ranking produced on this fold's training data.
        ranked_feats = {}
        for alg_name in fs_alg_names:
            if alg_name == 'mRMR':
                # mRMR selects per requested size (see below); the empty
                # list only reserves its slot in the evaluation loop.
                ranked_feats[alg_name] = []
                continue
            new_vals = non_dl_wrapper(alg_name, X_train, y_train,
                                      feature_names)
            for key, ranking in new_vals.items():
                print('alg_name1111: ', key)
                ranked_feats[key] = ranking

        # Evaluate each algorithm for every requested number of features.
        fold_dict = {}
        for alg_name, ranking in ranked_feats.items():
            print('alg_name2222: ', alg_name)
            is_mrmr = alg_name == 'mRMR'
            # mRMR has no precomputed ranking, so its upper bound is the
            # total number of features. Measuring the empty placeholder
            # instead would skip every non-zero feature count.
            available = len(feature_names) if is_mrmr else len(ranking)
            alg_dict = {}
            for num_feat in num_feats:
                if available < num_feat:
                    alg_dict[num_feat] = {}
                    continue
                if is_mrmr:
                    # NOTE(review): num_feat == 0 is forwarded as-is;
                    # confirm mmr_wrapper treats 0 as "all features".
                    selected_feats = mmr_wrapper(X_train, y_train,
                                                 feature_names, num_feat)
                elif num_feat == 0:
                    selected_feats = ranking
                else:
                    selected_feats = ranking[:num_feat]
                alg_dict[num_feat] = all_clf_evaluator(
                    X_train, X_test, y_train, y_test, feature_names,
                    selected_feats)
            fold_dict[alg_name] = alg_dict
        results[fold_idx] = fold_dict

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')
    write_results(results, dir_out)
    print('finish writing result')
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    """Cross-validate the 'MLP' and 'TSFS' rankings and add similarity scores.

    The CSV is loaded, its ``class`` column is popped off as the label vector
    and every remaining column is used as a feature. For each of 5 stratified
    folds the rankings are computed on the training split with ``dfs`` and the
    classifiers behind ``all_clf_evaluator`` are scored for every percentage
    in the grid. After all folds are done, ``get_smilarity_scores`` is called
    once per (algorithm, percentage) on the selections gathered across folds
    and its entries are copied into every fold's score dict. The results and
    the selection made at the largest percentage of every fold are handed to
    ``write_results``.

    Args:
        csv_path: Path of the input CSV; it must contain a ``class`` column.
        dir_out: Output location forwarded unchanged to ``write_results``.
        random_state: Kept for interface compatibility. The folds are built
            without shuffling, so the value has no influence on the split.
        normalize: When true, ``zscore`` is applied to every feature column
            before the split.

    Returns:
        None. Results are persisted through ``write_results``.
    """
    df = pd.read_csv(csv_path)
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    # presumably remaps the labels to the encoding the evaluators expect;
    # only the second return value is used here.
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = ['MLP', 'TSFS']
    # Percentages of the feature set to keep: 0.0, 1.0, ..., 19.0.
    # 0 is special-cased below to mean "all features".
    per_feats = [i * 1.0 for i in range(20)]

    # random_state is deliberately NOT forwarded: with shuffle=False it never
    # had any effect, and scikit-learn >= 0.24 raises ValueError when both
    # are combined. Leaving it out keeps exactly the same deterministic folds.
    cv = StratifiedKFold(n_splits=5, shuffle=False)

    results = {}
    # {algorithm: {percentage: [selection of fold 0, selection of fold 1, ...]}}
    fold_returned_dict = {}
    # {algorithm: {fold: selection at the largest percentage}}
    selected_feats_dict = {}
    no_feats = len(feature_names)

    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        print(fold_idx, " Train Index: ", train_index, "\n")
        print(fold_idx, " Test Index: ", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Collect every ranking produced on this fold's training data.
        ranked_feats = {}
        for alg_name in fs_alg_names:
            new_vals = dfs(alg_name, X_train, y_train, feature_names)
            for key, ranking in new_vals.items():
                print('alg_name1111: ', key)
                ranked_feats[key] = ranking

        # Evaluate each ranking for every requested percentage of features.
        fold_dict = {}
        for alg_name, ranking in ranked_feats.items():
            print('alg_name2222: ', alg_name)
            alg_dict = {}
            for per_feat in per_feats:
                if per_feat == 0:
                    selected_feats = feature_names
                else:
                    # "+ 1" guarantees at least one feature is selected.
                    selected_feats = ranking[:int(per_feat * no_feats / 100)
                                             + 1]

                if per_feat == per_feats[-1]:
                    # Remember the largest selection of this fold for the
                    # report written by write_results.
                    selected_feats_dict.setdefault(alg_name, {}).setdefault(
                        fold_idx, selected_feats)

                fold_returned_dict.setdefault(alg_name, {}).setdefault(
                    per_feat, []).append(selected_feats)

                alg_dict[per_feat] = all_clf_evaluator(
                    X_train, X_test, y_train, y_test, feature_names,
                    selected_feats)
            fold_dict[alg_name] = alg_dict
        results[fold_idx] = fold_dict

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')

    # Similarity of the selections across folds, per algorithm and percentage.
    every_fold_scores = {}
    for alg_name, per_feat_selections in fold_returned_dict.items():
        alg_scores = every_fold_scores.setdefault(alg_name, {})
        for per_feat, selections in per_feat_selections.items():
            alg_scores[per_feat] = get_smilarity_scores(
                selections, len(feature_names))

    # The similarity scores are a cross-fold quantity, so every fold receives
    # the same values next to its own classifier scores.
    for fold_results in results.values():
        for alg_name, alg_scores in every_fold_scores.items():
            for per_feat, similarity in alg_scores.items():
                fold_scores = fold_results[alg_name][per_feat]
                for key in similarity.keys():
                    fold_scores[key] = similarity[key]

    write_results(results, dir_out, selected_feats_dict)
    print('finish writing result')
def cross_validate(csv_path, dir_out, random_state=42, normalize=False):
    """Cross-validate classical selectors (incl. HSIC lasso) with similarity scores.

    The CSV is loaded, its ``class`` column is popped off as the label vector
    and every remaining column is used as a feature. For each of 5 stratified
    folds the training split is written to ``dir_out + '<fold>.csv'`` (input
    for ``hsic_sel``), the ranking-based selectors are run through
    ``non_dl_wrapper`` and the classifiers behind ``all_clf_evaluator`` are
    scored for every percentage in the grid. After all folds are done,
    ``get_smilarity_scores`` is called once per (algorithm, percentage) on the
    selections gathered across folds and its entries are copied into every
    fold's score dict. Results and the recorded selections are handed to
    ``write_results``.

    Args:
        csv_path: Path of the input CSV; it must contain a ``class`` column.
        dir_out: Output location. It is used as a plain string prefix for the
            per-fold training CSVs and forwarded to ``write_results``.
        random_state: Kept for interface compatibility. The folds are built
            without shuffling, so the value has no influence on the split.
        normalize: When true, ``zscore`` is applied to every feature column
            before the split.

    Returns:
        None. Results are persisted through ``write_results``.
    """
    df = pd.read_csv(csv_path)
    df['class'] = pd.factorize(df['class'])[0] + 1
    y = df.pop('class').values
    # Round-trip through a one-column DataFrame: the labels are factorized a
    # second time and come back as a 2-D (n_samples, 1) array.
    y = pd.DataFrame(y, columns=['class'])
    y['class'] = pd.factorize(y['class'])[0] + 1
    y = y.values
    if normalize:
        df = df.apply(zscore)
    feature_names = np.array(df.columns.values)
    X = df.values
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    # presumably remaps the labels to the encoding the evaluators expect;
    # only the second return value is used here.
    _, y = change_class_labels(y)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    fs_alg_names = [
        'relieff', 'lasso', 'svm_rfe', 'elastic_net', 'hsic_lasso'
    ]
    # Algorithms that select per requested size instead of returning one
    # ranking up front.
    per_size_algs = ('mRMR', 'hsic_lasso')
    # Percentages of the feature set to keep: 0.0, 1.0, ..., 19.0.
    per_feats = [1.0 * i for i in range(20)]

    # random_state is deliberately NOT forwarded: with shuffle=False it never
    # had any effect, and scikit-learn >= 0.24 raises ValueError when both
    # are combined. Leaving it out keeps exactly the same deterministic folds.
    cv = StratifiedKFold(n_splits=5, shuffle=False)

    results = {}
    # {algorithm: {percentage: [selection of fold 0, selection of fold 1, ...]}}
    fold_returned_dict = {}
    # {algorithm: {fold: recorded selection(s)}}
    selected_feats_dict = {}
    no_feats = len(feature_names)

    for fold_idx, (train_index, test_index) in enumerate(cv.split(X, y)):
        print(fold_idx, "Train Index: ", train_index, "\n")
        print(fold_idx, "Test Index: ", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # hsic_sel works on a file, so dump this fold's training data first.
        df_x = pd.DataFrame(X_train, columns=feature_names)
        df_y = pd.DataFrame(y_train, columns=['class'])
        fold_df = pd.concat([df_x, df_y], axis=1)
        fold_train_path = dir_out + str(fold_idx) + '.csv'
        fold_df.to_csv(fold_train_path)

        # Collect the rankings of all ranking-based selectors on this fold.
        ranked_feats = {}
        for alg_name in fs_alg_names:
            if alg_name in per_size_algs:
                continue
            new_vals = non_dl_wrapper(alg_name, X_train, y_train,
                                      feature_names)
            for key, ranking in new_vals.items():
                print('alg_name1111: ', key)
                ranked_feats[key] = ranking

        # Evaluate each algorithm for every requested percentage of features.
        fold_dict = {}
        for alg_name in fs_alg_names:
            print('alg_name2222: ', alg_name)
            alg_dict = {}
            for per_feat in per_feats:
                # "+ 1" guarantees at least one feature, so num_feat >= 1.
                num_feat = int(per_feat * no_feats / 100) + 1
                if alg_name == 'mRMR':
                    selected_feats = mmr_wrapper(X_train, y_train,
                                                 feature_names, num_feat)
                elif alg_name == 'hsic_lasso':
                    # The original passed the undefined name `no_feat`
                    # (NameError); the requested size is `num_feat`.
                    selected_feats = hsic_sel(fold_train_path, num_feat)
                else:
                    selected_feats = ranked_feats[alg_name][:num_feat]

                if alg_name in per_size_algs:
                    # Selections of different sizes are not prefixes of one
                    # ranking, so log each of them behind a marker entry.
                    fold_log = selected_feats_dict.setdefault(
                        alg_name, {}).setdefault(fold_idx, [])
                    fold_log.append('PER_FEAT_' + str(per_feat))
                    fold_log.extend(selected_feats)
                    print(selected_feats_dict)
                elif per_feat == per_feats[-1]:
                    # For ranking-based selectors the largest prefix is
                    # enough to describe the fold.
                    selected_feats_dict.setdefault(alg_name, {}).setdefault(
                        fold_idx, selected_feats)

                fold_returned_dict.setdefault(alg_name, {}).setdefault(
                    per_feat, []).append(selected_feats)

                alg_dict[per_feat] = all_clf_evaluator(
                    X_train, X_test, y_train, y_test, feature_names,
                    selected_feats)
            fold_dict[alg_name] = alg_dict
        results[fold_idx] = fold_dict

    print('finish calculating results')
    print('Start writing results to ', dir_out, ' ....')

    # Similarity of the selections across folds, per algorithm and percentage.
    every_fold_scores = {}
    for alg_name, per_feat_selections in fold_returned_dict.items():
        alg_scores = every_fold_scores.setdefault(alg_name, {})
        for per_feat, selections in per_feat_selections.items():
            alg_scores[per_feat] = get_smilarity_scores(
                selections, len(feature_names))

    # The similarity scores are a cross-fold quantity, so every fold receives
    # the same values next to its own classifier scores.
    for fold_results in results.values():
        for alg_name, alg_scores in every_fold_scores.items():
            for per_feat, similarity in alg_scores.items():
                fold_scores = fold_results[alg_name][per_feat]
                for key in similarity.keys():
                    fold_scores[key] = similarity[key]

    write_results(results, dir_out, selected_feats_dict)
    print('finish writing result')