def select_multisurf(X, y, percentile=10):
    """Fit a MultiSURF selector keeping the top `percentile`% of features.

    Args:
        X: feature matrix of shape (n_samples, n_features).
        y: target vector of length n_samples.
        percentile: percentage of features to select (default 10).

    Returns:
        The fitted MultiSURF selector.
    """
    # Bug fix: n_features_to_select counts FEATURES, so the percentile must be
    # taken from X.shape[1] (feature count), not X.shape[0] (sample count).
    num = math.ceil(X.shape[1] * percentile / 100)
    selector = MultiSURF(n_features_to_select=num, discrete_threshold=3, n_jobs=-1)
    selector.fit(X, y)
    return selector
def fit(self, X, y=None, **kwargs):
    """Validate the inputs, rank features with MultiSURF, and store the support.

    Returns self to allow chained calls.
    """
    X, y = self.check_X_y(X, y)
    self.check_params(X, y)
    # rank all features, then keep the indices of the top num_features
    ranker = MultiSURF(n_features_to_select=self.num_features)
    ranker.fit(X, y)
    top = ranker.top_features_[:self.num_features]
    self.support = self.check_support(top)
    return self
def multisurf_fs(X_df, X_train_all, X_test_all, y_train):
    '''MultiSURF for feature selection'''
    ranker = MultiSURF(discrete_threshold=1000, n_jobs=1)
    ranker.fit(X_train_all, y_train)
    feature_scores = ranker.feature_importances_
    # keep every feature whose importance score is non-negative
    feature_ids = np.flatnonzero(feature_scores >= 0)
    selected_features = np.array(X_df.columns[feature_ids])
    # New X_train and X_test matrices restricted to the surviving columns
    X_train = X_train_all[:, feature_ids]
    X_test = X_test_all[:, feature_ids]
    return selected_features, feature_scores, X_train, X_test
def test_multisurf_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): MultiSURF works with pandas DataFrame and Series inputs"""
    np.random.seed(320931)
    selector = MultiSURF(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)
    fold_scores = cross_val_score(pipeline, features_df, labels_s, cv=3, n_jobs=-1)
    assert np.mean(fold_scores) > 0.7
def test_multisurf_pipeline():
    """Ensure that MultiSURF works in a sklearn pipeline when it is parallelized"""
    np.random.seed(320931)
    selector = MultiSURF(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)
    fold_scores = cross_val_score(pipeline, features, labels, cv=3)
    assert np.mean(fold_scores) > 0.7
def test_multisurf_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): MultiSURF works in a sklearn pipeline when MultiSURF is parallelized"""
    np.random.seed(320931)
    selector = MultiSURF(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)
    fold_scores = cross_val_score(pipeline, features, labels, cv=3)
    assert np.mean(fold_scores) > 0.7
def test_multisurf_pipeline_cont_endpoint():
    """Check: Data (Continuous Endpoint): MultiSURF works in a sklearn pipeline"""
    np.random.seed(320931)
    pipeline = make_pipeline(MultiSURF(n_features_to_select=2),
                             RandomForestRegressor(n_estimators=100, n_jobs=-1))
    mean_score = np.mean(
        cross_val_score(pipeline, features_cont_endpoint, labels_cont_endpoint, cv=3, n_jobs=-1))
    # on this endpoint the regressor is expected to score near zero either way
    assert abs(mean_score) < 0.5
def test_multisurf_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): MultiSURF works in a sklearn pipeline"""
    np.random.seed(320931)
    pipeline = make_pipeline(MultiSURF(n_features_to_select=2),
                             RandomForestClassifier(n_estimators=100, n_jobs=-1))
    mean_score = np.mean(
        cross_val_score(pipeline, features_mixed_attributes, labels_mixed_attributes, cv=3, n_jobs=-1))
    assert mean_score > 0.7
def test_multisurf_pipeline_missing_values():
    """Ensure that MultiSURF works in a sklearn pipeline with missing values"""
    np.random.seed(320931)
    steps = [MultiSURF(n_features_to_select=2, n_jobs=-1),
             Imputer(),
             RandomForestClassifier(n_estimators=100, n_jobs=-1)]
    pipeline = make_pipeline(*steps)
    mean_score = np.mean(
        cross_val_score(pipeline, features_missing_values, labels_missing_values, cv=3))
    assert mean_score > 0.7
def run_multisurf(xTrain, yTrain, cv_count, data_name, output_folder, randSeed, ordered_feature_names, algorithm):
    """Fit MultiSURF on one CV training fold and save the sorted importance scores."""
    # Run multisurf; scores are written to a per-fold text file
    out_file = output_folder + '/' + algorithm + '_' + data_name + '_' + str(
        cv_count) + '_Train.txt'
    model = MultiSURF().fit(xTrain, yTrain)
    importance = model.feature_importances_
    scoreDict, score_sorted_features = sort_save_fi_scores(
        importance, ordered_feature_names, algorithm, out_file)
    return importance, scoreDict, score_sorted_features
def rebate(df, target, n_features):
    """
    Run the ReBATE relief algorithm on a dataframe, returning the reduced df.

    Args:
        df (pandas.DataFrame): A dataframe
        target (str): The target key (must be present in df)
        n_features (int): The number of features desired to be returned.

    Returns:
        pandas.DataFrame: df restricted to the n_features selected columns,
        ordered by MultiSURF importance rank.
    """
    X = df.drop(target, axis=1)
    y = df[target]
    rf = MultiSURF(n_features_to_select=n_features, n_jobs=-1)
    rf.fit(X.values, y.values)
    # top_features_ is skrebate's importance-ranked list of column indices, and
    # transform() returns exactly X[:, top_features_[:n]]. Reading it directly
    # replaces the previous O(features^2 * rows) np.array_equal matching loop,
    # which could also map a selected column to the wrong name when two columns
    # held identical values.
    feats = [X.columns.values[i] for i in rf.top_features_[:n_features]]
    return df[feats]
def get_selector(name, estimator=None, n_features_to_select=None, **params):
    """Build a feature selector by name, forwarding only the kwargs it accepts.

    Raises:
        ValueError: if `name` does not match any known selector.
    """
    # -- estimator-wrapping selectors -------------------------------------
    if name == 'RobustSelector':
        return RobustSelector(estimator, n_features_to_select=n_features_to_select,
                              **search_dict(params, ('cv', 'verbose')))
    if name == 'MaxFeatures':
        return SelectFromModel(estimator, threshold=-np.inf,
                               max_features=n_features_to_select)
    if name == 'RandomSubsetSelector':
        return RandomSubsetSelector(estimator, n_features_to_select=n_features_to_select,
                                    **search_dict(params, ('n_subsets', 'subset_size', 'random_state')))
    if name == 'FeatureImportanceThreshold':
        return SelectFromModel(estimator, **search_dict(params, 'threshold'))
    if name == 'RFE':
        return RFE(estimator, n_features_to_select=n_features_to_select,
                   **search_dict(params, ('step', 'verbose')))
    if name == 'RFECV':
        return RFECV(estimator, n_features_to_select=n_features_to_select,
                     **search_dict(params, ('step', 'cv', 'verbose')))
    # -- filter-style selectors -------------------------------------------
    if name == 'FoldChangeFilter':
        return FoldChangeFilter(**search_dict(params, ('threshold', 'direction', 'below', 'pseudo_count')))
    if name == 'ZeroFractionFilter':
        return ZeroFractionFilter(**search_dict(params, ('threshold',)))
    if name == 'RpkmFilter':
        return RpkmFilter(**search_dict(params, ('threshold',)))
    if name == 'RpmFilter':
        return RpmFilter(**search_dict(params, ('threshold',)))
    if name == 'DiffExpFilter':
        return DiffExpFilter(max_features=n_features_to_select,
                             **search_dict(params, ('threshold', 'script', 'temp_dir', 'score_type', 'method')))
    # -- relief-based selectors (skrebate imported lazily) ----------------
    if name == 'ReliefF':
        from skrebate import ReliefF
        return ReliefF(n_features_to_select=n_features_to_select,
                       **search_dict(params, ('n_jobs', 'n_neighbors', 'discrete_limit')))
    if name == 'SURF':
        from skrebate import SURF
        return SURF(n_features_to_select=n_features_to_select,
                    **search_dict(params, ('n_jobs', 'discrete_limit')))
    if name == 'MultiSURF':
        from skrebate import MultiSURF
        return MultiSURF(n_features_to_select=n_features_to_select,
                         **search_dict(params, ('n_jobs', 'discrete_limit')))
    # -- misc --------------------------------------------------------------
    if name == 'SIS':
        return SIS(n_features_to_select=n_features_to_select,
                   **search_dict(params, ('temp_dir', 'sis_params')))
    if name == 'NullSelector':
        return NullSelector()
    raise ValueError('unknown selector: {}'.format(name))
def job(cv_train_path, experiment_path, random_state, class_label, instance_label,
        instance_subset, algorithm, njobs, use_TURF, TURF_pct):
    """Phase-3 feature importance job for one CV training file.

    Loads the CSV at cv_train_path, scores every feature with either mutual
    information (algorithm == 'mi') or MultiSURF (algorithm == 'ms'), saves the
    sorted scores via sort_save_fi_scores, pickles the results for phase 4,
    records the runtime, and writes a completion marker file.

    Raises:
        Exception: if algorithm is neither 'mi' nor 'ms'.
    """
    job_start_time = time.time()
    random.seed(random_state)
    np.random.seed(random_state)
    # dataset name is encoded in the directory layout: .../<dataset>/<...>/<file>
    dataset_name = cv_train_path.split('/')[-3]
    data = pd.read_csv(cv_train_path, sep=',')
    # instance_label arrives as the literal string 'None' when absent
    if instance_label != 'None':
        dataFeatures = data.drop([class_label, instance_label], axis=1).values
    else:
        dataFeatures = data.drop([class_label], axis=1).values
    dataOutcome = data[class_label].values
    header = data.columns.values.tolist()
    header.remove(class_label)
    if instance_label != 'None':
        header.remove(instance_label)
    # CV index is the second-to-last '_'-separated token of the filename
    cvCount = cv_train_path.split('/')[-1].split("_")[-2]
    # use_TURF is passed as a string flag; anything except 'False' enables TURF
    use_TURF = use_TURF != 'False'
    # Mutual Information
    if algorithm == 'mi':
        # Run Mutual Information
        outname = "mutualinformation"
        outpath = experiment_path + '/' + dataset_name + "/" + outname + "/scores_cv_" + str(
            cvCount) + '.csv'
        scores = mutual_info_classif(dataFeatures, dataOutcome, random_state=random_state)
    # MultiSURF
    elif algorithm == 'ms':
        # Format instance sampled dataset (prevents MultiSURF from running a
        # very long time in large instance spaces): append the outcome column,
        # subsample at most instance_subset rows, then split it back off.
        formatted = np.insert(dataFeatures, dataFeatures.shape[1], dataOutcome, 1)
        choices = np.random.choice(formatted.shape[0],
                                   min(instance_subset, formatted.shape[0]),
                                   replace=False)
        newL = []
        for i in choices:
            newL.append(formatted[i])
        formatted = np.array(newL)
        dataFeatures = np.delete(formatted, -1, axis=1)
        dataPhenotypes = formatted[:, -1]
        # Run MultiSURF (optionally wrapped in TURF for iterative filtering)
        outname = "multisurf"
        outpath = experiment_path + '/' + dataset_name + "/" + outname + "/scores_cv_" + str(
            cvCount) + '.csv'
        if use_TURF:
            clf = TURF(MultiSURF(n_jobs=njobs), pct=TURF_pct).fit(dataFeatures, dataPhenotypes)
        else:
            clf = MultiSURF(n_jobs=njobs).fit(dataFeatures, dataPhenotypes)
        scores = clf.feature_importances_
    else:
        raise Exception("Feature importance algorithm not found")
    # Save sorted feature importance scores:
    scoreDict, score_sorted_features = sort_save_fi_scores(
        scores, header, outpath, outname)
    # Save CV scores to a pickled file for phase 4
    if not os.path.exists(experiment_path + '/' + dataset_name + "/" + outname +
                          "/pickledForPhase4"):
        os.mkdir(experiment_path + '/' + dataset_name + "/" + outname + "/pickledForPhase4")
    outfile = open(
        experiment_path + '/' + dataset_name + "/" + outname + "/pickledForPhase4/" +
        str(cvCount), 'wb')
    pickle.dump([scores, scoreDict, score_sorted_features], outfile)
    outfile.close()
    # Save Runtime
    runtime_file = open(
        experiment_path + '/' + dataset_name + '/runtime/runtime_' + outname + '_CV_' +
        str(cvCount) + '.txt', 'w')
    runtime_file.write(str(time.time() - job_start_time))
    runtime_file.close()
    # Print completion
    print(dataset_name + " CV" + str(cvCount) + " phase 3 " + outname + " evaluation complete")
    job_file = open(
        experiment_path + '/jobsCompleted/job_' + outname + '_' + dataset_name + '_' +
        str(cvCount) + '.txt', 'w')
    job_file.write('complete')
    job_file.close()
def rank_features_by_rebate_methods(data_split_list, fs_method, iterate, remove_percent=0.1, verbose=False):
    """Rank features on each training fold using a ReBATE relief-based method.

    Args:
        data_split_list: list of per-fold tuples
            (feat_train, label_train, feat_test, label_test); the train parts
            are pandas objects (`.values` / `.columns` are used).
        fs_method: feature ranking method: 'SURF', 'SURFstar', 'MultiSURF',
            or 'MultiSURFstar'.
        iterate: whether to implement TURF: True or False (TURF removes
            low-ranking features after each iteration; effective when the
            number of features is large).
        remove_percent: percentage of features removed at each iteration
            (only applied when iterate is True).
        verbose: whether to show progress by each fold.

    Returns:
        pandas.DataFrame: feature importance scores, one column per fold,
        indexed by feature name.

    Raises:
        ValueError: if fs_method is not one of the supported method names
            (previously an unknown name left `fs` unbound and crashed later
            with UnboundLocalError).
    """
    ## 1. Build the feature ranking model
    if fs_method == 'SURF':
        fs = TuRF(core_algorithm='SURF', pct=remove_percent) if iterate else SURF()
    elif fs_method == 'SURFstar':
        fs = TuRF(core_algorithm='SURFstar', pct=remove_percent) if iterate else SURFstar()
    elif fs_method == 'MultiSURF':
        fs = TuRF(core_algorithm='MultiSURF', pct=remove_percent) if iterate else MultiSURF()
    elif fs_method == 'MultiSURFstar':
        fs = TuRF(core_algorithm='MultiSURFstar', pct=remove_percent) if iterate else MultiSURFstar()
    else:
        raise ValueError('unknown feature ranking method: ' + str(fs_method))
    ## 2. Perform feature ranking on each fold of training data
    feat_impt_dict = {}
    for i in range(len(data_split_list)):
        if verbose:
            print('Computing feature importance scores using data from fold ' + str(i) + '\n')
        # obtain training feature matrix and response vector
        feat_train, label_train, _, _ = data_split_list[i]
        # TuRF's fit additionally requires the list of feature names
        if iterate:
            fs.fit(feat_train.values, label_train.values, list(feat_train))
        else:
            fs.fit(feat_train.values, label_train.values)
        # one column of importance scores per fold
        feat_impt_dict['Fold_' + str(i)] = fs.feature_importances_
    # aggregate results from multiple folds into one data frame
    feat_impt_df = pd.DataFrame(feat_impt_dict)
    feat_impt_df.index = feat_train.columns
    return feat_impt_df
# NOTE(review): collapsed fragment — it begins mid-function (the enclosing `def`
# is not visible; `acc_arr`, `chosen_names`, `X_train`, `X_test`, `y_test` come
# from that unseen scope) and its final `for a in range(10)` loop body is
# truncated at the fragment boundary. Kept byte-identical rather than
# restructured; the visible tail trains/scores a RandomForest, then script-level
# code parses a dataset, runs MultiSURF, and tracks the best-scoring feature.
clf = RandomForestClassifier().fit(X_train, y_train) tmp_score = clf.score(X_test, y_test) acc_arr.append(tmp_score) return np.mean(acc_arr), chosen_names try: X, y, names, groups, _ = ml_data_parser(argv[1]) except IndexError: X, y, names, groups, _ = ml_data_parser('30_data.csv') # X,y,names,groups,_=ml_data_parser('74_data.csv') # print('graph_threshold,multisurf_threshold,model_type,model_acc,num_vars') X = np.array(X) y = np.array(y) num_vars = len(names) fs = MultiSURF().fit(X, y) ms_array = list(fs.feature_importances_) feature_importance = {} num_dic = {} trans_x = np.transpose(X) max_val = 0 for i in range(num_vars): feature_importance[names[i]] = ms_array[i] num_dic[i] = ms_array[i] if max_val < num_dic[i]: max_val = num_dic[i] best_feature = i for a in range(10): x1 = X[:, best_feature].reshape(-1, 1) group_kfold = GroupKFold(n_splits=10) group_kfold.get_n_splits(x1, y, groups)
def job(experiment_path, cv):
    """Run one cross-validation fold end to end.

    Loads the phase-1 pickle, scores features with MultiSURF on a subsample of
    the training data, trains an ExSTraCS model seeded with those scores as
    expert knowledge, then exports per-instance and aggregate test accuracy,
    the train/test datasets, normalized attribute-tracking scores, and the
    runtime.
    """
    job_start_time = time.time()
    # Load the phase-1 pickle: configuration values plus the per-fold splits.
    file = open(experiment_path+'/phase1pickle', 'rb')
    phase1_pickle = pickle.load(file)
    file.close()
    # Fixed positional layout of the phase-1 pickle — indices must match the
    # phase-1 writer (note indices 2 and 9 are unused here).
    cv_info = phase1_pickle[0]
    learning_iterations = phase1_pickle[3]
    N = phase1_pickle[4]
    nu = phase1_pickle[5]
    attribute_tracking_method = phase1_pickle[6]
    random_state = phase1_pickle[7]
    class_label = phase1_pickle[8]
    feature_selection_sample_size = phase1_pickle[10]
    rule_compaction_method = phase1_pickle[11]
    data_headers = phase1_pickle[1][2]
    # Per-fold train/test splits plus instance/group label metadata.
    train_data_features = cv_info[cv][0]
    train_data_phenotypes = cv_info[cv][1]
    train_instance_labels = cv_info[cv][2]
    train_group_labels = cv_info[cv][3]
    test_data_features = cv_info[cv][4]
    test_data_phenotypes = cv_info[cv][5]
    test_instance_labels = cv_info[cv][6]
    test_group_labels = cv_info[cv][7]
    inst_label = cv_info[cv][8]
    group_label = cv_info[cv][9]
    # Create CV directory
    if not os.path.exists(experiment_path + '/CV_' + str(cv)):
        os.mkdir(experiment_path + '/CV_' + str(cv))
    # MultiSURF Feature Scoring: append the phenotype column, subsample at most
    # feature_selection_sample_size rows (keeps MultiSURF tractable), split back.
    merged = np.insert(train_data_features, train_data_features.shape[1], train_data_phenotypes, 1)
    rb_sample = np.random.choice(merged.shape[0], min(feature_selection_sample_size,merged.shape[0]), replace=False)
    new_data = []
    for i in rb_sample:
        new_data.append(merged[i])
    new_data = np.array(new_data)
    data_featuresR = np.delete(new_data, -1, axis=1)
    data_phenotypesR = new_data[:, -1]
    featureimportance_model = MultiSURF()
    featureimportance_model.fit(data_featuresR, data_phenotypesR)
    scores = featureimportance_model.feature_importances_
    # Train ExSTraCS Model — MultiSURF scores are injected as expert knowledge.
    model = ExSTraCS(learning_iterations=learning_iterations, N=N, nu=nu,attribute_tracking_method=attribute_tracking_method,
                     rule_compaction=rule_compaction_method,random_state=random_state,do_correct_set_subsumption=False,expert_knowledge=scores)
    model.fit(train_data_features, train_data_phenotypes)
    outfile = open(experiment_path + '/CV_' + str(cv) + '/model', 'wb')
    pickle.dump(model, outfile)
    outfile.close()
    # Export Testing Accuracy for each instance (1 = correct prediction)
    predicted_data_phenotypes = model.predict(test_data_features)
    equality = np.equal(predicted_data_phenotypes, test_data_phenotypes)
    with open(experiment_path + '/CV_' + str(cv) + '/instTestingAccuracy.csv', mode='w') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([inst_label, 'isCorrect'])
        for i in range(len(test_instance_labels)):
            writer.writerow([test_instance_labels[i], 1 if equality[i] else 0])
    file.close()  # redundant — the with-block already closed the file
    # Export Aggregate Testing Accuracy
    outfile = open(experiment_path + '/CV_' + str(cv) + '/testingAccuracy.txt', mode='w')
    outfile.write(str(model.score(test_data_features, test_data_phenotypes)))
    outfile.close()
    # Save train and testing datasets into csvs
    with open(experiment_path + '/CV_' + str(cv) + '/trainDataset.csv', mode='w') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(list(data_headers) + [class_label, inst_label, group_label])
        for i in range(len(train_instance_labels)):
            writer.writerow(list(train_data_features[i]) + [train_data_phenotypes[i]] + [train_instance_labels[i]] + [
                train_group_labels[i]])
    file.close()  # redundant — the with-block already closed the file
    with open(experiment_path + '/CV_' + str(cv) + '/testDataset.csv', mode='w') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(list(data_headers) + [class_label, inst_label, group_label])
        for i in range(len(test_instance_labels)):
            writer.writerow(list(test_data_features[i]) + [test_data_phenotypes[i]] + [test_instance_labels[i]] + [
                test_group_labels[i]])
    file.close()  # redundant — the with-block already closed the file
    # Get AT Scores for each instance
    AT_scores = model.get_attribute_tracking_scores(instance_labels=np.array(train_instance_labels))
    # Normalize AT Scores by each instance's max score.
    # NOTE(review): `normalized` aliases AT_scores[i][1], so the division also
    # mutates AT_scores in place — presumably intentional, but confirm nothing
    # reads AT_scores afterward expecting raw values.
    normalized_AT_scores = []
    for i in range(len(AT_scores)):
        normalized = AT_scores[i][1]
        max_score = max(normalized)
        for j in range(len(normalized)):
            if max_score != 0:
                normalized[j] /= max_score
            else:
                normalized[j] = 0
        normalized_AT_scores.append(list(normalized))
    # Save Normalized AT Scores
    with open(experiment_path + '/CV_' + str(cv) + '/normalizedATScores.csv', mode='w') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([inst_label] + list(data_headers))
        for i in range(len(train_instance_labels)):
            writer.writerow([train_instance_labels[i]] + normalized_AT_scores[i])
    file.close()  # redundant — the with-block already closed the file
    # Save Runtime
    runtime_file = open(experiment_path + '/CV_' + str(cv) + '/runtime.txt', 'w')
    runtime_file.write(str(time.time() - job_start_time))
    runtime_file.close()
    # Print completion
    print('CV '+str(cv) + " phase 1 complete")
def __init__(self, estimator=None, k=None):
    """Store the feature-scoring estimator and the number of features to keep.

    Args:
        estimator: relief-style feature scorer; defaults to a fresh MultiSURF()
            per instance. (The previous `estimator=MultiSURF()` default was
            evaluated once at import time, so every instance shared the same
            estimator object — a mutable-default bug once any instance fit it.)
        k: number of features to select (None as declared here).
    """
    # build the default lazily so instances never share one estimator object
    self.estimator = MultiSURF() if estimator is None else estimator
    self.k = k