def mrmr(self):
    """Run mRMR feature (channel) selection on the averaged EEG data.

    Supports the 32-channel and 62-channel montages, chosen by
    self.x.shape[2].  Returns the channel names selected by pymrmr
    (self.scheme is "MID" or "MIQ"; self.channel is how many to keep),
    or None for an unsupported channel count (matching the original
    implicit-None behaviour).

    Improvement: the two branches were byte-for-byte duplicates except
    for the channel-name list, so the shared logic is now written once.
    """
    # Channel-name tables keyed by electrode count.
    channel_names = {
        32: ["Fp1", "AF3", "F3", "F7", "FC5", "FC1", "C3", "T7", "CP5",
             "CP1", "P3", "P7", "PO3", "O1", "Oz", "Pz", "Fp2", "AF4",
             "Fz", "F4", "F8", "Fc6", "Fc2", "Cz", "C4", "T8", "CP6",
             "CP2", "P4", "P8", "PO4", "O2"],
        62: ["FP1", "FPZ", "FP2", "AF3", "AF4", "F7", "F5", "F3", "F1",
             "FZ", "F2", "F4", "F6", "F8", "FT7", "FC5", "FC3", "FC1",
             "FCZ", "FC2", "FC4", "FC6", "FT8", "T7", "C5", "C3", "C1",
             "CZ", "C2", "C4", "C6", "T8", "TP7", "CP5", "CP3", "CP1",
             "CPZ", "CP2", "CP4", "CP6", "TP8", "P7", "P5", "P3", "P1",
             "PZ", "P2", "P4", "P6", "P8", "PO7", "PO5", "PO3", "POZ",
             "PO4", "PO6", "PO8", "CB1", "O1", "OZ", "O2", "CB2"],
    }
    cols = channel_names.get(self.x.shape[2])
    if cols is None:
        # Unsupported montage: keep the original implicit-None behaviour.
        return None
    # Average over the last axis, then flatten the first two axes to rows.
    mean_x = np.mean(self.x, axis=3, keepdims=True)
    mean_x = mean_x.reshape(mean_x.shape[0] * mean_x.shape[1],
                            mean_x.shape[2])
    df = pd.DataFrame(mean_x, columns=cols)
    self.target = self.target.reshape(
        self.target.shape[0] * self.target.shape[1], 1)
    df["Class"] = self.target
    df["Class"] = df["Class"].astype(int)
    # pymrmr expects the class/target column first, features after it.
    col = df.columns.tolist()
    df = df[col[-1:] + col[:-1]]
    return pymrmr.mRMR(df, self.scheme, self.channel)
def processing_mrmr(df, n_components, mrmr_method='MIQ'):
    """Select n_components features with mRMR, ensuring 'DX' is retained.

    If the target column 'DX' is itself picked, one extra feature is
    requested; 'DX' is appended when it is not already among the results.
    Returns (df restricted to the selected columns, selected column names).
    """
    selected = pymrmr.mRMR(df, mrmr_method, n_components)
    if 'DX' in selected:
        # The target column was picked as a "feature": ask for one more.
        print('Issue with MRMR - need next feature')
        selected = pymrmr.mRMR(df, mrmr_method, n_components + 1)
    if 'DX' not in selected:
        selected.append('DX')
    return df[selected], selected
def mRMR(self, X_train, X_test, y_train, feat_names, **kwargs):
    """Discretise the training features and select the top columns via mRMR.

    kwargs: outliers (bool, clip to 1st..99th percentile before binning),
    n_bins (int), method ('MID'/'MIQ'), retain_ratio (float or None).
    Returns df[selected] — the un-binned training columns picked by mRMR.
    X_test is accepted for signature compatibility but not used here.

    BUG FIX: top_n was assigned from retain_ratio *before* the
    `if top_n is None` fallback, so the "keep all columns" path was dead
    code.  The fallback is now keyed on retain_ratio itself.
    """
    outliers = kwargs["outliers"]
    n_bins = kwargs["n_bins"]
    method = kwargs["method"]
    retain_ratio = kwargs["retain_ratio"]
    if retain_ratio is None:
        top_n = X_train.shape[1]
    else:
        top_n = int(retain_ratio * len(feat_names))
    if y_train.dtype != int:
        le = LabelEncoder()
        y_train = le.fit_transform(y_train).astype(int)
    feat_names = list(feat_names)
    # pymrmr expects the label as the first column.
    df = pd.DataFrame(np.hstack((y_train[:, np.newaxis], X_train)),
                      columns=["label"] + feat_names)
    df_bin = df.copy()
    for f in feat_names:
        series = df[f]
        if outliers:
            # Clip to the 1st..99th percentile before binning.  Skip
            # constant columns: qcut crashes when all values are equal.
            if not np.all(series.values == series.values[0]):
                _, bins = pd.qcut(series + self.jitter(series),
                                  np.linspace(0, 1, 100), retbins=True)
                first_perc, ninetyninth_perc = bins[0], bins[-1]
                series = np.maximum(series, first_perc)
                series = np.minimum(series, ninetyninth_perc)
        df_bin[f] = pd.cut(series, bins=n_bins,
                           labels=np.arange(0, n_bins))
    which_features = pymrmr.mRMR(df_bin, method, top_n)
    return df[which_features]
def FV_mRMR(self):
    """Run mRMR (MIQ, top 50) over saved Fisher-vector features.

    Iterates every model directory listed in <save_dir>/model_list.txt;
    for each one that has saved train/dev Fisher vectors for self.kernel,
    selects 50 features on the combined train+dev data and saves the
    feature list and the reduced train/dev matrices back into that
    directory.
    """
    print("\nrunning mRMR algorithm for feature selection")
    ae = AutoEncoder('fv_gmm', 0)
    with smart_open(os.path.join(ae.save_dir, 'model_list.txt'), 'rb',
                    encoding='utf-8') as model_path:
        for line_no, line in enumerate(model_path):
            line = str(line).replace('\n', '')
            # Print only the tail of the path for readability.
            print(line_no, '\t', line[65:])
            # Process a directory only when both train and dev Fisher
            # vectors exist for the current kernel.
            if os.path.isfile(
                    os.path.join(
                        line,
                        'fisher_vector_train_%d.npy' % self.kernel)) and os.path.isfile(
                            os.path.join(
                                line,
                                'fisher_vector_dev_%d.npy' % self.kernel)):
                X_train = np.load(
                    os.path.join(
                        line, 'fisher_vector_train_%d.npy' % self.kernel))
                X_dev = np.load(
                    os.path.join(line,
                                 'fisher_vector_dev_%d.npy' % self.kernel))
                y_train = np.load(os.path.join(line, 'label_train.npy'))
                y_dev = np.load(os.path.join(line, 'label_dev.npy'))
                # Flatten each sample's Fisher vector to 1-D.
                X_train = np.reshape(X_train,
                                     (-1, np.prod(X_train.shape[1:])))
                X_dev = np.reshape(X_dev, (-1, np.prod(X_dev.shape[1:])))
                X_train = np.nan_to_num(X_train)
                X_dev = np.nan_to_num(X_dev)
                # pymrmr wants one DataFrame with the label first.
                df = pd.DataFrame(np.vstack((X_train, X_dev)))
                df.columns = [
                    'feature_%d' % i for i in range(len(X_train[0]))
                ]
                df.insert(0, 'label', np.hstack((y_train, y_dev)).T)
                print(df.head())
                feature_list = pymrmr.mRMR(df, 'MIQ', 50)
                np.save(os.path.join(line, 'feature_list'), feature_list)
                # Reduce train and dev matrices to the selected columns.
                X_train_df = pd.DataFrame(X_train)
                X_train_df.columns = [
                    'feature_%d' % i for i in range(len(X_train[0]))
                ]
                X_train = X_train_df.loc[:, feature_list]
                X_dev_df = pd.DataFrame(X_dev)
                X_dev_df.columns = [
                    'feature_%d' % i for i in range(len(X_dev[0]))
                ]
                X_dev = X_dev_df.loc[:, feature_list]
                print(X_train.head())
                print(X_dev.head())
                np.save(os.path.join(line, 'X_train_mrmr'), X_train)
                np.save(os.path.join(line, 'X_dev_mrmr'), X_dev)
    print("\nfeature selection done and data saved.")
def execute(data, cols):
    """Rank every column of `data` with mRMR (MID scheme)."""
    print("====== mRMR Feature Ranking =====")
    ranking = pymrmr.mRMR(data, 'MID', len(cols))
    # NOTE(review): the ranking is computed but intentionally not
    # returned — the original "return ranking" was commented out by the
    # author, so the placeholder string is preserved here.
    #return ranking
    return '-- Not working --'
def get_filtered_data_frame_columns(df: pd.DataFrame, mrmr=False, features_left_cnt=10):
    """Pick `features_left_cnt` feature columns (the target is column 0).

    Uses mRMR when requested and only a few columns must be dropped;
    otherwise ranks features by mutual information with the target.
    Returns the target column name followed by the kept feature names.
    """
    if features_left_cnt >= len(df.columns) - 1:
        # Nothing to filter out.
        return df.columns
    if mrmr and len(df.columns) - features_left_cnt < 10:
        import pymrmr
        return [df.columns.values[0]] + pymrmr.mRMR(
            df, 'MID', features_left_cnt)
    data = df.to_numpy()
    correlations = feature_selection.mutual_info_regression(
        data[:, 1:], data[:, 0])
    # The (features_left_cnt+1)-th largest score acts as the cut-off.
    threshold = sorted(correlations, reverse=True)[features_left_cnt]
    selected = []
    # Take strictly-above-threshold columns first, then fill with ties,
    # stopping once the requested count is reached.
    for keep in (lambda c: c > threshold, lambda c: c == threshold):
        for i, col in enumerate(df.columns[1:]):
            if len(selected) < features_left_cnt and keep(correlations[i]):
                selected.append(col)
    return [df.columns.values[0]] + selected
def GetSelectedFeatureIndex(self, data_container):
    """Select features with mRMR; return (indices, ranks, names).

    Each feature column is L2-normalised first (note: this modifies the
    array returned by GetArray() in place).  The requested feature count
    is clamped to the number of available columns.
    """
    data = data_container.GetArray()
    data /= np.linalg.norm(data, ord=2, axis=0)
    label = data_container.GetLabel()
    if data.shape[1] < self.GetSelectedFeatureNumber():
        # Cannot select more features than exist: clamp the request.
        print(
            'mMRM: The number of features {:d} in data container is smaller than the required number {:d}'
            .format(data.shape[1], self.GetSelectedFeatureNumber()))
        self.SetSelectedFeatureNumber(data.shape[1])
    # pymrmr expects the class column first, then the features.
    feature_list = ['class'] + data_container.GetFeatureName()
    feature_index = []
    pd_label = pd.DataFrame(label)
    pd_data = pd.DataFrame(data)
    mRMR_input = pd.concat([pd_label, pd_data], axis=1)
    mRMR_input.columns = feature_list
    # parameter_list[0]['mutual_information'] supplies the pymrmr scheme
    # (presumably 'MID' or 'MIQ' — confirm against the config files under
    # the Windows-style HyperParameters\FeatureSelector path).
    parameter_list = self.LoadFeatureSelectorParameterList(
        relative_path=r'HyperParameters\FeatureSelector')
    feature_name = pymrmr.mRMR(mRMR_input,
                               parameter_list[0]['mutual_information'],
                               self.GetSelectedFeatureNumber())
    # Drop 'class' so positions map back onto the feature columns.
    feature_list.remove('class')
    rank = []
    for index, item in enumerate(feature_name):
        feature_index.append(feature_list.index(item))
        rank.append(index)
    return feature_index, rank, feature_name
def select_features(X, y, modality, method, n_feats):
    """Select `n_feats` features from X using the requested method.

    method: 'mrmr', 'ttest', 'chi-squared' or 'minfo'; `modality`
    controls the pre-filtering/discretisation applied first.
    Returns the list of selected feature names.

    Improvement: removed the large blocks of commented-out dead code and
    switched modality checks to tuple membership; logic is unchanged.
    """
    if method == 'mrmr':
        if modality in ('gene', 'meth'):
            # Pre-filter with a t-test down to 2000 features.
            init_feats = reduce(X, y, 2000)
            X = X.loc[:, init_feats]
        elif modality == 'CNV':
            # Pre-filter with chi-squared down to 2000 features.
            init_feats = chi(X, y, 2000)
            X = X.loc[:, init_feats]
        # Discretise; 4th param is the number of std-devs from the mean
        # used as the discretisation threshold.
        X, y = discretize(X, y, modality, .5)
        print('check 2')
        # pymrmr wants one frame with the response as the first column.
        z = pd.concat([y, X], axis=1)
        feat_selected = pymrmr.mRMR(z, 'MIQ', n_feats)
    elif method == 'ttest':
        feat_selected = reduce(X, y, n_feats)
    elif method == 'chi-squared':
        feat_selected = chi(X, y, n_feats)
    elif method == 'minfo':
        if modality == 'miRNA':
            X, y = discretize(X, y, modality, 2)
        if modality in ('gene', 'meth'):
            X, y = discretize(X, y, modality, 1)
            init_feats = chi(X, y, 5000)
            X = X.loc[:, init_feats]
        if modality == 'CNV':
            # Chi-squared pre-filter only (discretisation deliberately
            # disabled for CNV in this variant).
            init_feats = chi(X, y, 5000)
            X = X.loc[:, init_feats]
        feat_selected = minfo(X, y, n_feats)
    return feat_selected
def mRMR(self):
    """Report mRMR feature rankings (MID and MIQ) for classification data."""
    frame = self.data.copy()
    # pymrmr requires the target column to be named 'class'; the target
    # is assumed to be the last column.
    frame.columns = list(frame.columns)[:-1] + ['class']
    if self.type != CLASSIFICATION:
        print(
            "mRMR is designed to be used in for classification, not regression "
        )
        return
    for scheme in ('MID', 'MIQ'):
        ranked = pymrmr.mRMR(frame, scheme, self.num_top_features)
        self.report_feature_importance(ranked, self.num_top_features,
                                       label="mMRM - " + scheme)
def mRMR(x_train, y_train, n_features):
    """Return positional indices of the top-n mRMR (MIQ) features.

    BUG FIX: the original inserted the 'class' column directly into the
    caller's DataFrame, permanently mutating x_train as a side effect.
    The work is now done on a copy; the returned indices are still
    computed against the augmented frame (class column at position 0),
    exactly as before.

    NOTE(review): because the indices include the inserted 'class'
    column, they are off by one relative to x_train's own columns —
    confirm callers expect this before changing it.
    """
    frame = x_train.copy()
    frame.insert(loc=0, column='class', value=y_train)
    features = pymrmr.mRMR(frame, 'MIQ', n_features)
    column_name = frame.columns.tolist()
    return [column_name.index(feature) for feature in features]
def fit(self, X, y):
    """Fit the selector: store the indices of the k mRMR-chosen columns."""
    _, n_features = X.shape
    # Build a DataFrame with the label first and features named "0".."d-1",
    # the layout pymrmr expects.
    frame = pd.DataFrame(
        np.concatenate([y[:, None], X], axis=1),
        columns=["label"] + [str(i) for i in range(n_features)],
    )
    # Suppress pymrmr's console chatter.
    with silence():
        picked = pymrmr.mRMR(frame, 'MIQ', self.k)
    self.index = np.array([int(name) for name in picked])
    return self
def dim_reduction_mRMR(df, k):
    """Return the k best feature names of `df` according to mRMR (MIQ).

    Expected input layout: the first column is the class label and the
    remaining columns are features (e.g. rename column 0 to 'class' and
    the rest to 'Feat1'..'FeatN' before calling).
    """
    import pymrmr
    started_at = datetime.now()
    selected = pymrmr.mRMR(df, 'MIQ', k)
    func.execution_time(started_at)
    # Beep to signal that the (long-running) selection has finished.
    func.beeep()
    return selected
def fit(self, X, y):
    """Fit: build a boolean keep-mask over X's columns via mRMR.

    X is wrapped as a DataFrame with stringified-index column names and
    y (a Series/DataFrame) prepended as the label column for pymrmr.

    Improvement: membership testing now uses a set (O(1) per lookup
    instead of a list scan) and the mask is built with a comprehension.
    """
    X_frame = pd.DataFrame(X, index=list(y.index),
                           columns=[str(i) for i in range(X.shape[1])])
    data_frame = pd.concat([y, X_frame], axis=1)
    selected = set(
        pymrmr.mRMR(data_frame, self.selection_method, self.selected_num))
    self.selected_mask = [col in selected
                          for col in X_frame.columns.tolist()]
    return self
def mrmr_feature(csv_path, feature_number_list):
    """For each requested feature count, run mRMR (MIQ) on the CSV data
    and copy the selected columns (header row + values) into an xlwt
    spreadsheet sheet.

    NOTE(review): the workbook is never saved — there is no
    book.save(...) call — so each sheet built here is discarded when the
    loop iterates.  Confirm whether a save was intended.
    """
    df = pd.read_csv(csv_path)
    for feature_number in feature_number_list:
        result = pymrmr.mRMR(df, 'MIQ', feature_number)
        book = xlwt.Workbook()
        sheet1 = book.add_sheet(u'sheet1', cell_overwrite_ok=True)
        for i in range(len(result)):
            name = result[i]
            # Row 0 holds the column name; rows 1..N hold the values.
            for j in range(len(df[name]) + 1):
                if j == 0:
                    sheet1.write(j, i, name)
                else:
                    sheet1.write(j, i, float(df[name][j - 1]))
def _mRMR(self, n, method='MIQ', is_discrete=True, nscale=1):
    '''minimum Redundancy Maximum Relevance algorithm.

    Selects n features from self.X (discretising it first when
    is_discrete is False) and keeps only those columns in self.X.

    BUG FIX: the `method` parameter was previously ignored — 'MIQ' was
    hard-coded in the pymrmr.mRMR call.
    '''
    sX = self.X.copy()
    if not is_discrete:
        log.info(f'Discretising X using scale = scale * {nscale}')
        sX = discretise(sX, nscale)
    # pymrmr needs the target as the first column.
    sX.insert(0, self.y.columns[0], self.y.iloc[:, 0])
    log.info(f'Starting mRMR ({method}, n={n})')
    feats = pymrmr.mRMR(sX, method, n)
    log.info(f'Updating dataset, {len(feats)} features')
    self.X = self.X[feats]
def findBestFeaturesMRMR(self):
    """Pick the best mRMR feature-subset size by 4-fold cross-validation.

    For k = 1..len(self.features), selects the top-k features with mRMR
    (MID), evaluates each subset with self.model via K-fold log-loss on
    self.data (label in column 0), and returns the best subset's names.

    Fixes relative to the original:
    - the CV loop iterated a hard-coded range(0, 15) instead of the
      number of candidate subsets actually built;
    - the loss average used a hard-coded divisor 4 instead of n_splits;
    - the best-so-far tracker stored the *total* loss but compared it
      against the next candidate's *average* loss;
    - the initial best loss of 100 could reject every candidate.
    """
    n_subsets = len(self.features)
    # Candidate subsets: the top-(i+1) mRMR features for each i.
    feature_set = {i: pymrmr.mRMR(self.data, 'MID', i + 1)
                   for i in range(n_subsets)}
    print(len(feature_set))
    # Map each subset's feature names to column indices in self.data.
    index_feature_set = {}
    for key, value in feature_set.items():
        index_feature_set[key] = [list(self.data.columns).index(v)
                                  for v in value]
    # Cross-validate each candidate subset.
    best_loss = float('inf')
    best_index = 0
    n_splits = 4
    for i in range(n_subsets):
        model = self.model
        kf = KFold(n_splits=n_splits)
        total_loss = 0
        for train_index, test_index in kf.split(self.data.iloc[:, 1:]):
            train_X = self.data.iloc[train_index, index_feature_set[i]]
            test_X = self.data.iloc[test_index, index_feature_set[i]]
            train_y = self.data.iloc[train_index, 0]
            test_y = self.data.iloc[test_index, 0]
            model.fit(train_X.values, list(train_y.values))
            y_pred = model.predict_proba(test_X.values)
            total_loss += log_loss(list(test_y.values), y_pred)
        mean_loss = total_loss / n_splits
        if mean_loss < best_loss:
            best_loss = mean_loss
            best_index = i
    return [self.data.columns[x] for x in index_feature_set[best_index]]
def mRMR_sel(X_tr, X_te, y_tr, k, feat_name):
    """Select k features with mRMR after an ANOVA pre-filter to 500.

    Returns (X_train_selected, X_test_selected, selected_feature_names).
    """
    X_tr, X_te, feat_name = select_fs_alg('anova', X_tr, X_te, y_tr, 500,
                                          feat_name)
    # If the pre-filter already left fewer than k features, keep them all.
    if X_tr.shape[1] < k:
        return X_tr, X_te, feat_name
    # Assemble a label-first DataFrame for pymrmr ('tar' = target).
    stacked = np.concatenate([np.expand_dims(y_tr, 1), X_tr], axis=1)
    all_names = np.hstack((np.array('tar'), feat_name))
    train_df = pd.DataFrame(stacked, columns=all_names)
    test_df = pd.DataFrame(X_te, columns=feat_name)
    mr_feat = pymrmr.mRMR(train_df, 'MIQ', k)
    return np.array(train_df[mr_feat]), np.array(test_df[mr_feat]), mr_feat
def select_features(X, y, modality, method, n_feats):
    """Dispatch feature selection by `method`; returns selected names.

    method: 'mrmr', 'ttest', 'chi-squared' or 'minfo'; `modality`
    controls the pre-filtering/discretisation applied first.
    """
    if method == 'mrmr':
        # Modality-specific pre-filtering before mRMR.
        if modality == 'gene' or modality == 'meth':
            # t-test pre-filter down to 2000 features.
            keep = reduce(X, y, 2000)
            X = X.loc[:, keep]
        elif modality == 'CNV':
            X, y = discretize(X, y, modality, 1)
            keep = chi(X, y, 2000)
            X = X.loc[:, keep]
        # 4th arg = number of std-devs from the mean used as the
        # discretisation threshold.
        X, y = discretize(X, y, modality, .5)
        merged = pd.concat([y, X], axis=1)
        feat_selected = pymrmr.mRMR(merged, 'MIQ', n_feats)
    elif method == 'ttest':
        feat_selected = reduce(X, y, n_feats)
    elif method == 'chi-squared':
        X, y = discretize(X, y, modality, .3)
        feat_selected = chi(X, y, n_feats)
    elif method == 'minfo':
        if modality == 'miRNA':
            X, y = discretize(X, y, modality, 2)
        elif modality == 'gene' or modality == 'meth':
            X, y = discretize(X, y, modality, 1)
            keep = chi(X, y, 5000)
            X = X.loc[:, keep]
        elif modality == 'CNV':
            X, y = discretize(X, y, modality, 1)
            keep = chi(X, y, 1000)
            X = X.loc[:, keep]
        feat_selected = minfo(X, y, n_feats)
    return feat_selected
def select_features(X, y, selection_algorithm="mRMR", num_of_features=10):
    """Reduce X to `num_of_features` columns using the chosen algorithm."""
    selection_algorithm = selection_algorithm.lower()
    assert selection_algorithm in (
        "mrmr", "select_k_best", "rrelief"), "Invalid selection algorithm."
    print(
        f"Selecting features with {selection_algorithm} selection algorithm...."
    )
    if selection_algorithm == "mrmr":
        chosen = mRMR(X, 'MIQ', num_of_features)
        X_selected_features = X[chosen]
    elif selection_algorithm == "select_k_best":
        # raises KeyError - debug required
        X_selected_features = SelectKBest(
            chi2, k=num_of_features).fit_transform(X, y)
    else:
        # rrelief: runs by default on all processors concurrently.
        r = relief.Relief(n_features=num_of_features)
        X_selected_features = r.fit_transform(X, y)
    print("Feature selection finished....")
    return X_selected_features
def _choose_fea(self, filename, num):
    """Run mRMR (MID) on <filename>.txt and write a reduced copy.

    The input file is space-separated with a header row; feature columns
    are assumed to be named by their stringified index '1'..'N'.  The
    selected columns (plus the first column, kept as the label) are
    saved to '<filename>ch' via self.saveData.
    """
    myfilename = filename + '.txt'
    data = pd.read_csv(myfilename, sep=' ')  # load as a pandas table
    self.val = pymrmr.mRMR(data, 'MID', num)
    print(self.val)
    numFeat = len(open(myfilename).readline().split(' '))
    dataset = []
    fr = open(myfilename)
    j = 0
    for line in fr.readlines():
        xi = []
        curline = line.strip().split(' ')
        if j != 0:  # skip the header row
            xi.append((curline[0]))  # keep the first (label) column
            for i in range(1, numFeat):
                ch = '%d' % i
                # keep column i only if mRMR selected its (string) name
                if self.val.count(ch) > 0:
                    xi.append(float(curline[i]))
            dataset.append(xi)
        j = j + 1
    chfilename = filename + 'ch'
    # print(np.array(dataset))
    self.saveData(chfilename, np.array(dataset))
def fit(self, X, y):
    """Fit the selector: find the top-k mRMR features and their indexes."""
    print('***** Fitting *****')
    # Normalise inputs (accepts DataFrames or plain arrays).
    X = self.check_df(X)
    y = self.check_df(y)
    # Assemble one DataFrame with 'target' as the first column, as
    # pymrmr expects.
    feat_cols = [f'feat_{i}' for i in range(X.shape[1])]
    frame = pd.DataFrame(data=X, columns=feat_cols)
    frame = frame.join(pd.Series(y, name='target'))
    frame = frame[['target'] + feat_cols]
    # Run the selection and remember both names and positional indexes.
    self.selected_features = pymrmr.mRMR(frame, self.method,
                                         self.k_features)
    feature_order = frame.drop('target', axis='columns').columns.tolist()
    self.selected_indexes = [feature_order.index(name)
                             for name in self.selected_features]
    return self
def select_n_genes_mRMR(df, num_genes):
    """Return the `num_genes` genes chosen by mRMR (MIQ) from `df`."""
    selected = pymrmr.mRMR(df, 'MIQ', num_genes)
    return selected
# import lightgbm as lgb
import pandas as pd
import pymrmr  # author's note: this workflow was pinned to Python 3.6
import warnings

warnings.filterwarnings("ignore")

# Load train/test features and labels (first 50 rows only, for speed).
train = pd.read_csv('../data/feature_filter_60.csv', nrows=50)
y = pd.read_csv('../data/label_5.csv', nrows=50)
train_x = train
train_y = y
test_x = pd.read_csv('../test/feature_filter_60.csv', nrows=50)
test_y = pd.read_csv("../test/label_5_all.csv", nrows=50)
print(train_x.shape, test_x.shape)

# Stack train+test features, drop the orientation columns, and cast to
# int32 (pymrmr requires discrete integer data).
data = pd.DataFrame(
    pd.concat((train_x, test_x)).drop(
        ['o_x.26', 'o_y.26', 'o_z.26', 'yaw.26', 'o_w.26'],
        axis=1)).astype('int32')
label = pd.concat((train_y, test_y), axis=0)
print(data.shape, label.shape)

# pymrmr expects the label as the first column — that is why df exists.
df = pd.concat((label, data), axis=1)
print(df.shape)

# BUG FIX: mRMR was called on `data` (features only) even though `df`
# (label first) was built for exactly this purpose; and the result is a
# plain Python list, so printing `.shape` raised AttributeError.
res = pymrmr.mRMR(df, 'MIQ', 500)
print(len(res))
y[1::2] = 0 # split set into training & test sets X, X_test, y, y_test = train_test_split(X, y, test_size=0.3, random_state=12345) nfeat_v = [10, 25, 50, 75, 100, 200] # ############################################################################# # Classification and ROC analysis print('%6s\t %6s\t %6s\t %6s\t %6s' % ('dset', 'nfeat', 'mean', 'stdev', 'pval')) feat_selector = mifs.MutualInformationFeatureSelector() for nfeat in nfeat_v: ind = pymrmr.mRMR(X, 'MIQ', 10) X_new = X[:, ind] # Run classifier with cross-validation and plot ROC curves cv = StratifiedKFold(n_splits=10, random_state=12345) classifier = LogisticRegression(C=1e15) tprs = [] aucs = [] aucs_tr = [] mean_fpr = np.linspace(0, 1, 100) i = 0 for train, test in cv.split(X_new, y): probas_ = classifier.fit(X_new[train], y[train]).predict_proba(X_new[test]) # Compute ROC curve and area the curve
#Random shuffle the rows of combined dataset outlierless_dt = outlierless_dt.sample(frac = 1,random_state=1).reset_index(drop=True) outlierless_dt.shape """**Feature Selection** * Select features using MrMr algorithm """ nsp = outlierless_dt[['NSP']] feature_dt = outlierless_dt.drop(['NSP'], axis = 1) feature_dt.insert(0, 'NSP', nsp) #For pymrmr module to select features the target column should be the first column in the dataframe feature_dt.head() feature_dt.shape mrmr_features = pymrmr.mRMR(feature_dt, 'MIQ', 10) type(mrmr_features) print(mrmr_features) final_dt = outlierless_dt[mrmr_features] final_dt.insert(0, 'NSP', nsp) final_dt.shape #final_dt contains the feature selected data """**Feature Engineering** 1. combine similar features 2. Extract PCA features **Start Analysis** **Support Vector Machine**
def get_maxrel_feature(dataframe, num_features, mode="MIQ"):
    """Return mRMR-selected column names converted to integer indices.

    Assumes the dataframe's feature columns are named by stringified
    integer positions.
    """
    selected_names = pymrmr.mRMR(dataframe, mode, num_features)
    return [int(name) for name in selected_names]
def bench(self, X, X_norm, y, n=2):
    """Benchmark several feature-selection methods on (X, X_norm, y).

    For each method this records: its name, wall-clock time, the selected
    feature names, whether it is supervised, and the downstream score
    from self.train_real_data(selected, X) under the self.test_att key.
    `n` is the cluster count used by UDFS.  Returns the dict of lists.

    NOTE(review): the CFS and FCBF sections are intentionally commented
    out (FCBF's placeholder entries included).
    """
    num_feats = 20
    output_data = {'method': list(), 'features': list(), 'time': list(),
                   self.test_att: list(), 'supervised': list()}
    # ----------------------------------------------------------------
    # CFS (disabled)
    # start = time.perf_counter()
    # idx = cfs(X_norm.to_numpy(), y.to_numpy())[0]
    # print(idx)
    # selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    # output_data['method'].append('CFS')
    # output_data['time'].append(time.perf_counter() - start)
    # output_data['features'].append(selected_features)
    # output_data[self.test_att].append(self.train_real_data(selected_features, X))

    # LA: Laplacian Score (unsupervised)
    start = time.perf_counter()
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn",
                "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X_norm.to_numpy(), **kwargs_W)
    score = lap_score.lap_score(X_norm.to_numpy(), W=W)
    idx = lap_score.feature_ranking(score)
    selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    output_data['method'].append('Laplacian Score')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(selected_features)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(selected_features, X))
    print(output_data)

    # FCBF: Feature correlation based filter (disabled)
    # start = time.perf_counter()
    # idx = fcbf(X_norm.to_numpy(), y.to_numpy(), n_selected_features=num_feats)[0]
    # selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    # output_data['method'].append('FCBF')
    # output_data['time'].append(time.perf_counter() - start)
    # output_data['features'].append(selected_features)
    # output_data['supervised'].append(True)
    # output_data[self.test_att].append(self.train_real_data(selected_features, X))
    # print(output_data)
    # output_data['method'].append('FCBF')
    # output_data['time'].append(9999999)
    # output_data['features'].append([])
    # output_data['supervised'].append(True)
    # output_data[self.test_att].append(0.0)

    # UDFS: Unsupervised Discriminative Feature Selection
    start = time.perf_counter()
    Weight = udfs(X_norm.to_numpy(), gamma=0.1, n_clusters=n)
    idx = feature_ranking(Weight)
    selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    output_data['method'].append('UDFS')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(selected_features)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(selected_features, X))
    print(output_data)

    # SPEC: Spectral Feature Selection
    start = time.perf_counter()
    score = spec(X_norm.to_numpy())
    idx = feature_ranking_spec(score)
    selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    output_data['method'].append('SPEC')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(selected_features)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(selected_features, X))
    print(output_data)

    # mRMR (MIQ): minimum redundancy maximum relevance
    start = time.perf_counter()
    mrmr = pymrmr.mRMR(X_norm, 'MIQ', num_feats)
    output_data['method'].append('MRMR(MIQ)')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(mrmr)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(mrmr, X))
    print(output_data)

    # mRMR (MID)
    start = time.perf_counter()
    mrmr = pymrmr.mRMR(X_norm, 'MID', num_feats)
    output_data['method'].append('MRMR(MID)')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(mrmr)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(mrmr, X))
    print(output_data)

    # RFE: recursive feature elimination (supervised)
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    rfe_selector = RFE(estimator=LogisticRegression(),
                       n_features_to_select=num_feats, step=10, verbose=5)
    start = time.perf_counter()
    rfe_selector.fit(X_norm, y)
    rfe_support = rfe_selector.get_support()
    rfe_feature = X_norm.loc[:, rfe_support].columns.tolist()
    output_data['method'].append('RFE')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(rfe_feature)
    output_data['supervised'].append(True)
    output_data[self.test_att].append(self.train_real_data(rfe_feature, X))
    print(output_data)
    # ----------------------------------------------------------------
    # Lasso (L1 logistic regression) via SelectFromModel
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegression
    embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"),
                                          max_features=num_feats)
    start = time.perf_counter()
    embeded_lr_selector.fit(X_norm, y)
    embeded_lr_support = embeded_lr_selector.get_support()
    embeded_lr_feature = X_norm.loc[:, embeded_lr_support].columns.tolist()
    output_data['method'].append('Lasso')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(embeded_lr_feature)
    output_data['supervised'].append(True)
    output_data[self.test_att].append(self.train_real_data(embeded_lr_feature, X))
    print(output_data)
    print(str(len(embeded_lr_feature)), 'selected features')
    # -----------------------------------------------------------------------------
    # Tree-based (random forest) via SelectFromModel
    from sklearn.feature_selection import SelectFromModel
    from sklearn.ensemble import RandomForestClassifier
    embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100),
                                          max_features=num_feats)
    start = time.perf_counter()
    embeded_rf_selector.fit(X_norm, y)
    embeded_rf_support = embeded_rf_selector.get_support()
    embeded_rf_feature = X_norm.loc[:, embeded_rf_support].columns.tolist()
    output_data['method'].append('Tree_Based_RF')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(embeded_rf_feature)
    output_data['supervised'].append(True)
    output_data[self.test_att].append(self.train_real_data(embeded_rf_feature, X))
    print(output_data)
    print(str(len(embeded_rf_feature)), 'selected features')
    # -------------------------------------------------------------------------------
    # Tree-based (LightGBM) via SelectFromModel
    from sklearn.feature_selection import SelectFromModel
    from lightgbm import LGBMClassifier
    lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05,
                          num_leaves=32, colsample_bytree=0.2, reg_alpha=3,
                          reg_lambda=1, min_split_gain=0.01,
                          min_child_weight=40)
    embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
    start = time.perf_counter()
    embeded_lgb_selector.fit(X_norm, y)
    embeded_lgb_support = embeded_lgb_selector.get_support()
    embeded_lgb_feature = X_norm.loc[:, embeded_lgb_support].columns.tolist()
    output_data['method'].append('Tree_Based_lightGBM')
    output_data['time'].append(time.perf_counter() - start)
    # NOTE(review): this section appends 'supervised' before 'features',
    # unlike the others — order within the dict of lists is harmless.
    output_data['supervised'].append(True)
    output_data['features'].append(embeded_lgb_feature)
    output_data[self.test_att].append(self.train_real_data(embeded_lgb_feature, X))
    print(output_data)
    print(str(len(embeded_lgb_feature)), 'selected features')
    return output_data
#x.remove(y) ##GBR #from h2o.estimators.gbm import H2OGradientBoostingEstimator #gbm = H2OGradientBoostingEstimator() #gbm.train(x=x, y=y, training_frame=train, validation_frame=valid) #y_pred=gbm.predict(test_data) ##gbm.cross_validation_models() ##gbm.cross_validation_metrics_summary() #gbm.varimp_plot() #gbm.varimp() #gbm.mse(train=True, valid=True, xval=False) #gbm.r2(train=True, valid=True, xval=False) result = pd.concat([y_train, train_data], axis=1) columns = pymrmr.mRMR(data, 'MIQ', 10) print(columns) new_data = [] new_data = pd.DataFrame(data=new_data) new_data_test = [] new_data_test = pd.DataFrame(data=new_data_test) for i in columns: new_data_test = pd.concat([new_data_test, data[i]], axis=1) for i in columns: new_data = pd.concat([new_data, data[i]], axis=1)
plt.ioff()
# NOTE(review): fragment — `feat` on the next line presumably comes from
# an enclosing loop over featMatAll2.columns outside this view.
bin_cutoff[feat] = np.histogram(featMatAll2[feat], bins='fd')[1]

#use these to create bins for the cutting up the data - can input into pandas cut
cat = pd.DataFrame()
for feat in featMatAll2.columns:
    cat[feat] = pd.cut(featMatAll2[feat], bins=bin_cutoff[feat], \
                       labels=np.arange(1, len(bin_cutoff[feat])),
                       include_lowest=True)

#make ints (pymrmr needs discrete integer data)
cat2 = pd.DataFrame(data=np.array(cat.values), dtype=int,
                    columns=cat.columns)
#add in info about rows
cat.insert(0, column='drug', value=featMatAll['drug'],
           allow_duplicates=True)
#select 150 features using mRMR
mrFeatsA = pymrmr.mRMR(cat2, 'MID', 150)
#export these features as txt tile
out = open(os.path.join(directoryA[:-7], 'mRMR_featsAgar.txt'), 'w')
out.writelines(["%s\n" % item for item in mrFeatsA])
out.close()

#so this is the mRMR selected feature set.
mrFeatMatAll = pd.concat([featMatAll[mrFeatsA], featMatAll.iloc[:, -3:]],
                         axis=1)

#%%
# We want to show that this feature set performs better than a random set of features
#so use sss split again to pick out 150 features randomly, and do LDA (10CV'd)
#need several loops to do this one
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 10 10:04:56 2019

@author: Arif Shahriar 15201002

Loads Pymrmr_data.csv, drops identifier columns, and prints the top 40
features selected by mRMR (MID).
"""
import numpy as np
import pandas as pd
import pymrmr

df = pd.read_csv("Pymrmr_data.csv")
# Drop identifier/non-feature columns before selection.
df = df.drop(columns=['Timestamp', 'Rehab', 'ID'])
# BUG FIX: print(df.head) printed the bound method object; call it to
# actually show the first rows.
print(df.head())

featureName = pymrmr.mRMR(df, 'MID', 40)
print("Number of Features is", len(featureName))
print(featureName)