def calcular_mi(variable, df):
    """Compute Mutual Information scores for the independent variables.

    Uses sklearn.feature_selection.mutual_info_regression on a fixed set of
    predictor columns. FUNCTION ONLY VALID FOR THE DATAFRAME OF THE ORIGINAL
    WORK (the column names below are hard-coded).

    Parameters:
        variable (str): Target column the MI is computed against.
        df (pandas.DataFrame): Input dataframe.

    Returns:
        pandas.Series: MI scores indexed by predictor name, descending.
    """
    predictors = [
        'Industry Sector', 'Application Group', 'Development Type',
        'Development Platform', 'Language Type',
        'Primary Programming Language', 'Functional Size',
        'Adjusted Function Points', 'Project Elapsed Time',
        '1st Data Base System', 'Used Methodology',
    ]
    features = df.loc[:, predictors]
    target = df.loc[:, variable].values
    scores = mutual_info_regression(features, target, n_neighbors=1)
    result = pd.Series(scores, index=features.columns)
    return result.sort_values(ascending=False)
def get_collinear_cols_intradf(df1, df2, threshold=0.6, method="pearson", verbose=True):
    """Find the columns whose values in ``df1`` and ``df2`` are strongly associated.

    For every feature column shared by the two dataframes (id/label columns
    excluded), an association score between df1[col] and df2[col] is computed
    with the requested ``method``; columns scoring above ``threshold`` are
    returned.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Must contain the same feature columns.
    threshold : float
        Minimum association score for a column to be selected.
    method : str
        One of "pearson", "kendall", "spearman", "mi_classif", "mi_regression".
    verbose : bool
        Print per-column scores and a final summary.

    Returns
    -------
    list of str
        The selected column names.
    """
    from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
    columns1 = list(set(df1.columns) - set(["id", "user_id", "label", "target"]))
    columns2 = list(set(df2.columns) - set(["id", "user_id", "label", "target"]))
    assert set(columns1) == set(columns2), "the columns in the two dataframes must be identical"
    # Fix: validate `method` once before the loop — it is loop-invariant, so
    # re-asserting per column was pure overhead.
    ms = ["pearson", "kendall", "spearman", "mi_classif", "mi_regression"]
    assert method in ms, "method should in {}".format(ms)
    df1 = df1[columns1]
    df2 = df2[columns2]
    cols_to_select = []
    for col in columns1:
        cor = 0.0
        if method in ["pearson", "kendall", "spearman"]:
            cor = df1[col].corr(df2[col], method=method)
        elif method == "mi_classif":
            cor = np.reshape(mutual_info_classif(df1[[col]].values, df2[col].values), -1).tolist()[0]
        elif method == "mi_regression":
            cor = np.reshape(mutual_info_regression(df1[[col]].values, df2[col].values), -1).tolist()[0]
        if cor > threshold:
            cols_to_select.append(col)
        if verbose:
            print("column {}: {} {}".format(col, method, cor))
    if verbose:
        print('Number of columns gt threshold {}: {}, out of {} columns : '.format(threshold, len(cols_to_select), len(columns1)))
    return cols_to_select
def get_mi_estimate(K, m, n, r, l, SNR, H, A, filename):
    """Estimate the per-user mutual information of a trained autoencoder link.

    Loads the model from ``filename``, generates L1 batches of L2 samples,
    records the transmitted signal and the receiver output for each of the K
    users, and estimates I(X; Y) per user with mutual_info_regression.

    Returns
    -------
    MI : ndarray of shape (K,)
        Estimated mutual information per user.
    """
    TxNet, RxNet, Normalize, V, U, R = ae_load_model(K, filename, m, n, r, l, SNR, H, A)
    L1 = 10    # number of batches
    L2 = 1000  # samples per batch
    X_data = np.zeros((L1 * L2, K))
    Y_data = np.zeros((L1 * L2, K))
    MI = np.zeros(K)
    for j in range(L1):
        _, SignalIn, _, Noise = prepare_data(K, m, n, r, V, U, R, SNR, L2)
        _, Rx = get_output(TxNet, RxNet, Normalize, K, SignalIn, Noise, H)
        for k in range(K):
            # Hoist the array conversions out of the sample loop (they were
            # recomputed L2 times per user).
            sig = np.array(SignalIn[k])
            rx = np.array(Rx[k])
            for t1 in range(L2):
                # Bug fix: index with L2 instead of the magic constant 1000,
                # so changing the batch size cannot corrupt the layout.
                X_data[L2 * j + t1, k] = sig[t1, 0]
                Y_data[L2 * j + t1, k] = rx[t1, 0]
    for k in range(K):
        X = X_data[:, k].reshape(-1, 1)
        Y = np.squeeze(Y_data[:, k])
        MI[k] = mutual_info_regression(X, Y)
    return MI
def select_data(df_fewNA, num_features=50):
    """Write selected_variables.txt with the names of the most important
    features, determined with the mutual-information algorithm.

    Parameters
    ----------
    df_fewNA: Output of read_database.clean_data.
    num_features: Maximum number of important variables to output.

    Returns
    -------
    selected_variables: Dataframe with the name of all important features and
        its weight importance.
    """
    # Feature selection: score every covariate against the residuals.
    covs = df_fewNA.drop(["NY_GDP_MKTP_KD_ZG", "residuals"], axis=1)
    Y = df_fewNA[['residuals']]
    info = mutual_info_regression(covs, np.ravel(Y))
    df_varimp = pd.DataFrame(data={'name': covs.columns, 'varimp': info})

    # Bug fix: honour the num_features parameter — the slice was hard-coded
    # to [0:49], which ignored the argument and kept one row fewer than the
    # documented 50.
    selected_variables = df_varimp.sort_values(
        by="varimp", ascending=False)[:num_features]
    selected_variables['name'] = selected_variables['name'].str.replace(
        '_', '.')
    # NOTE: mode='a' appends on repeated runs — kept from the original.
    selected_variables['name'].to_csv(
        path_or_buf='./utils/selected_variables.txt', header=True,
        index=None, sep='\t', mode='a')
    return selected_variables
def select_feature(x_train, x_test, y_train):
    """Filter features by mutual information with the target.

    Reduces the existing >10,000 features to the subset whose mutual
    information with ``y_train`` exceeds 0.1. Based on experience with
    feature selection in homework 1, this is not expected to improve
    performance, only run time.

    Run times without feature selection:
        GPA: 320.58s, Grit: 280.71, Hardship: 288.05, gpa: 37.22

    Parameters
    ----------
    x_train, x_test : ndarray
        Feature matrices; the same columns are kept in both.
    y_train : array-like
        Regression target.

    Returns
    -------
    (x_train, x_test, scores)
        Filtered matrices and the full list of MI scores.
    """
    # Mutual information criteria for every feature.
    # Fix: removed the dead `MIC = []` that was immediately overwritten.
    MIC = feature_selection.mutual_info_regression(x_train, y_train)
    good_features = []
    scores = []
    for k, score in enumerate(MIC):
        scores.append(score)
        if score > 0.1:  # criterion for deciding a feature should be included
            good_features.append(k)
    # Adapt the training and testing matrices to the good features.
    x_train = x_train[:, good_features]
    x_test = x_test[:, good_features]
    print(len(good_features))
    return x_train, x_test, scores
def test_mir(fn_in, fn_out='select_features_mir.tsv'):
    """Write pairwise mutual-information scores between all columns to a TSV.

    For every column of the dataframe loaded from ``fn_in``, the MI of all
    columns against it is estimated, normalised by its maximum, and written
    as one tab-separated line of ``fn_out`` (index, scores..., column name).
    """
    print('\nselect_features test_mir:')
    df = utl.create_df(fn_in)
    data = df.values.astype('float32')
    col_names = list(df)
    X = data
    n_cols = X.shape[1]
    with open(fn_out, 'w') as fout:
        for col in range(n_cols):
            print(n_cols, col, col_names[col])
            target = data[:, col]
            scores = mutual_info_regression(X, target, n_neighbors=7)
            scores /= np.max(scores)
            fields = [str(format(v, '.3f')) for v in scores.tolist()]
            line = str(col) + '\t' + '\t'.join(fields) + '\t' + col_names[col]
            print(line)
            fout.write(line + '\n')
    return
def make_mi_scores(X, y, discrete_features):
    """Return the MI score of each column of X against y, sorted descending."""
    raw = mutual_info_regression(X, y, discrete_features=discrete_features)
    series = pd.Series(raw, name="MI Scores", index=X.columns)
    return series.sort_values(ascending=False)
def mutual_information(self):
    """Calculate the mutual information for each distribution and store it
    on the global attribute ``self.I``.

    For every traced variable, the MI between the variable ("alice") and the
    observable output is estimated with mutual_info_regression and grouped by
    the distribution that generated it.

    Returns
    -------
    List[float, List[String,]] - A list of the mutual information paired with
    its respective distribution (one dict per variable, keyed by
    distribution name, values are lists of (mi, info) tuples).
    """
    _, names, _ = self.traces[0]
    size = len(names)
    mutual_information = [{} for _ in range(size)]
    for i in range(size):
        for trace, names, info in self.traces:
            # Integer-typed variables are treated as discrete features.
            discrete = ("int" in str(names[i]))
            alice = trace[names[i]]
            try:
                output = trace["Output"]
            except KeyError:
                # Bug fix: the original bare `except:` also swallowed
                # unrelated errors (KeyboardInterrupt included); only a
                # missing "Output" key should trigger the positional
                # fallback below.
                pos = int(str(names[i]).split("_")[-1])
                if pos < 10:
                    output = trace[f"Output_{pos}"]
                else:
                    continue
            I_ao = mutual_info_regression([[j] for j in alice], output,
                                          discrete_features=discrete)[0]
            # Unwrap the inner information in case of subtypes such as
            # List[List[Tuple[...]]].
            while (len(info) == 1):
                info = info[0]
            if isinstance(info, tuple) or (isinstance(info, list) and isinstance(info[0], list)):
                info = info[i]
            if info[0] in mutual_information[i]:
                mutual_information[i][info[0]].append((I_ao, info))
            else:
                mutual_information[i][info[0]] = [(I_ao, info)]
    self.I = mutual_information
    return mutual_information
def miregression_selected(self, x, y):
    """Score every feature of ``x`` by mutual information with ``y``.

    Parameters
    ----------
    x: 2-D DataFrame of features
    y: 1-D Series dependent variable

    Returns
    -------
    list of (mi, column_name) tuples, sorted by MI descending.

    Raises
    ------
    ValueError
        If ``self.api`` is neither 'sklearn' nor 'self'.
    """
    feature_names = list(x.columns)
    ranked = []
    if self.api == "sklearn":
        scores = mutual_info_regression(x, y)
        ranked = [(score, name) for score, name in zip(scores, feature_names)]
    elif self.api == "self":
        # Self-implemented estimator not available yet.
        pass
    else:
        raise ValueError("api must be 'sklearn' or 'self' !")
    ranked.sort(reverse=True)
    return ranked
def mutualInformation(ds, threshold=1):
    """Calc mutual information for all features; drop dependent ones from ds.

    For every non-label column, the MI between that column and all remaining
    features is estimated. Columns whose MI with any other feature exceeds
    ``threshold`` are dropped from ``ds`` IN PLACE and their names returned.
    Relies on the module-level globals ``LabelName`` and ``dictFeatures``.
    """
    logging.info("***start mutualInformation")
    features_to_drop = []
    # Iterate over a snapshot of the columns (label excluded); ds itself
    # shrinks inside the loop, so later scores are conditioned on the
    # already-reduced feature set.
    for column in ds.columns.drop(LabelName):
        # dictFeatures presumably lists the continuous (regression-scored)
        # features — TODO confirm against where dictFeatures is defined.
        if (column in dictFeatures.keys()):
            miMat = mutual_info_regression(ds.drop([LabelName, column], axis=1), ds[column],
                                           discrete_features='auto', n_neighbors=3,
                                           copy=True, random_state=None)
        else:
            miMat = mutual_info_classif(ds.drop([LabelName, column], axis=1), ds[column],
                                        discrete_features='auto', n_neighbors=3,
                                        copy=True, random_state=None)
        #print(column, miMat)
        # Drop the column if it is strongly predicted by any other feature.
        if (any(mi > threshold for mi in miMat)):
            ds.drop(column, axis=1, inplace=True)
            features_to_drop.append(column)
    print("mutualInformation:\n", features_to_drop)
    return features_to_drop
def get_filtered_data_frame_columns(df: pd.DataFrame, mrmr=False, features_left_cnt=10):
    """Pick at most ``features_left_cnt`` feature columns of ``df``.

    Column 0 is treated as the target and is always kept. Selection uses
    either pymrmr's MID criterion (when ``mrmr`` is set and few columns need
    removing) or a mutual-information ranking against the target column.
    """
    if features_left_cnt >= len(df.columns) - 1:
        return df.columns
    if mrmr and len(df.columns) - features_left_cnt < 10:
        import pymrmr
        return [df.columns.values[0]] + pymrmr.mRMR(df, 'MID', features_left_cnt)
    values = df.to_numpy()
    scores = feature_selection.mutual_info_regression(values[:, 1:], values[:, 0])
    cutoff = sorted(scores, reverse=True)[features_left_cnt]
    picked = []
    # First take columns strictly above the cutoff score...
    for idx, name in enumerate(df.columns[1:]):
        if len(picked) < features_left_cnt and scores[idx] > cutoff:
            picked.append(name)
    # ...then fill the remaining slots with columns exactly at the cutoff.
    for idx, name in enumerate(df.columns[1:]):
        if len(picked) < features_left_cnt and scores[idx] == cutoff:
            picked.append(name)
    return [df.columns.values[0]] + picked
def printMetrics(self):
    """Print regression metrics plus MI and F-test feature diagnostics.

    Reads self.feature_names, self.x_train/y_train, self.y_test/y_pred and
    self.r2_score; output goes to stdout only.
    """
    print()
    print("=================================================")
    print("=========== METRICS =============================")
    print("Features (put in Pandas df): ", self.feature_names)
    print('Mean squared error: %.2f' % mean_squared_error(self.y_test, self.y_pred))
    print("Explained variance score: ", explained_variance_score(self.y_test, self.y_pred))
    mi_scores = mutual_info_regression(self.x_train, self.y_train)
    mi_scores = mi_scores / np.max(mi_scores)
    # center? (question kept from original author)
    f_stat, p_values = f_regression(self.x_train, self.y_train, center=True)
    print("Mutual Information: ", mi_scores)
    print("f_regression: ", f_stat)
    print("pval: ", p_values)
    print("R2 score: ", self.r2_score)
    print("=========== END METRICS =========================")
    print("=================================================")
    print()
def predict_Cond_Entropy(self, X):
    """Estimate the conditional entropy of every variable given every subset
    of the remaining variables.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_var)

    Returns
    -------
    cond_ent : ndarray of shape (n_var, 2**(n_var - 1))
        cond_ent[Resp, sI] approximates H(X_Resp | X_subset(sI)); column 0
        holds the unconditional entropy H(X_Resp).
    """
    from ..MMI.IC.AIC import TableEntropy
    from ..utils import mseEntropy, varEntropy, unifEntropy, ShannonEntropy
    n_var = X.shape[1]
    numCond = 2**(n_var - 1)  # number of conditioning subsets per response
    cond_ent = np.zeros((n_var, numCond))
    for Resp in range(n_var):
        # Subset index 0 is the empty conditioning set: plain entropy.
        cond_ent[Resp, 0] = ShannonEntropy(X[:, Resp])
        for sI in range(1, numCond):
            # Decode the subset index sI into indices over the n_var - 1
            # non-response variables.
            subset = TableEntropy.subsetVector(n_var - 1, sI)
            subset = np.array(subset)
            cond = []
            for element in subset:
                # Shift indices past the response column so they address
                # the original variable positions.
                if element >= Resp:
                    element += 1
                cond.append(int(element))
            # self.savefig()
            # H(Y | Z) = H(Y) - I(Y; Z)
            cond_ent[Resp, sI] = cond_ent[Resp, 0] - mutual_info_regression(
                X=X[:, cond],
                y=X[:, Resp],
                discrete_features=self.discrete_features,
                n_neighbors=self.n_neighbors,
                random_state=self.random_state)[0]
    return cond_ent
def get_top_1500(filtered_matrix, all_pmi, all_pairs):
    """Rank features by mutual information against the PMI targets.

    Builds the label matrix X (features as columns) and the PMI target
    vector Y, scores every feature with mutual_info_regression, and returns
    (index, score) tuples sorted by score descending, truncated to 4000.

    NOTE(review): despite the name, up to 4000 entries are returned —
    confirm whether the "1500" in the name is stale.

    Returns
    -------
    list of (int, float)
    """
    # Fix: removed the dead `X = []` / `Y = []` initialisers that were
    # immediately overwritten.
    X = np.array(filtered_matrix).transpose()
    # Target vector: PMI value of each pair, in pair order.
    Y = np.array([all_pmi[pair] for pair in all_pairs])
    assert len(X) == len(Y)
    start = time.time()
    mi = mutual_info_regression(X, Y)
    print("time taken for mi")
    print(time.time() - start)
    # Keep (index, score) pairs, skipping NaN scores (string check kept from
    # the original so behaviour is unchanged).
    tupled_mi = [(idx, score) for idx, score in enumerate(mi)
                 if str(score) != 'nan']
    # Sort by the score element of each tuple.
    tupled_mi.sort(reverse=True, key=operator.itemgetter(1))
    if len(tupled_mi) > 4000:
        tupled_mi = tupled_mi[:4000]
    return tupled_mi
def fun_extractERPfeatsMultivar(erpSynch, erpSynchFilt, Fs):
    """Extract multivariate coupling features from L channels.

    erpSynch is a LxN matrix --> L channels with N samples each, from which
    synchrony measures are taken (PLV and correlation). It is detrended and
    normalized first (PLV values are unreliable in some cases if it's not
    detrended). erpSynchFilt is the filtered version of erpSynch, used for
    calculating the PLV.

    Returns a dict with per-channel-pair measures: "combinations", "Corr",
    "CorrCoefs", "MI", "PLV", "PLVphase", "Coh".
    """
    from itertools import combinations
    from sklearn.feature_selection import mutual_info_regression
    # Demean and unit-variance each channel (rows).
    erpSynch = (erpSynch.T - np.mean(erpSynch, axis=1)).T
    erpSynch = (erpSynch.T / np.std(erpSynch, axis=1)).T
    # The filtered signal is only demeaned (used for phase extraction below).
    erpSynchFilt = (erpSynchFilt.T - np.mean(erpSynchFilt, axis=1)).T
    featsOut = dict()
    # for all channel pair combinations
    featsOut["combinations"] = list(combinations(range(erpSynch.shape[0]), 2))
    #featsOut["Corr"] = np.zeros(len(featsOut["combinations"]),erpSynch.shape[1])
    #featsOut["PLV"] = np.zeros(len(featsOut["combinations"]),erpSynch.shape[1])
    # *** Coupling Measures ***
    # Cross-correlation (max value, normalised by the number of samples).
    featsOut["Corr"] = [
        np.max(signal.correlate(erpSynch[ki[0], :], erpSynch[ki[1], :])) /
        erpSynch.shape[1] for ki in featsOut["combinations"]
    ]
    # Full Pearson correlation-coefficient matrix across channels.
    featsOut["CorrCoefs"] = np.corrcoef(erpSynch)
    # Mutual information per channel pair.
    featsOut["MI"] = [
        mutual_info_regression(erpSynch[ki[0], :].reshape(-1, 1),
                               erpSynch[ki[1], :])
        for ki in featsOut["combinations"]
    ]
    # PLV: phase-locking value from the Hilbert-transform instantaneous phase
    # of the filtered signal.
    phases = np.array([
        np.angle(signal.hilbert(erpSynchFilt[ki, :]))
        for ki in range(erpSynchFilt.shape[0])
    ])
    featsOut["PLV"] = [
        np.abs(
            np.sum(np.exp(1j * (phases[ki[0]] - phases[ki[1]]))) /
            phases.shape[1]) for ki in featsOut["combinations"]
    ]
    # Mean unwrapped phase difference per pair.
    featsOut["PLVphase"] = [
        np.mean(np.unwrap(phases[ki[0]] - phases[ki[1]], axis=0))
        for ki in featsOut["combinations"]
    ]
    # Coherence, averaged over the lowest frequency bins (indices 0..10 of
    # the 128-sample Welch segments).
    featsOut["Coh"] = np.zeros([len(featsOut["combinations"])])
    iind = 0
    for ki in featsOut["combinations"]:
        Wxy, Cxy = signal.coherence(erpSynch[ki[0], :],
                                    erpSynch[ki[1], :],
                                    Fs,
                                    nperseg=128)
        featsOut["Coh"][iind] = np.mean(Cxy[0:11])
        iind += 1
    return featsOut
def nmi(X, y):
    """Normalized mutual information between X and y.

    Each column of X is scored against y and the scores are scaled so the
    largest equals 1.

    :param X: feature matrix, shape (n_samples, n_features)
    :param y: target vector, shape (n_samples,)
    """
    scores = mutual_info_regression(X, y)
    return scores / scores.max()
def get_params(self, X, y):
    """Build the similarity matrix Q and relevance vector b for the solver.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    y : ndarray of shape (n_samples,) or (n_samples, r)

    Sets self.m, self.n, self.r, self.Q (feature-feature similarity),
    self.b (feature-target relevance, column vector) and self.lamb_min.
    """
    self.m, self.n = X.shape
    self.r = y.shape[1] if len(y.shape) > 1 else 1
    if self.sim == 'corr':
        self.Q = np.abs(get_corr_matrix(X, fill=1))
        self.b = np.sum(np.abs(get_corr_matrix(X, y)), axis=1)[:, np.newaxis]
    elif self.sim == 'info':
        self.Q = np.ones([X.shape[1], X.shape[1]])
        self.b = np.zeros((X.shape[1], 1))
        # Bug fix: `n_features` was undefined (NameError); the loop runs
        # over the feature count self.n == X.shape[1].
        for j in range(self.n):
            self.Q[:, j] = sklfs.mutual_info_regression(X, X[:, j])
        if len(y.shape) == 1:
            self.b = sklfs.mutual_info_regression(X, y)[:, np.newaxis]
        else:
            # Bug fix: iterate the r target COLUMNS (y.T), not the sample
            # rows, and keep the MI vector as a column so the in-place
            # accumulation into the (n, 1) buffer is shape-compatible.
            for y_ in y.T:
                self.b += sklfs.mutual_info_regression(X, y_)[:, np.newaxis]
    self.Q, self.lamb_min = shift_spectrum(self.Q)
def getMI(df, LABEL):
    """Return a DataFrame of normalised MI scores of each feature vs LABEL.

    All columns except the last are treated as features; scores are scaled
    so that the strongest feature has MI == 1.
    """
    features = df.iloc[:, 0:-1]
    target = df[LABEL].values.flatten()
    scores = mutual_info_regression(features, target)
    scores /= np.max(scores)
    result = pd.DataFrame({'feature': features.columns.values})
    return result.assign(MI=scores)
def run_chrom(chrom, list_of_list, atac_chunk, rna_counts, inboth, masterdict):
    """Compute MI and correlation between ATAC peaks and RNA counts per gene.

    For each gene on ``chrom``, the cells non-zero in both modalities are
    selected; genes with fewer than 3 such cells are recorded as '-'.
    Appends [mis, rhos] to ``masterdict[chrom]``.

    NOTE(review): the ``inboth`` parameter is overwritten inside the loop —
    confirm whether it is still needed in the signature.
    """
    mis = []
    rhos = []
    masterdict[chrom] = []
    for k in range(0, len(list_of_list[0])):
        print(' ')
        print(k)
        gene_idx = list_of_list[3][k][0]
        idxs = list_of_list[2][k]
        nonzero_rna = np.argwhere(rna_counts[:, gene_idx].todense() != 0)
        nonzero_atac = np.argwhere(atac_chunk[:, idxs].todense() != 0)
        print(np.size(nonzero_rna))
        print(np.size(nonzero_atac))
        inboth = np.intersect1d(nonzero_atac, nonzero_rna)
        # Minimum number of co-expressing cells required for the estimate.
        if np.size(inboth) < 3:
            print(np.size(inboth))
            print("Too few for a gene")
            mis.append('-')
            rhos.append('-')
            continue
        try:
            mi = mutual_info_regression(atac_chunk[inboth, idxs].todense(),
                                        rna_counts[inboth, gene_idx].reshape(-1, 1),
                                        random_state=2, n_neighbors=3,
                                        discrete_features=False)
        except Exception:
            # Best effort: fall back to zero MI when the estimator fails.
            print("went here")
            mi = 0
        mis.append(mi)
        # Bug fix: np.hstack was called with two positional array arguments
        # (TypeError — it takes a single tuple), and the variables must be
        # stacked as ROWS for corrcoef's rowvar convention, i.e. vstack:
        # last row is the gene, earlier rows are the peaks, so [-1, :] is
        # the gene's correlation with each peak.
        rho = np.corrcoef(
            np.vstack((atac_chunk[inboth, idxs].todense().transpose(),
                       rna_counts[inboth, gene_idx].reshape(-1, 1).transpose())))[-1, :]
        rhos.append(rho)
        # n = np.size(atac_chunk[inboth, idxs].todense().transpose())
        # adjusted_rho = rho * hyp2f1(1 / 2, 1 / 2, (n - 1) / 2, 1 - rho ** 2)
        # rhos.append(adjusted_rho)
        print("GOOD GENE!")
    masterdict[chrom].append([mis, rhos])
    return
def feature_importance_regression(features, target, n_neighbors=3,
                                  random_state=None):
    """Score feature importance for a regression target.

    Continuous (float) features get Pearson r (+ p-value) and mutual
    information; discrete (int/bool) features get a one-way ANOVA F-test
    (+ p-value) and mutual information.

    Parameters
    ----------
    features : pandas.DataFrame
    target : array-like aligned with ``features``
    n_neighbors : int
        Passed through to mutual_info_regression.
    random_state : int or None

    Returns
    -------
    (cont_imp, disc_imp) : tuple of pandas.DataFrame
        Importance tables indexed by feature name.
    """
    cont = features.select_dtypes(include=[np.floating])
    # Bug fix: `np.bool` was removed in NumPy 1.24 — use the builtin `bool`.
    disc = features.select_dtypes(include=[np.integer, bool])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:
        # Pearson correlation.
        # Bug fix: `DataFrame.iteritems` was removed in pandas 2.0 — `items`.
        pearson = np.array([stats.pearsonr(feature, target)
                            for _, feature in cont.items()])
        cont_imp['pearson_r'] = pearson[:, 0]
        cont_imp['pearson_r_p_value'] = pearson[:, 1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_regression(
            cont, target, discrete_features=False,
            n_neighbors=n_neighbors, random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:
        # One-way ANOVA F-test per feature.
        f_tests = defaultdict(dict)
        for feature in disc.columns:
            groups = [target[idxs]
                      for idxs in disc.groupby(feature).groups.values()]
            statistic, p_value = stats.f_oneway(*groups)
            f_tests[feature]['f_statistic'] = statistic
            f_tests[feature]['f_p_value'] = p_value
        f_tests_df = pd.DataFrame.from_dict(f_tests, orient='index')
        disc_imp['f_statistic'] = f_tests_df['f_statistic']
        disc_imp['f_p_value'] = f_tests_df['f_p_value']

        # Mutual information
        mut_inf = feature_selection.mutual_info_regression(
            disc, target, discrete_features=True,
            n_neighbors=n_neighbors, random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp
def mrmrfilter(data, label, num):
    """Select ``num`` features greedily with an mRMR-style criterion
    (maximum relevance to the label, minimum redundancy between features).

    Parameters
    ----------
    data : pandas.DataFrame of shape (n_samples, n_features)
    label : pandas.DataFrame with the target in its first column
    num : int, number of features to keep

    Returns
    -------
    numpy.ndarray of shape (n_samples, num) with the selected columns.
    """
    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import mutual_info_regression
    np.random.seed(0)
    D1 = data.shape[0]  # number of samples
    D2 = data.shape[1]  # number of features
    # get the mutual information within features (upper triangle first)
    MI_ff = np.zeros((D2, D2))
    for i in range(D2):
        print('complete {} part of lines'.format(i / D2))
        for j in np.arange(i, D2):
            MI_ff[i, j] = mutual_info_regression(
                data.values[:, i].reshape(D1, 1),
                data.values[:, j].reshape(D1, 1),
                n_neighbors=5)
    # mirror the upper triangle so MI_ff is symmetric
    for i in range(D2 - 1):
        for j in np.arange(i, D2):
            MI_ff[j, i] = MI_ff[i, j]
    # get the mutual information between features and label
    MI_fl = mutual_info_regression(data.values[:, :].reshape(D1, D2),
                                   label.values[:, 0].reshape(D1, 1),
                                   n_neighbors=5)
    # greedily pick the best `num` features
    # a 1 at position i marks feature i as already selected
    candidate = np.zeros((D2, 1))
    for i in range(num):
        if i == 0:
            # seed with the single most label-relevant feature
            k = (np.where(MI_fl == np.max(MI_fl)))[0]
            candidate[k] = 1
        else:
            mrmr = np.zeros((D2, 1))
            for j in range(D2):
                # relevance minus average redundancy;
                # 1000*candidate[j] is a penalty term that blocks re-selection
                mrmr[j] = (1 - candidate[j]) * (MI_fl[j] / i - np.sum(np.dot(MI_ff[j, :], candidate) / i**2)) - 1000 * candidate[j]
            k = (np.where(mrmr == np.max(mrmr, axis=0)))[0]
            candidate[k] = 1
    # materialise the selected columns in original column order
    data_num = np.zeros((D1, num))
    count = 0
    for i in range(D2):
        if candidate[i] == 1:
            data_num[:, count] = data.iloc[:, i]
            count += 1
    return data_num
def loadscaledata(print_mi=False):
    """Load the dataset, standardise X and Y, optionally print MI diagnostics.

    Returns (x_all, y_all, Xscaler, Yscaler), where x_all/y_all are already
    transformed by the fitted StandardScalers.
    """
    x_all, y_all = loaddata()
    Xscaler = preprocessing.StandardScaler(copy=False).fit(x_all)
    Yscaler = preprocessing.StandardScaler(copy=False).fit(y_all)
    x_all = Xscaler.transform(x_all)
    y_all = Yscaler.transform(y_all)
    if print_mi:
        # Normalised MI of the features against each of the two targets.
        for col, tag in ((0, 'mi for tof time\t'), (1, 'mi for y_position')):
            scores = mutual_info_regression(x_all, y_all[:, col])
            scores /= np.max(scores)
            print(tag, scores)
    return x_all, y_all, Xscaler, Yscaler
def plot_regression_categorical(X, target_col, types=None, **kwargs):
    """Plots for categorical features in regression.

    Creates box plots of target distribution for important categorical
    features. Relevant features are identified using mutual information.

    For high cardinality categorical variables (variables with many
    categories) only the most frequent categories are shown.

    Parameters
    ----------
    X : dataframe
        Input data including features and target
    target_col : str or int
        Identifier of the target column in X
    types : dataframe of types, optional.
        Output of detect_types on X. Can be used to avoid recomputing the
        types.
    """
    types = _check_X_target_col(X, target_col, types, task="regression")
    if types is None:
        types = detect_types(X)
    features = X.loc[:, types.categorical]
    if target_col in features.columns:
        features = features.drop(target_col, axis=1)
    if features.shape[1] == 0:
        return
    features = features.astype('category')
    show_top = _get_n_top(features, "categorical")

    # can't use OrdinalEncoder because we might have mix of int and string
    ordinal_encoded = features.apply(lambda x: x.cat.codes)
    target = X[target_col]
    # Bug fix: the discrete_features mask must match the encoded feature
    # matrix, not X (which still contains the target and the non-categorical
    # columns) — the original mask length could disagree with the data.
    f = mutual_info_regression(
        ordinal_encoded, target,
        discrete_features=np.ones(ordinal_encoded.shape[1], dtype=bool))
    top_k = np.argsort(f)[-show_top:][::-1]

    # large number of categories -> taller plot
    row_height = 3 if X.nunique().max() <= 5 else 5
    fig, axes = _make_subplots(n_plots=show_top, row_height=row_height)
    plt.suptitle("Categorical Feature vs Target")
    for i, (col_ind, ax) in enumerate(zip(top_k, axes.ravel())):
        # Bug fix: plot the MI-ranked column (col_ind), not the i-th column
        # in dataframe order, so each box plot matches the score shown in
        # its title.
        col = features.columns[col_ind]
        X_new = _prune_category_make_X(X, col, target_col)
        medians = X_new.groupby(col)[target_col].median()
        order = medians.sort_values().index
        sns.boxplot(x=target_col, y=col, data=X_new, order=order, ax=ax)
        ax.set_title("F={:.2E}".format(f[col_ind]))
        # shorten long ticks and labels
        _short_tick_names(ax)

    for j in range(i + 1, axes.size):
        # turn off axis if we didn't fill last row
        axes.ravel()[j].set_axis_off()
def _mutual_info(reference, query):
    """Estimate MI of every query column against all reference columns.

    Note: mutual information is not symmetric. The vectors need no
    normalisation — the estimate is invariant under scaling.

    Returns an array of shape (n_query_cols, n_reference_cols).
    """
    from sklearn.feature_selection import mutual_info_regression
    mi_matrix = np.zeros((query.shape[1], reference.shape[1]))
    for row, column in enumerate(query.T):
        mi_matrix[row, :] = mutual_info_regression(reference, column)
    return mi_matrix
def calc_lowf_mutual_info(data, channels):
    """Average high-frequency-bin MI between four feature blocks and channels.

    For every channel, the mutual information against each of the four
    feature blocks (DIFF_REAL, DIFF_IMAG, ABS_REAL, ABS_IMAG) is estimated
    and averaged over bins 6 onward.

    Refactor: the original duplicated the compute/average code four times
    and re-initialised the averaging lists redundantly; this version keeps
    the exact same per-channel call order (dr, di, ar, ai) and output layout.

    Returns
    -------
    ndarray of length 4 * len(channels):
        [dr averages..., di averages..., ar averages..., ai averages...]
    """
    feature_blocks = [
        data[DIFF_REAL].to_numpy(),
        data[DIFF_IMAG].to_numpy(),
        data[ABS_REAL].to_numpy(),
        data[ABS_IMAG].to_numpy(),
    ]
    per_block_averages = [[] for _ in feature_blocks]
    for ch in channels:
        y = data[ch].to_numpy().reshape(-1, )
        for averages, block in zip(per_block_averages, feature_blocks):
            mi = mutual_info_regression(block, y)
            # Average over the high-frequency range (bins 6 onward).
            averages.append(np.average(mi[6:, ]))
    return np.concatenate(per_block_averages)
def do_mutual_information(weights, previous, config, **kwargs):
    """Fill the square ``weights`` matrix with pairwise MI between kernels.

    Entry (i, j) is the mutual information between the flattened i-th and
    j-th slices of ``previous``. The mean of the filled matrix is printed.
    """
    assert weights.shape[0] == weights.shape[1]
    n_kernels = previous.shape[0]
    for row in range(n_kernels):
        # The flattened row kernel is reused across the whole inner loop.
        x = previous[row].flatten().reshape(-1, 1)
        for col in range(n_kernels):
            weights[row, col] = feature_selection.mutual_info_regression(
                x, previous[col].flatten())
    print(f'Kernel mean mutual information {np.mean(weights)}')
    return weights
def mutual_information_univariate_selection(X, y):
    """Given data instances (X) and their corresponding targets (y), print
    which features of X are most correlated with y according to a mutual
    information based univariate feature selection.

    Scores are normalised by the maximum before ranking.
    """
    scores = mutual_info_regression(X, y)
    scores /= np.max(scores)
    by_index = {idx: score for idx, score in enumerate(scores)}
    print('ranked features -- mutual information',
          sorted(by_index, key=by_index.get, reverse=True))
def MI_features_selection(self, y, x):
    """Return the indices of the ``self.m`` features of ``x`` with the
    highest mutual information against ``y``, best first."""
    scores = mutual_info_regression(x, y, discrete_features='auto',
                                    n_neighbors=3, copy=True,
                                    random_state=None)
    top_indices = scores.argsort()[-self.m:][::-1]
    return top_indices
def continuous_entropy(ys):
    """Estimate a continuous entropy for each attribute.

    Each attribute is scored by the MI-regression estimate of the attribute
    against itself (used here as an entropy proxy).

    Args:
        ys: np.array num_points x num_attributes

    Returns:
        np.array of length num_attributes.
    """
    num_attrs = ys.shape[1]
    entropies = np.zeros(num_attrs)
    for idx in tqdm(range(num_attrs)):
        column = ys[:, idx]
        entropies[idx] = mutual_info_regression(column.reshape(-1, 1), column)
    return entropies
def count_difference(array_1, array_2, show_img=False):
    """Compare two spectral curves via mutual information with position.

    The MI of each curve against its sample index is estimated; the squared
    difference between the two MI values, scaled by 1000, is returned.
    Optionally plots both curves first.
    """
    if show_img:
        plt.clf()
        plt.title("Porównywanie krzywych spektralnych, obliczanie różnicy")
        plt.plot(array_1, label="Array 1")
        plt.plot(array_2, label="Array 2")
        plt.legend()
        plt.axis('tight')
        plt.show()
    positions = np.arange(len(array_1)).reshape(-1, 1)
    mi_first = mutual_info_regression(positions, array_1)
    mi_second = mutual_info_regression(positions, array_2)
    return np.square(mi_first - mi_second) * 1000
imgy = imgy + dy + 20 os.remove(temppath) size(W, HEIGHT+dy+40) else: def pltshow(mplpyplot): mplpyplot.show() # nodebox section end np.random.seed(0) X = np.random.rand(1000, 3) y = X[:, 0] + np.sin(6 * np.pi * X[:, 1]) + 0.1 * np.random.randn(1000) f_test, _ = f_regression(X, y) f_test /= np.max(f_test) mi = mutual_info_regression(X, y) mi /= np.max(mi) plt.figure(figsize=(15, 5)) for i in range(3): plt.subplot(1, 3, i + 1) plt.scatter(X[:, i], y, edgecolor='black', s=20) plt.xlabel("$x_{}$".format(i + 1), fontsize=14) if i == 0: plt.ylabel("$y$", fontsize=14) plt.title("F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i]), fontsize=16) # plt.show() pltshow(plt)