def McOne(data, label, r):
    """McOne feature filter: keep features whose MIC with the label is at
    least r, then drop any feature that is better explained by a stronger,
    already-kept feature. `data` holds one candidate feature per ROW."""
    print("McOne start...")
    classLabel = label
    dataMat = data.values
    n = data.shape[0]      # number of candidate features (rows)
    micFC = [0] * n        # MIC of each feature with the class label
    Subset = [-1] * n      # indices of retained features
    numSubset = 0
    # First pass: keep features with MIC(feature, class) >= r.
    for i in range(n):
        m = MINE()
        m.compute_score(dataMat[i], classLabel)
        micFC[i] = m.mic()
        if micFC[i] >= r:
            Subset[numSubset] = i
            numSubset += 1
    Subset = Subset[:numSubset]
    Subset.sort(key=lambda x: micFC[x], reverse=True)
    # Second pass: drop feature q if some stronger feature e is at least as
    # informative about q as q is about the class (redundancy check).
    e = 0
    while e <= numSubset - 1:
        q = e + 1
        while q <= numSubset - 1:
            m = MINE()
            m.compute_score(dataMat[Subset[e]], dataMat[Subset[q]])
            if m.mic() >= micFC[Subset[q]]:
                # remove Subset[q] by shifting the tail left one slot
                for i in range(q, numSubset - 1):
                    Subset[i] = Subset[i + 1]
                numSubset -= 1
            else:
                q += 1
        e += 1
    return data.iloc[Subset[:numSubset]]
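# A minimal usage sketch for McOne above. Note the orientation it assumes:
# `data` holds one candidate feature per ROW and one sample per column, and
# `label` has one entry per sample. The toy data here is illustrative only.
import numpy as np
import pandas as pd
from minepy import MINE

rng = np.random.default_rng(0)
data = pd.DataFrame(rng.normal(size=(50, 200)))   # 50 features x 200 samples
label = rng.integers(0, 2, size=200)              # one class label per sample

kept = McOne(data, label, r=0.1)                  # keep features with MIC >= 0.1
print(kept.shape)                                 # rows = surviving features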
def fit(self, X, y):
    # Initialize phi (the selected set) and the candidate feature set.
    # If the number of features to select is not set, half are selected.
    n = self.n
    beta = self.beta
    verbose = self.verbose
    if n is None:
        n = X.shape[1] // 2  # half of the *features*, not the samples
    features = np.arange(X.shape[1]).tolist()
    # Seed phi with the single feature maximizing I(xi, y).
    best_mi = -np.inf
    X_hat = 0
    for xi in features:
        m = MINE()
        m.compute_score(X[:, xi], y)  # compute I(xi, y) and keep the best xi
        mi_xi_y = m.mic()
        if best_mi < mi_xi_y:
            best_mi = mi_xi_y
            X_hat = xi
    phi = [X_hat]
    features.remove(X_hat)
    # Greedily add the feature maximizing relevance minus beta * redundancy
    # against the features already in phi.
    while len(phi) < n:
        mi_scores = np.zeros(len(features))
        for xi_idx, xi in enumerate(features):
            m = MINE()
            m.compute_score(X[:, xi], y)  # relevance I(xi, y)
            mi_xi_y = m.mic()
            sum_mi_xi_xj = 0
            for xj in phi:
                # redundancy I(xi, xj) with each already-selected feature
                m = MINE()
                m.compute_score(X[:, xi], X[:, xj])
                sum_mi_xi_xj += m.mic()
            mi_scores[xi_idx] = mi_xi_y - beta * sum_mi_xi_xj
            if verbose >= 2:
                print("mi_score for xi:{xi} is {score}".format(
                    xi=xi, score=mi_scores[xi_idx]))
        X_hat = np.argmax(mi_scores)
        if verbose == 1:
            print("X_hat is {X_hat}".format(X_hat=X_hat))
        X_hat = features[X_hat]
        phi.append(X_hat)
        features.remove(X_hat)
    self.phi = phi
    self.features = features
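# The mRMR-style `fit` above is clearly a method of a selector class that
# carries `n`, `beta`, and `verbose`. A minimal wrapper sketch; the class name
# MRMRSelector and its defaults are assumptions, not from the original source.
import numpy as np
from minepy import MINE

class MRMRSelector:
    def __init__(self, n=None, beta=1.0, verbose=0):
        self.n = n              # number of features to keep (None = half)
        self.beta = beta        # weight of the redundancy penalty
        self.verbose = verbose

MRMRSelector.fit = fit          # attach the module-level fit defined above

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 10))
y = rng.integers(0, 2, size=100).astype(float)
sel = MRMRSelector(n=3)
sel.fit(X, y)
print(sel.phi)                  # indices of the three selected features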
def _evaluate_single(data, target_feature):
    mine = MINE(alpha=0.3, c=15)
    MICs = list()
    for i in range(data.shape[1]):
        mine.compute_score(target_feature, data[:, i])
        MICs.append(mine.mic())
    return MICs
def calculate_mic(df, y):
    max_info = MINE()
    mics = {}
    for column in df.columns:
        max_info.compute_score(df.loc[:, column], y.values)
        mics[column] = max_info.mic()
    return pd.Series(mics)
def MIC_plot(self, x, y, numRows, numCols, plotNum, x_name, y_name, filename):
    # Plot the Pearson correlation together with MINE's maximal information
    # coefficient. Useful when categorical features are part of the model
    # (otherwise Pearson, Kendall, and Spearman alone may suffice).
    print("Pearson product-moment correlation coefficients "
          "np.corrcoef(x={}, y={}): {}".format(x_name, y_name,
                                               np.corrcoef(x, y)))
    r = np.around(np.corrcoef(x, y)[0, 1], 1)  # Pearson correlation coefficient
    # TODO: compute the covariance matrix for each one-hot encoded level of
    # the categorical feature with MINE's maximal information coefficient.
    fig = plt.figure(figsize=(33, 5), frameon=True)
    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    mine.compute_score(x, y)
    mic = np.around(mine.mic(), 1)
    ax = plt.subplot(numRows, numCols, plotNum)
    ax.set_xlim(xmin=min(x) - 1, xmax=max(x) + 1)  # pad the axes by one unit
    ax.set_ylim(ymin=min(y) - 1, ymax=max(y) + 1)
    ax.set_title('Pearson r=%.1f\nMIC=%.1f Features %s and %s in %s'
                 % (r, mic, x_name, y_name, filename), fontsize=10)
    ax.set_frame_on(False)
    ax.axes.get_xaxis().set_visible(True)
    ax.axes.get_yaxis().set_visible(True)
    ax.plot(x, y, '*')
    plt.xlabel('X')
    plt.ylabel('Y')
    return ax
def mic(dataset: pd.DataFrame, labels: np.ndarray) -> dict:
    score = {feature: None for feature in dataset}
    for feature, x in dataset.items():
        mine = MINE()
        mine.compute_score(x.values.ravel(), labels)
        score[feature] = mine.mic()
    return score
def find_best_n_features_mic(n=8, out_path=''):
    # compute the MIC of every feature column with the label
    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    mic_all = []
    for i in range(x.shape[1]):
        xi = x[:, i]
        mine.compute_score(xi, y)
        mic_all.append(mine.mic())
    # pick out the n largest, masking each maximum with NaN as it is taken
    best_n = []
    best_n_mic = []
    for i in range(n):
        best_position = np.nanargmax(mic_all)
        best_n.append(best_position)
        best_n_mic.append(mic_all[best_position])
        mic_all[best_position] = np.nan
    print('Found', n, 'features with largest MIC, whose positions are:')
    print(best_n)
    print()
    print('The MIC of these features are:')
    print(best_n_mic)
    print()
    best_features = x[:, best_n]
    print('Shape of features selected:', best_features.shape)
    best_features_with_label = pd.DataFrame(
        np.concatenate([best_features, y.reshape(len(y), 1)], axis=1))
    out_path = out_path + 'mic_best_' + str(n) + '.csv'
    best_features_with_label.to_csv(out_path, header=None, index=None)
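# Design note on find_best_n_features_mic: the nanargmax-and-mask loop is
# equivalent to a single argsort over the scores. A self-contained sketch with
# toy values (the numbers are illustrative only):
import numpy as np

mic_all = [0.2, 0.9, 0.1, 0.7]
order = np.argsort(mic_all)[::-1][:2]               # positions of the 2 largest
print(order.tolist(), [mic_all[i] for i in order])  # -> [1, 3] [0.9, 0.7]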
def get_mic(x, y):
    """Return the maximal information coefficient and the Pearson r value."""
    r = np.corrcoef(x, y)[0, 1]
    mine = MINE(alpha=0.4, c=15, est='mic_e')
    mine.compute_score(x, y)
    mic = mine.mic()
    return mic, r
def micfliter(data, rate):
    """
    MIC feature-selection filter.
    Arguments:
        data: pandas DataFrame of features
        rate: float in the open interval (0, 1)
    Return:
        List of column names to drop
    """
    m = MINE()
    micmatrix = []
    for colx in data.columns:
        micResult = []
        for coly in data.columns:
            m.compute_score(np.array(data[coly]), np.array(data[colx]))
            micResult.append(m.mic())
        micmatrix.append(micResult)
    micmatrix = pd.DataFrame(micmatrix, columns=data.columns)
    # keep the strict upper triangle so each pair is tested only once
    upper = micmatrix.where(np.triu(np.ones(micmatrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > rate)]
    return to_drop
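# Usage sketch for micfliter: with the pairwise-MIC threshold at 0.8, a column
# that is an almost deterministic function of another should be flagged for
# dropping. The toy frame is illustrative only.
import numpy as np
import pandas as pd
from minepy import MINE

rng = np.random.default_rng(0)
df = pd.DataFrame({'a': rng.normal(size=200)})
df['b'] = 2 * df['a'] + rng.normal(scale=0.01, size=200)  # near-copy of 'a'
df['c'] = rng.normal(size=200)                            # independent column

print(micfliter(df, 0.8))  # expected to contain 'b'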
def mutual_infomation_rank(col_names, X, y, topK=10):
    '''
    Mutual-information feature importance ranking.
    :param col_names: feature names, list
    :param X: feature matrix, numpy 2D array
    :param y: label vector, numpy array
    :param topK: number of top-ranked features to print
    :return: feature DataFrame sorted by score
    '''
    # Mutual information is slow to compute, so subsample before scoring.
    original_size = len(y)
    sampling_size = 2000 if original_size > 2000 else original_size
    X, y = resample(X, y, random_state=0, n_samples=sampling_size)
    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    scores = []
    for i in range(0, len(col_names)):
        mine.compute_score(X[:, i], y)
        scores.append(mine.mic())
    result_df = pd.DataFrame({'name': col_names, 'mutual_information': scores})
    result_df = result_df[['name', 'mutual_information']].sort_values(
        'mutual_information', ascending=False)
    print("size={m} sampling={s} features={n} top{k} rank for MINE testing:"
          .format(m=original_size, s=sampling_size, n=len(col_names), k=topK))
    print(result_df.head(topK))
    return result_df
def mic_method(data_x, data_y, feat_labels, mic_threshold, is_split=1):
    # Fill missing values.
    # data_x = data_x.fillna(data_x.mean())
    data_x = data_x.fillna(0)
    data_x = data_x.values
    # Min-max scaling; inputs must be free of NaNs, output is an ndarray.
    scaler = MinMaxScaler()
    data_x = scaler.fit_transform(data_x)
    # Turn the label DataFrame into a plain ndarray for the model.
    data_y = data_y.values
    if is_split == 1:
        # Split out the one-hot columns first.
        # onehot_data_x_left = data_x[:, :30]
        data_x_mid = data_x[:, 30:454]
        # onehot_data_x_right = data_x[:, 454:]
    else:
        data_x_mid = data_x
    # Maximal-information-coefficient scoring (intended for classification).
    # MINE.compute_score expects 1-D arrays, so score column by column.
    for i in range(data_x_mid.shape[1]):
        m = MINE()
        m.compute_score(data_x_mid[:, i], data_y)
        print(m.mic())
    # Select the k best features. Note: the selection itself still uses the
    # chi-squared statistic, not the MIC scores printed above.
    selector = SelectKBest(chi2, k=mic_threshold).fit(data_x_mid, data_y)
    selected_data_x = selector.transform(data_x_mid)
    return selected_data_x, data_y
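# mic_method above prints MIC scores but still selects with chi2. If the
# intent is to select by MIC itself, SelectKBest accepts any score function
# returning (scores, p-values); a sketch under that assumption (the helper
# name mic_score is hypothetical):
import numpy as np
from minepy import MINE
from sklearn.feature_selection import SelectKBest

def mic_score(X, y):
    """Per-column MIC scores plus dummy p-values for SelectKBest."""
    scores = []
    for i in range(X.shape[1]):
        m = MINE()
        m.compute_score(X[:, i], y)
        scores.append(m.mic())
    return np.array(scores), np.zeros(len(scores))

# selected = SelectKBest(mic_score, k=mic_threshold).fit_transform(X, y)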
def mutual_information(self, X, Y, title=None, nbins_X=50, nbins_Y=50,
                       noise_sigma='all'):
    no_nans_idx = np.logical_not(np.logical_or(np.isnan(X), np.isnan(Y)))
    Xq, _, _ = pyentropy.quantise(X[no_nans_idx], nbins_X)
    Yq, _, _ = pyentropy.quantise(Y[no_nans_idx], nbins_Y)
    s = pyentropy.DiscreteSystem(Yq, (1, nbins_Y), Xq, (1, nbins_X))
    s.calculate_entropies()
    # MINE (use the same NaN-filtered samples as the discrete estimate)
    mine = MINE()
    mine.compute_score(X[no_nans_idx], Y[no_nans_idx])
    # Linear regression
    slope, intercept, r, p, stderr = \
        scipy.stats.linregress(X[no_nans_idx], Y[no_nans_idx])
    if title is not None:
        print(title)
    print("  MIC/MI/r^2/p/slope for %s:\t%.3f\t%.3f\t%s\t%s\t%s"
          % (noise_sigma, mine.mic(), s.I(), r**2, p, slope))
def mic(points):
    """MIC between the two coordinate columns of an (n, 2) point array."""
    points = np.transpose(points)
    mine = MINE()
    mine.compute_score(points[0], points[1])
    return mine.mic()
def MicEvaluate(dataX, dataY, name, pre_indices):
    '''
    Compute the maximal information coefficient between each condition
    attribute and the decision attribute.
    :param dataX: feature matrix
    :param dataY: decision attribute (labels)
    :param name: feature names
    :return: original indices, MIC array, name-to-MIC mapping
    '''
    dataY = dataY.reshape(1, -1)[0]
    nFeatures = len(dataX[0])
    print("Number of input features:", nFeatures)
    coorArray = []
    mine = MINE(alpha=0.6, c=15)
    for i in range(0, nFeatures):
        l = [x[i] for x in dataX]
        mine.compute_score(l, dataY)
        coorArray.append(abs(mine.mic()))
    print("MIC of each feature kept from the previous layer:", coorArray)
    coorIndex = np.argsort(coorArray)
    coorIndex_ = []  # map back to the original feature indices
    for i in coorIndex:
        coorIndex_.append(pre_indices[i])
    coorArray = np.array(coorArray)
    print("MIC coefficients:")
    print("Features:", dict(zip(name[coorIndex_], coorArray[coorIndex])))
    name_coorArray = dict(zip(name[coorIndex_], coorArray[coorIndex]))
    return coorIndex_, coorArray, name_coorArray
def calMIC(data):
    # MIC between the miles and weight columns, for vehicles 2 through 6
    for i in range(5):
        mine = MINE(alpha=0.6, c=15)
        miles = data[data.veh == (i + 2)].iloc[:, 1]
        weight = data[data.veh == (i + 2)].iloc[:, 2]
        mine.compute_score(miles, weight)
        print("Without noise:", "MIC", mine.mic())
def calculateMIC(dataFileArray, data_mark=None, neadNorm=False):
    mic_map = {}
    for dataFileName in dataFileArray:
        if data_mark is None:
            data_mark = DATA_MARK
        _fileName = os.path.join(data_mark, dataFileName)
        student_data, headerArray = load_data_from_file(_fileName)
        _score_map = get_final_score_map()
        _score_array = []
        for _student_record in student_data:
            _score_array.append(_score_map[_student_record[0]])
        featureCount = len(headerArray) - 1
        if neadNorm:
            _score_array = normizeDataSet(_score_array)
        # compute the MIC of each feature column with the final score
        m = MINE()
        for index in range(1, featureCount + 1):
            dataArray = getOneColumn(student_data, index)
            if neadNorm:
                dataArray = normizeDataSet(dataArray)
            m.compute_score(dataArray, _score_array)
            mic_map[headerArray[index]] = m.mic()
    # print only the features scoring above the mean MIC
    sorted_list = sorted(mic_map.items(), key=lambda i: i[1], reverse=True)
    threshold = np.mean(list(mic_map.values()))
    for header, value in sorted_list:
        if value > threshold:
            print(header, value)
def get_correlation(dataset, target, features=set()):
    if target is None:
        raise ValueError('get_correlation() needs a target column')
    if not isinstance(dataset, pd.DataFrame):
        dataset = pd.DataFrame(dataset)
    if not features:
        features = set(dataset.columns)
    numerical = {}
    text = {}
    num_types = (np.dtype('float64'), np.dtype('int64'), np.dtype('bool'))
    target = dataset[target]
    mine = MINE()
    for col in features:
        if dataset.dtypes[col] in num_types:
            if dataset.dtypes[col] == np.dtype('bool'):
                dataset[col] = dataset[col].astype(int, copy=False)
            mine.compute_score(dataset[col], target)
            numerical[col] = mine.mic()
        else:
            text[col] = np.nan
    return {
        'numerical': dict(sorted(numerical.items(), key=lambda d: d[1],
                                 reverse=True)),
        'object': dict(sorted(text.items(), key=lambda d: d[1],
                              reverse=True)),
    }
def compute_MIC(x, y, alpha=0.6, c=15, all_metrics=False):
    from minepy import MINE
    mine = MINE(alpha, c)
    mine.compute_score(x, y)
    if all_metrics:
        return mine.mic(), mine
    else:
        return mine.mic()
def MIC(self, x, y):
    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    mine.compute_score(x, y)
    return mine.mic()
def mine():
    # Iterate over a snapshot so that removing a column does not skip
    # elements of `uncorrelated` mid-iteration.
    for column in tqdm(list(uncorrelated), desc="Running MINE test",
                       dynamic_ncols=True, leave=False):
        mine = MINE()
        mine.compute_score(epigenomes[column].values.ravel(),
                           labels.values.ravel())
        score = mine.mic()
        if score >= correlation_threshold:
            uncorrelated.remove(column)
def MIC():
    train_x, train_y = MLdata('new_dataset.csv')
    mic = MINE()
    l = train_x.shape[1]
    print(l)
    for i in range(l):
        mic.compute_score(train_x[:, i], train_y)
        print(i, mic.mic())
def feature_scoring(X, Y):
    names = ["x%s" % i for i in range(1, 37)]
    ranks = {}
    X = X.values[:, :]

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    # RandomizedLasso requires scikit-learn < 0.21, where it still exists
    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

    # stop the search when 5 features are left (they will get equal scores)
    rfe = RFE(lr, n_features_to_select=5)
    rfe.fit(X, Y)
    ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)

    rf = RandomForestRegressor()
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f, names)

    print('start MIC')
    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        mic_scores.append(mine.mic())
        print(i)
    ranks["MIC"] = rank_to_dict(mic_scores, names)
    print('finish MIC')

    # mean rank across all methods, printed as a tab-separated table
    r = {}
    for name in names:
        r[name] = round(
            np.mean([ranks[method][name] for method in ranks.keys()]), 2)
    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")
    print("\t%s" % "\t".join(methods))
    for name in names:
        print("%s\t%s" % (name, "\t".join(
            map(str, [ranks[method][name] for method in methods]))))
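# feature_scoring relies on a rank_to_dict helper that is not shown in this
# snippet. A sketch of the usual implementation (min-max scale the scores to
# [0, 1] and round), modeled on the feature-ranking recipe this code appears
# to follow; treat it as an assumption about the missing helper:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def rank_to_dict(ranks, names, order=1):
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
    return dict(zip(names, (round(float(r), 2) for r in ranks)))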
def mic(x, y):
    """
    :param x: 1-D feature vector
    :param y: 1-D target vector
    :return: (MIC score, 0.5); the fixed 0.5 stands in for a p-value so the
        function matches the (score, p-value) shape of scipy-style tests
    """
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)
def mic_hq(X, y, cut=0.2):
    from minepy import MINE
    m = MINE()
    nf = X.shape[1]
    subs = np.array([False] * nf)
    for i in range(nf):
        m.compute_score(X[:, i], y)
        subs[i] = (m.mic() < cut)  # flag features whose MIC falls below the cut
    return subs
def _entropy_select(self):
    """Mutual-information (MIC) filter."""
    m = MINE()
    # one score per feature (column of self.x), not per sample
    mic_array = np.zeros(self.x.shape[1])
    for i, x in enumerate(self.x.T):
        m.compute_score(x, self.y)
        mic_array[i] = m.mic()
    self._get_top_k_ids(mic_array)
def toolkit_mic(arr0, arr1, alpha=0.6, c=15):
    """MIC"""
    np_temp0 = np.array(arr0)
    np_temp1 = np.array(arr1)
    # pass the caller's parameters through instead of re-hard-coding them
    mine = MINE(alpha=alpha, c=c, est="mic_approx")
    mine.compute_score(np_temp0, np_temp1)
    return mine.mic()
def MIC(X, y):
    mics = []
    for i in range(X.shape[1]):
        m = MINE()
        m.compute_score(X[:, i], y)
        mics.append(m.mic())
    return mics
def main():
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    root = 0
    # Collect column data from every run, keeping the 10000-50000 time window.
    collist = []
    for colname in glob.glob("./run*/COL*"):
        for line in open(colname):
            if "#" in line:
                continue
            line = line.split()
            t = float(line[0])
            if t < 10000.0:
                continue
            elif 50000.0 < t:
                break
            collist.append(line)
    # For every column pair, average MIC and TIC over 10 random subsamples
    # of 10000 rows each.
    for i in range(len(collist[0]) - 1):
        for j in range(i + 1, len(collist[0]) - 1):
            miclist = []
            ticlist = []
            for _ in range(10):
                colpart = random.sample(collist, 10000)
                x = np.array([a[i + 1] for a in colpart], dtype=float)
                y = np.array([a[j + 1] for a in colpart], dtype=float)
                mine = MINE(est="mic_e")
                mine.compute_score(x, y)
                miclist.append(mine.mic())
                ticlist.append(mine.tic())
            if rank == root:
                print("%s,%s, %s, %s" % (i, j, np.mean(miclist),
                                         np.mean(ticlist)), flush=True)
                with open("./minedata.csv", "a") as wf:
                    wf.write("%s, %s, %s, %s\n" % (i, j, np.mean(miclist),
                                                   np.mean(ticlist)))
def mic(X, Y):
    new_X, new_Y = remove_pairs_with_a_missing(X, Y)
    try:
        from minepy import MINE
    except ImportError:
        sys.exit("CRITICAL ERROR:2 Unable to import minepy package." +
                 " Please check your install.")
    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(new_X, new_Y)
    return mine.mic(), None
def MIC(features, labels):
    mine = MINE()
    mic_scores = []
    labels = labels.flatten()
    for i in range(features.shape[1]):
        mine.compute_score(features[:, i], labels)
        mic_scores.append(mine.mic())
    return mic_scores