def chooseIndependantInputVariables(inArr):
    """Select the columns of ``inArr`` that do not duplicate an earlier column.

    Each column ``i`` is standardised with ``preprocessing.scale`` and
    compared, via the MIC of a ``MINE(alpha=0.6, c=15)`` estimator, with every
    earlier column ``j < i`` (including earlier columns that were themselves
    rejected). A column is dropped when any of those MIC values is >= 0.99.
    All pairs are evaluated and reported on stdout, even after a column has
    already been rejected.

    :param inArr: 2-D array indexed as ``inArr[:, column]``.
    :return: list of the selected column indexes, in ascending order.
    """
    selected_input_indexes = []
    for i in range(inArr.shape[1]):
        doSelect = True
        if i > 0:
            # Work that only depends on column i: do it once, not once per pair.
            inputFeatureName1 = getInputParameterNameFromFeatureIndex(i)
            x_scaled = preprocessing.scale(inArr[:, i])
        for j in range(i):
            inputFeatureName2 = getInputParameterNameFromFeatureIndex(j)
            y_scaled = preprocessing.scale(inArr[:, j])
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(x_scaled, y_scaled)
            mic_value = mine.mic()
            # Single pre-formatted argument so the call prints the same text
            # under both Python 2 and Python 3.
            print("Correlation between  %s %s  is  %s"
                  % (inputFeatureName1, inputFeatureName2, mic_value))
            if float(mic_value) >= 0.99:
                doSelect = False
                print("\n ***** ==> will NOT select  %s  as it correlates with  %s \n"
                      % (inputFeatureName1, inputFeatureName2))
        if doSelect:
            selected_input_indexes.append(i)
    return selected_input_indexes
def McOne(data, label, r):
    """Filter the rows of ``data`` by relevance to ``label`` and mutual redundancy.

    Step 1 keeps every row whose MIC with ``label`` is at least ``r`` and
    orders the kept rows by that MIC, highest first. Step 2 walks the ordered
    candidates and removes a later candidate whenever its MIC with an earlier
    candidate is at least as large as its own MIC with ``label``.

    :param data: DataFrame whose rows (``data.values[i]``) are the features.
    :param label: class-label vector scored against each row.
    :param r: minimum MIC with ``label`` for a row to be considered.
    :return: ``data.iloc[...]`` restricted to the surviving rows.
    """
    print("McOne start...")
    rows = data.values
    row_count = data.shape[0]

    # Relevance of every row to the class label.
    relevance = []
    for row_idx in range(row_count):
        scorer = MINE()
        scorer.compute_score(rows[row_idx], label)
        relevance.append(scorer.mic())

    candidates = [idx for idx in range(row_count) if relevance[idx] >= r]
    candidates.sort(key=lambda idx: relevance[idx], reverse=True)

    # Redundancy elimination: a removed candidate shifts the tail left, so the
    # probe position is only advanced when nothing was removed.
    anchor = 0
    while anchor < len(candidates):
        probe = anchor + 1
        while probe < len(candidates):
            scorer = MINE()
            scorer.compute_score(rows[candidates[anchor]], rows[candidates[probe]])
            if scorer.mic() >= relevance[candidates[probe]]:
                del candidates[probe]
            else:
                probe += 1
        anchor += 1
    return data.iloc[candidates]
def compute_MIC(x, y, alpha=0.6, c=15, all_metrics=False):
    """Compute the maximal information coefficient between ``x`` and ``y``.

    :param x: first variable.
    :param y: second variable.
    :param alpha: forwarded positionally as the first ``MINE`` argument.
    :param c: forwarded positionally as the second ``MINE`` argument.
    :param all_metrics: when true, also return the fitted ``MINE`` object so
        the caller can read its other statistics.
    :return: the MIC value, or ``(mic, mine)`` when ``all_metrics`` is true.
    """
    from minepy import MINE

    estimator = MINE(alpha, c)
    estimator.compute_score(x, y)
    if not all_metrics:
        return estimator.mic()
    return estimator.mic(), estimator
def fit(self,X,y):
    """Greedily select features by MIC relevance minus ``beta`` times redundancy.

    The first feature is the one with the highest MIC against ``y``. Every
    following round scores each remaining feature ``xi`` as
    ``MIC(xi, y) - beta * sum(MIC(xi, xj) for xj already selected)`` and
    moves the best one into the selection.

    Reads ``self.n`` (number of features to select; ``None`` means half of
    the columns of ``X``), ``self.beta`` and ``self.verbose``. Stores the
    selected column indexes in ``self.phi`` (selection order) and the
    remaining ones in ``self.features``.

    :param X: 2-D array indexed as ``X[:, column]``.
    :param y: target vector.
    """
    n = self.n
    beta = self.beta
    verbose = self.verbose
    n_columns = X.shape[1]
    if n is None:
        # Half of the *features* (columns), not half of the samples.
        n = n_columns // 2
    # Never ask for more features than exist; the loop below would otherwise
    # call argmax on an empty score array.
    n = min(n, n_columns)

    features = list(range(n_columns))

    # I(xi, y) does not change between rounds: compute it once per feature.
    relevance = {}
    for xi in features:
        m = MINE()
        m.compute_score(X[:, xi], y)
        relevance[xi] = m.mic()

    # Seed with the most relevant feature (first one on ties).
    X_hat = max(features, key=relevance.__getitem__)
    phi = [X_hat]
    features.remove(X_hat)

    # Running sum of I(xi, xj) over the selected xj; only the newest selected
    # feature has to be added each round.
    redundancy = dict.fromkeys(features, 0)
    while len(phi) < n:
        xj = phi[-1]
        mi_scores = np.zeros(len(features))
        for xi_idx, xi in enumerate(features):
            m = MINE()
            m.compute_score(X[:, xi], X[:, xj])
            redundancy[xi] += m.mic()
            mi_scores[xi_idx] = relevance[xi] - beta * redundancy[xi]
            if verbose >= 2:
                print("mi_scores for xi:{xi}, xj:{xj} is {mi_scores}".format(
                    xi=xi, xj=xj, mi_scores=mi_scores[xi_idx]))
        best_position = np.argmax(mi_scores)
        if verbose == 1:
            print("X_hat is {X_hat}".format(X_hat=best_position))
        X_hat = features[best_position]
        phi.append(X_hat)
        features.remove(X_hat)

    self.phi = phi
    self.features = features
def calculate_mic(df, y):
    """Return a Series mapping every column of ``df`` to its MIC with ``y``.

    :param df: DataFrame whose columns are scored one by one.
    :param y: object exposing ``.values`` (the target passed to the estimator).
    :return: ``pd.Series`` indexed by column name.
    """
    estimator = MINE()

    def _column_mic(column):
        # The same estimator is reused; each compute_score replaces the last.
        estimator.compute_score(df.loc[:, column], y.values)
        return estimator.mic()

    return pd.Series({column: _column_mic(column) for column in df.columns})
def get_correlation(dataset, target, features=None):
    """Score columns of ``dataset`` against the ``target`` column using MIC.

    Columns of dtype float64, int64 or bool are scored (bool columns are
    converted to int on a copy, so the caller's data is left untouched).
    Every other column is reported with ``NaN``.

    :param dataset: DataFrame, or anything ``pd.DataFrame`` accepts.
    :param target: name of the target column; must not be ``None``.
    :param features: iterable of column names to score; when empty or
        omitted, every column of ``dataset`` is used.
    :return: ``{'numerical': {...}, 'object': {...}}`` where ``numerical``
        maps column -> MIC sorted from highest to lowest and ``object`` maps
        each skipped column to ``NaN``.
    :raises ValueError: if ``target`` is ``None``.
    """
    if target is None:
        raise ValueError('corr() need target value')
    if not isinstance(dataset, pd.DataFrame):
        dataset = pd.DataFrame(dataset)
    if not features:
        features = set(dataset.columns)

    num_types = (np.dtype('float64'), np.dtype('int64'), np.dtype('bool'))
    bool_type = np.dtype('bool')
    target_values = dataset[target]
    mine = MINE()

    numerical = {}
    text = {}
    for col in features:
        col_dtype = dataset.dtypes[col]
        if col_dtype not in num_types:
            text[col] = np.nan
            continue
        values = dataset[col]
        if col_dtype == bool_type:
            # Work on a converted copy instead of rewriting the caller's column.
            values = values.astype(int)
        mine.compute_score(values, target_values)
        numerical[col] = mine.mic()

    return {
        'numerical': dict(sorted(numerical.items(), key=lambda d: d[1], reverse=True)),
        # All values are NaN, so there is no meaningful order to sort by.
        'object': dict(text),
    }
def _evaluate_single(data, target_feature):
    """Return the MIC of ``target_feature`` against every column of ``data``.

    Uses one ``MINE(alpha=0.4, c=15)`` estimator for all columns.

    :param data: 2-D array indexed as ``data[:, column]``.
    :param target_feature: vector compared with each column.
    :return: list of MIC values, one per column, in column order.
    """
    estimator = MINE(alpha=0.4, c=15)

    def _mic_with_target(column_index):
        estimator.compute_score(target_feature, data[:, column_index])
        return estimator.mic()

    return [_mic_with_target(column_index) for column_index in range(data.shape[1])]
def get_mic(x, y):
    """Return ``(mic, r)`` for the two variables.

    ``r`` is the off-diagonal entry of ``np.corrcoef(x, y)`` (Pearson r) and
    ``mic`` comes from ``MINE(alpha=0.4, c=15, est='mic_e')``.
    """
    pearson_r = np.corrcoef(x, y)[0, 1]
    estimator = MINE(alpha=0.4, c=15, est='mic_e')
    estimator.compute_score(x, y)
    return estimator.mic(), pearson_r
def _evaluate_single(data, target_feature):
    """Score ``target_feature`` against each column of ``data`` with MIC.

    A single ``MINE(alpha=0.3, c=15)`` estimator is reused across columns.

    :param data: 2-D array indexed as ``data[:, column]``.
    :param target_feature: vector compared with each column.
    :return: list with one MIC value per column, in column order.
    """
    scorer = MINE(alpha=0.3, c=15)
    mic_per_column = []
    column = 0
    column_count = data.shape[1]
    while column < column_count:
        scorer.compute_score(target_feature, data[:, column])
        mic_per_column += [scorer.mic()]
        column += 1
    return mic_per_column
def MIC_plot(self, x, y, numRows, numCols, plotNum, x_name, y_name, filename):
    """Scatter-plot ``x`` against ``y`` titled with their Pearson r and MIC.

    Prints the full ``np.corrcoef(x, y)`` matrix, opens a new 33x5 figure and
    draws the scatter into subplot ``plotNum`` of a ``numRows`` x ``numCols``
    grid. Both statistics are rounded to one decimal for the title.

    TODO (kept from the original): compute the covariance matrix for each
    one-hot encoded variable of a categorical feature with MINE's MIC.

    :param x: values for the horizontal axis.
    :param y: values for the vertical axis.
    :param numRows: number of subplot rows.
    :param numCols: number of subplot columns.
    :param plotNum: index of the subplot to draw into.
    :param x_name: label of ``x`` used in the printout and the title.
    :param y_name: label of ``y`` used in the printout and the title.
    :param filename: name shown at the end of the title.
    :return: the matplotlib axes that were drawn on.
    """
    # Compute the correlation matrix once; it feeds both the printout and r.
    corr_matrix = np.corrcoef(x, y)
    # Single pre-formatted argument: same text under Python 2 and Python 3.
    print("Pearson product-moment correlation coefficients np.corrcoef(x= %s , y= %s ):  %s"
          % (x_name, y_name, corr_matrix))
    r = np.around(corr_matrix[0, 1], 1)

    # A fresh figure becomes the current one, so plt.subplot below draws on it.
    plt.figure(figsize=(33, 5), frameon=True)

    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    mine.compute_score(x, y)
    mic = np.around(mine.mic(), 1)

    ax = plt.subplot(numRows, numCols, plotNum)
    # NOTE(review): both limits are shifted by +1, which hides points below
    # min + 1 -- confirm whether the lower bound was meant to be min - 1.
    ax.set_xlim(xmin=min(x)+1, xmax=max(x)+1)
    ax.set_ylim(ymin=min(y)+1, ymax=max(y)+1)
    ax.set_title('Pearson r=%.1f\nMIC=%.1f Features %s and %s in %s' % (r, mic, x_name, y_name, filename),fontsize=10)
    ax.set_frame_on(False)
    ax.axes.get_xaxis().set_visible(True)
    ax.axes.get_yaxis().set_visible(True)
    ax.plot(x, y, '*')
    plt.xlabel('X')
    plt.ylabel('Y')
    return ax
def calMIC(data):
    """Print the MIC between columns 1 and 2 of ``data`` for ``veh`` values 2..6.

    :param data: DataFrame with a ``veh`` column; for every ``veh`` value the
        matching rows' positional columns 1 and 2 are scored against each other.
    """
    for vehicle_id in range(2, 7):
        estimator = MINE(alpha=0.6, c=15)
        miles = data[data.veh == vehicle_id].iloc[:, 1]
        weight = data[data.veh == vehicle_id].iloc[:, 2]
        estimator.compute_score(miles, weight)
        print("Without noise:", "MIC", estimator.mic())
def mic(dataset: pd.DataFrame, labels: np.array) -> dict:
    """Return ``{feature: MIC(feature, labels)}`` for every column of ``dataset``.

    A fresh ``MINE`` estimator is created per column and fed the column's
    flattened values together with ``labels``.
    """

    def _score(column):
        estimator = MINE()
        estimator.compute_score(column.values.ravel(), labels)
        return estimator.mic()

    return {feature: _score(column) for feature, column in dataset.items()}
def calculateCorrelationBetweenVectors(x,y):
    """Return the MIC between the sorted, truncated and standardised inputs.

    Both vectors are sorted independently, cut to the first
    ``min(len(x), len(y)) - 1`` values, standardised with
    ``preprocessing.scale`` and scored with ``MINE(alpha=0.6, c=15)``.

    NOTE(review): sorting each vector on its own discards the original
    pairing of the samples, and the ``- 1`` drops one value even when both
    inputs have the same length -- confirm both are intended.

    :param x: first vector.
    :param y: second vector.
    :return: the MIC as a ``float``.
    """
    keep = min(len(x), len(y)) - 1

    x_prepared = preprocessing.scale(np.sort(x)[:keep])
    y_prepared = preprocessing.scale(np.sort(y)[:keep])

    estimator = MINE(alpha=0.6, c=15)
    estimator.compute_score(x_prepared, y_prepared)
    return float(estimator.mic())
def performMIC(transposed_list, cutoff, p):
    """Collect every pair of entries whose MIC exceeds ``cutoff``.

    Each unordered pair ``(a, b)`` with ``a < b`` is scored with
    ``MINE(alpha=0.6, c=15)``. Pairs scoring strictly above ``cutoff`` yield a
    record with 1-based position labels prefixed by ``p``.

    :param transposed_list: indexable sequence of vectors to compare pairwise.
    :param cutoff: threshold, converted with ``float``.
    :param p: label used as prefix and as the ``p1``/``p2`` value.
    :return: list of dicts with keys ``x``, ``y``, ``p1``, ``p2``, ``weight``
        (``weight`` is the MIC formatted with three decimals).
    """
    entry_count = len(transposed_list)
    records = []
    for first in range(entry_count - 1):
        for second in range(first + 1, entry_count):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(transposed_list[first], transposed_list[second])
            score = estimator.mic()
            if not score > float(cutoff):
                continue
            records.append({
                'x': p + '_' + str(first + 1),
                'y': p + '_' + str(second + 1),
                'p1': p,
                'p2': p,
                'weight': format(score, '.3f'),
            })
    return records
def perform_mic_1p(p_sequences, p, cutoff=0.5, out_folder=''):
    """Score all position pairs within one set of sequences and write them out.

    The sequences are turned into a matrix and transposed so that each entry
    holds one position across all sequences. Every pair of positions is scored
    with ``MINE(alpha=0.6, c=15)``; pairs above ``cutoff`` are recorded and the
    records are passed to ``write_mics_to_csv``.

    :param p_sequences: iterable of sequences (each converted with ``list``).
    :param p: label used as prefix and as the ``p1``/``p2`` value.
    :param cutoff: threshold, converted with ``float``.
    :param out_folder: forwarded to ``write_mics_to_csv``.
    :return: list of dicts with keys ``x``, ``y``, ``p1``, ``p2``, ``weight``.
    """
    positions = transpose(array([list(sequence) for sequence in p_sequences])).tolist()
    position_count = len(positions)

    mic_scores = []
    for left in range(position_count - 1):
        for right in range(left + 1, position_count):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(positions[left], positions[right])
            score = estimator.mic()
            if score > float(cutoff):
                mic_scores.append({
                    'x': p + '_' + str(left + 1),
                    'y': p + '_' + str(right + 1),
                    'p1': p,
                    'p2': p,
                    'weight': format(score, '.3f'),
                })

    write_mics_to_csv(mics=mic_scores, p1=p, p2=p, cutoff=cutoff, out_folder=out_folder)
    return mic_scores
def feature_scoring(X, Y):
    """Rank the columns of ``X`` against ``Y`` with several methods and print a table.

    Methods: linear regression, ridge, lasso, randomized lasso ("Stability"),
    RFE, random forest importances, F-regression ("Corr.") and MIC. Each
    method's scores go through ``rank_to_dict``; a "Mean" column averages the
    methods per feature (rounded to two decimals). The table is printed
    tab-separated; nothing is returned.

    :param X: DataFrame of features (``X.values`` is used).
    :param Y: regression target.
    """
    ranks = {}
    X = X.values[:, :]
    # One generated name per column (x1, x2, ...) instead of a fixed 36.
    names = ["x%s" % i for i in range(1, X.shape[1] + 1)]

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

    # Stop the search when 5 features are left (they will get equal scores).
    rfe = RFE(lr, n_features_to_select=5)
    rfe.fit(X, Y)
    # Materialise the map: on Python 3 it is a lazy iterator, not a sequence.
    ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)

    rf = RandomForestRegressor()
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f, names)

    print('startMIC')
    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        mic_scores.append(mine.mic())
        print(i)
    ranks["MIC"] = rank_to_dict(mic_scores, names)
    print('finish MIc')

    # Average over the methods collected so far (before "Mean" is added).
    r = {}
    for name in names:
        r[name] = round(
            np.mean([ranks[method][name] for method in ranks.keys()]), 2)
    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    print("\t%s" % "\t".join(methods))
    for name in names:
        print("%s\t%s" % (name, "\t".join(
            map(str, [ranks[method][name] for method in methods]))))
def _read_first_column(path, echo_rows=False):
    """Return the first value of every non-empty CSV row in ``path`` as floats.

    Every cell of a row is converted with ``float`` (so a non-numeric cell
    anywhere in the row raises ``ValueError``); only the first is kept.
    """
    values = []
    with open(path, encoding='utf-8-sig') as handle:
        for content in csv.reader(handle):
            if echo_rows:
                print(content)
            content = list(map(float, content))
            if len(content) != 0:
                values.append(float(content[0]))
    return values


def MIC(name):
    """Compute the MIC between the first columns of two uploaded CSV files.

    Requires minepy (``pip install minepy``). The files are looked up with
    ``db.searchFile(name)`` and read from the directory ``name``. The result
    is written to ``<name>/MIC_result.csv``.

    :param name: key passed to ``db.searchFile`` and directory of the files.
    :return: JSON string ``{"route": ...}``: ``"nofile"`` when the lookup
        reports no file, a message when fewer than two files exist, otherwise
        ``"MIC_result.csv"``.
    """
    fileN = db.searchFile(name)
    isfile = fileN[0]['filename']
    if isfile == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    if len(fileN) < 2:
        # Message (Chinese): two files must be uploaded for the MIC computation.
        ret = {"route": '需要上传两个文件以进行MIC计算'}
        return json.dumps(ret)

    file1 = fileN[0]['filename']
    file2 = fileN[1]['filename']

    # "with" inside the helper closes each file even if a row fails to parse.
    x = _read_first_column(name + '/' + file1, echo_rows=True)
    print('x=', x)
    y = _read_first_column(name + '/' + file2)
    print('y=', y)

    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(x, y)
    mic_value = mine.mic()
    print("MIC", mic_value)

    # Write the MIC value to the result file.
    with open(name + '/' + 'MIC_result.csv', 'w', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        csv_writer.writerow(["MIC result"])
        csv_writer.writerow([str(mic_value)])

    ret = {"route": 'MIC_result.csv'}
    return json.dumps(ret)
def mic(x, y):
    """Return the MIC of ``x`` and ``y`` paired with the constant ``0.5``.

    :param x: first variable passed to ``MINE.compute_score``.
    :param y: second variable passed to ``MINE.compute_score``.
    :return: tuple ``(mic, 0.5)``; the second element is always ``0.5``.
    """
    estimator = MINE()
    estimator.compute_score(x, y)
    score = estimator.mic()
    return (score, 0.5)
def perform_mic_2p(p1_sequences, p2_sequences, p1, p2, cutoff=0.5):
    """Score every position of ``p1`` against every position of ``p2`` and write results.

    Both sequence sets are converted to matrices and transposed so that each
    entry holds one position across all sequences. Each cross pair is scored
    with ``MINE(alpha=0.6, c=15)``; pairs strictly above ``cutoff`` are
    recorded with 1-based position labels and handed to ``write_mics_to_csv``.

    :param p1_sequences: iterable of sequences for the first set.
    :param p2_sequences: iterable of sequences for the second set.
    :param p1: label of the first set (prefix of ``x``).
    :param p2: label of the second set (prefix of ``y``).
    :param cutoff: threshold, converted with ``float``.
    :return: list of dicts with keys ``x``, ``y``, ``p1``, ``p2``, ``weight``
        (``weight`` is the MIC formatted with three decimals).
    """
    mic_scores = []
    columns_p1 = transpose(array([list(sequence) for sequence in p1_sequences])).tolist()
    columns_p2 = transpose(array([list(sequence) for sequence in p2_sequences])).tolist()

    for position1, column1 in enumerate(columns_p1, 1):
        for position2, column2 in enumerate(columns_p2, 1):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(column1, column2)
            score = estimator.mic()
            if score > float(cutoff):
                mic_scores.append({
                    'x': p1 + '_' + str(position1),
                    'y': p2 + '_' + str(position2),
                    'p1': p1,
                    'p2': p2,
                    'weight': format(score, '.3f'),
                })

    write_mics_to_csv(mics=mic_scores, p1=p1, p2=p2, cutoff=cutoff)
    return mic_scores
def execute(self, symbol):
    """Find correlated attribute pairs of ``symbol`` using MIC.

    Every unordered pair of attribute value lists returned by
    ``self._generateAttributeValuesForSymbol`` is scored with
    ``MINE(alpha=0.6, c=15)``. Pairs whose MIC (rounded to two decimals)
    exceeds ``self.minMic`` and that do not relate a single field to itself
    are reported together with their Pearson coefficient.

    :param symbol: the symbol in which we are looking for correlations
    :type symbol: :class:`netzob.Common.Models.Vocabulary.AbstractField.AbstractField`
    :return: list of dicts with keys ``id``, ``relation_type``, ``x_fields``,
        ``x_attribute``, ``y_fields``, ``y_attribute``, ``mic``, ``pearson``.
    """
    (attributeValues_headers, attributeValues) = self._generateAttributeValuesForSymbol(symbol)
    symbolResults = []

    # MINE computation of each field's combination
    for i, values_x in enumerate(attributeValues[:-1]):
        # start=i + 1 keeps j an absolute index, so it addresses the header
        # that belongs to values_y rather than an offset into the slice.
        for j, values_y in enumerate(attributeValues[i + 1:], start=i + 1):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(numpy.array(values_x), numpy.array(values_y))
            mic = round(mine.mic(), 2)
            if mic > float(self.minMic):
                # We add the relation to the results
                (x_fields, x_attribute) = attributeValues_headers[i]
                (y_fields, y_attribute) = attributeValues_headers[j]
                # The relation should not apply on the same field
                if len(x_fields) == 1 and len(y_fields) == 1 and x_fields[0].id == y_fields[0].id:
                    continue
                pearson = numpy.corrcoef(values_x, values_y)[0, 1]
                if not numpy.isnan(pearson):
                    pearson = round(pearson, 2)
                relation_type = self._findRelationType(x_attribute, y_attribute)
                self._debug_mine_stats(mine)
                self._logger.debug(
                    "Correlation found between '"
                    + str(x_fields)
                    + ":"
                    + x_attribute
                    + "' and '"
                    + str(y_fields)
                    + ":"
                    + y_attribute
                    + "'"
                )
                self._logger.debug("  MIC score: " + str(mic))
                self._logger.debug("  Pearson score: " + str(pearson))
                id_relation = str(uuid.uuid4())
                symbolResults.append(
                    {
                        "id": id_relation,
                        "relation_type": relation_type,
                        "x_fields": x_fields,
                        "x_attribute": x_attribute,
                        "y_fields": y_fields,
                        "y_attribute": y_attribute,
                        "mic": mic,
                        "pearson": pearson,
                    }
                )
    return symbolResults
def perform_mic_2p(p1_sequences, p2_sequences, p1, p2, cutoff=0.5):
    """Score every position of ``p1`` against every position of ``p2`` with MIC.

    Both sequence sets are converted to matrices and transposed so that each
    entry holds one position across all sequences. Each cross pair is scored
    with ``MINE(alpha=0.6, c=15)``; pairs strictly above ``cutoff`` are
    returned with 1-based position labels.

    :param p1_sequences: iterable of sequences for the first set.
    :param p2_sequences: iterable of sequences for the second set.
    :param p1: label of the first set (prefix of ``x``).
    :param p2: label of the second set (prefix of ``y``).
    :param cutoff: threshold, converted with ``float``.
    :return: list of dicts with keys ``x``, ``y``, ``p1``, ``p2``, ``weight``
        (``weight`` is the unformatted MIC value).
    """
    found = []
    first_positions = transpose(array([list(seq) for seq in p1_sequences])).tolist()
    second_positions = transpose(array([list(seq) for seq in p2_sequences])).tolist()

    for number1, values1 in enumerate(first_positions, 1):
        for number2, values2 in enumerate(second_positions, 1):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(values1, values2)
            if not estimator.mic() > float(cutoff):
                continue
            entry = {
                'x': p1 + '_' + str(number1),
                'y': p2 + '_' + str(number2),
                'p1': p1,
                'p2': p2,
            }
            entry['weight'] = estimator.mic()
            found.append(entry)
    return found
def mine_features(data,features):
    """Print every pair of ``features`` whose MIC exceeds 0.10.

    Each unordered pair of feature keys is scored once with ``MINE()`` on the
    ``.values`` of the corresponding ``data`` entries. The ``features``
    argument is no longer modified, and no pair is skipped (removing items
    from the list while iterating over it skipped every other feature).

    :param data: mapping/DataFrame indexed by feature key.
    :param features: iterable of feature keys to compare pairwise.
    """
    print('...')
    candidates = list(features)  # private copy: never mutate the caller's list
    for position, X_hat_idx in enumerate(candidates):
        X_hat = data[X_hat_idx].values
        for xi_idx in candidates[position + 1:]:
            m = MINE()
            xi = data[xi_idx].values
            m.compute_score(X_hat, xi)
            I_X_hat_xi = m.mic()
            if I_X_hat_xi > 0.10:
                print('I({X_hat_idx},{xi_idx}): {I_X_hat_xi}'.format(
                    X_hat_idx=X_hat_idx, xi_idx=xi_idx, I_X_hat_xi=I_X_hat_xi))
def calcMICReg(df,target,col):
    """Return ``{col: MIC}`` between column ``col`` of ``df`` and ``target``.

    A column whose dtype name is ``"category"`` is first replaced, value by
    value, with the mean of ``df['_target_variable_']`` within that category;
    any other column is used as is.

    :param df: DataFrame holding ``col`` (and ``'_target_variable_'`` when
        ``col`` is categorical).
    :param target: vector scored against the column.
    :param col: name of the column to score.
    :return: single-entry dict mapping ``col`` to its MIC.
    """
    estimator = MINE()
    is_categorical = df[col].dtype.name == "category"
    if is_categorical:
        category_means = df.groupby(by=[col])['_target_variable_'].mean().to_dict()
        column_values = [category_means[level] for level in df[col].values]
    else:
        column_values = df[col].values
    estimator.compute_score(column_values, target)
    return {col: estimator.mic()}
def mysubplot(x, y, numRows, numCols, plotNum, xlim=(-4, 4), ylim=(-4, 4)):
    """Draw ``x`` vs ``y`` as pixel markers, titled with Pearson r and MIC.

    Both statistics are rounded to one decimal. The subplot has no frame, no
    visible axes and no ticks.

    :param x: horizontal values.
    :param y: vertical values.
    :param numRows: number of subplot rows.
    :param numCols: number of subplot columns.
    :param plotNum: index of the subplot to draw into.
    :param xlim: horizontal limits passed to ``plt.subplot``.
    :param ylim: vertical limits passed to ``plt.subplot``.
    :return: the matplotlib axes that were drawn on.
    """
    pearson_r = np.around(np.corrcoef(x, y)[0, 1], 1)

    estimator = MINE(alpha=0.6, c=15)
    estimator.compute_score(x, y)
    mic_value = np.around(estimator.mic(), 1)

    axes = plt.subplot(numRows, numCols, plotNum, xlim=xlim, ylim=ylim)
    axes.set_title('Pearson r=%.1f\nMIC=%.1f' % (pearson_r, mic_value),fontsize=10)
    axes.set_frame_on(False)
    for axis in (axes.axes.get_xaxis(), axes.axes.get_yaxis()):
        axis.set_visible(False)
    axes.plot(x, y, ',')
    axes.set_xticks([])
    axes.set_yticks([])
    return axes
def select_feature(self, data, label, threshold=0.7):
    """Select the columns of ``data`` whose MIC with ``label`` exceeds ``threshold``.

    Feature selection by maximal information coefficient, which can capture
    both linear and non-linear relationships. Progress is printed per column
    (the counter is zero-based).

    :param data: DataFrame; iterated by column name.
    :param label: target vector scored against each column.
    :param threshold: a column is kept when its MIC is strictly greater.
    :return: list of the selected column names, in column order.
    """
    selected = []
    from minepy import MINE
    mine = MINE()
    for i, col in enumerate(data):
        # Single pre-formatted argument: valid under Python 2 and Python 3.
        print('feature selection: %d/%d %s' % (i, data.shape[1], col))
        mine.compute_score(data[col], label)
        if mine.mic() > threshold:
            selected.append(col)
    print('%d out of %d features were selected' % (len(selected), data.shape[1]))
    return selected
def get_corrcoef(X):
    """Write MIC similarities between feature 0 and features 1..19 to a CSV.

    A 5% sample of the rows of ``X`` is taken from the test part of a single
    ``ShuffleSplit`` (``random_state=0``). The sample is transposed so each
    row is one feature. Only the pairs (0, 1) .. (0, 19) are computed; every
    other cell of the saved matrix keeps its initial value of 1. The matrix is
    saved as ``feat_sim_pcc_2.csv`` under ``CODE_PATH`` and timings are printed.

    :param X: 2-D array of samples x features.
    """
    splitter = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.05, random_state=0)
    # n_iter=1: only the first (and only) split is used.
    _, sample_rows = next(iter(splitter))
    X = X[np.array(sample_rows)].transpose()

    feature_count = X.shape[0]
    pcc = np.ones((feature_count, feature_count))
    estimator = MINE()
    started = time()

    anchor = 0
    for other in range(1, 20):
        estimator.compute_score(X[anchor], X[other])
        similarity = estimator.mic()
        pcc[other, anchor] = similarity
        pcc[anchor, other] = similarity
        print(anchor, other, pcc[anchor, other], time()-started)

    output_path = os.path.join(CODE_PATH, 'feat_sim_pcc_2.csv')
    np.savetxt(output_path, pcc, fmt='%.3f', delimiter=',')
    print('Done with computing PCC,', 'using', time()-started, 's')
def mutual_information(self, X, Y, title=None, nbins_X=50, nbins_Y=50, noise_sigma='all'):
    """Print MIC, binned mutual information and linear-fit statistics for ``X``, ``Y``.

    Positions where either input is NaN are excluded from all three
    estimators (MINE previously received the unfiltered arrays). The binned
    mutual information comes from a ``pyentropy.DiscreteSystem`` built on the
    quantised values; slope, r and p come from ``scipy.stats.linregress``.

    :param X: array of the first variable.
    :param Y: array of the second variable, same shape as ``X``.
    :param title: optional heading printed before the statistics line.
    :param nbins_X: number of quantisation bins for ``X``.
    :param nbins_Y: number of quantisation bins for ``Y``.
    :param noise_sigma: label inserted in the printed line.
    """
    no_nans_idx = np.logical_not(np.logical_or(np.isnan(X), np.isnan(Y)))
    # Boolean-mask indexing yields 1-D arrays holding only the valid pairs.
    X_valid = X[no_nans_idx]
    Y_valid = Y[no_nans_idx]

    Xq, _, _ = pyentropy.quantise(X_valid, nbins_X)
    Yq, _, _ = pyentropy.quantise(Y_valid, nbins_Y)
    s = pyentropy.DiscreteSystem(Yq, (1, nbins_Y), Xq, (1, nbins_X))
    s.calculate_entropies()

    # MINE
    mine = MINE()
    mine.compute_score(X_valid.flatten(), Y_valid.flatten())

    # Linear regression
    slope, intercept, r, p, stderr = \
        scipy.stats.linregress(X_valid, Y_valid)

    if title is not None:
        print(title)
    print(" MIC/MI/r^2/p/slope for %s:\t%.3f\t%.3f\t%s\t%s\t%s" %
          (noise_sigma, mine.mic(), s.I(), r**2, p, slope))
# Pearson correlation of x with a much noisier copy of itself.
print "Higher noise", pearsonr(x, x + np.random.normal(0, 10, size))
# Obvious drawback: as a feature-ranking mechanism, Pearson correlation is
# only sensitive to linear relationships. Even when two variables have a
# one-to-one correspondence, the Pearson correlation can be close to 0.
a = np.random.uniform(-1, 1, 100000) #uniform(low,high,size) random numbers
print pearsonr(a, a**2)[0]
#1.2 Mutual information and maximal information coefficient (MIC), range [0,1]
# Mutual information is inconvenient to use directly for feature selection.
# The maximal information coefficient first searches for an optimal
# discretisation and then converts the mutual information value into a
# measure whose range is [0,1]. minepy provides the MIC functionality.
from minepy import MINE
#
m = MINE()
x = np.random.uniform(-1, 1, 10000)
# Score x against x**2, the same relationship Pearson was applied to above.
m.compute_score(x, x**2)
print m.mic()
#1.3 Distance correlation, range [0,1]
# Distance correlation was created to overcome the weakness of the Pearson
# correlation coefficient. In the x and x^2 example, even if the Pearson
# coefficient is 0 we cannot conclude that the two variables are independent
# (they may be non-linearly related); but if the distance correlation is 0,
# we can say that the two variables are independent.
import numpy as np
def dist(x, y):
    # Pairwise absolute differences between the elements of x and y.
    #1d only
    return np.abs(x[:, None] - y)
def d_n(x):
    # Pairwise distances of x with themselves, then centred by subtracting
    # the axis-0 and axis-1 means and adding back the overall mean.
    # NOTE(review): no return statement is visible in this excerpt.
    d = dist(x, x)
    dn = d - d.mean(0) - d.mean(1)[:,None] + d.mean()
# Per-run statistics collected under the null and alternative hypotheses.
mic_approx_null, mic_e_null, tic_e_null, r2_null = [], [], [], []
mic_approx_alt, mic_e_alt, tic_e_alt, r2_alt = [], [], [], []
# null hypothesis
for k in range(1, n_null+1):
    x = np.random.rand(n)
    r = np.random.randn(n)
    # presumably f() builds y from the x and r just drawn -- TODO confirm
    y = f()
    # resimulate x for the null scenario
    x = np.random.rand(n)
    mine_approx.compute_score(x, y)
    mine_e.compute_score(x, y)
    mic_approx_null.append(mine_approx.mic())
    mic_e_null.append(mine_e.mic())
    tic_e_null.append(mine_e.tic())
    # squared off-diagonal entry of the correlation matrix
    r2_null.append(np.corrcoef(x, y)[0][1]**2)
# alternative hypothesis
for k in range(1, n_alt+1):
    x = np.random.rand(n)
    r = np.random.randn(n)
    # here x is NOT redrawn after f(), so x and y stay paired
    y = f()
    mine_approx.compute_score(x, y)
    mine_e.compute_score(x, y)
    mic_approx_alt.append(mine_approx.mic())
    mic_e_alt.append(mine_e.mic())
def train_and_analyse(_X, _y, sno, ino):
    """Rank the time-related features of one store/item and pick the useful ones.

    Three rankings are computed for a fixed list of time features:
    F-regression ("F_regr") and MIC ("MIC") on ``ut.get_processed_X_A`` of
    the feature matrix, and the cross-validated r2 of a single-feature random
    forest ("RF") on the raw feature matrix. Each score list goes through
    ``rank_to_dict``. A feature is selected when RF > 0.1, MIC >= 0.1 or
    F_regr >= 0.1.

    Side effects: prints the rank table and the selection, pickles the table
    to ``Analyse/store_<sno>/item_<ino>_(pair_analyse)`` and saves a stacked
    bar chart next to it with a ``.png`` suffix.

    :param _X: DataFrame containing (at least) the time feature columns.
    :param _y: regression target.
    :param sno: store number used in the output directory name.
    :param ino: item number used in the output file name.
    :return: ``(ranks, selection_feature)`` -- the rank DataFrame and the
        list of selected feature names (order not defined, built from a set).
    """
    X = _X
    Y = _y

    time_feature_1 = ['date2j']
    time_feature_2 = ['day', 'month', 'year']
    time_feature_3 = [
        'is_2012', 'is_2013', 'is_2014',
        'fall', 'winter', 'spring', 'summer',
    ]
    time_feature_4 = [
        'weekday', 'is_weekend', 'is_holiday',
        'is_holiday_weekday', 'is_holiday_weekend',
    ]
    time_feature_5 = [
        'MemorialDay', 'MothersDay', 'BlackFridayM3', 'BlackFriday1',
        'NewYearsDay', 'IndependenceDay', 'VeteransDay', 'BlackFriday2',
        'NewYearsEve', 'BlackFriday3', 'ChristmasDay', 'BlackFridayM2',
        'ThanksgivingDay', 'Halloween', 'EasterSunday', 'ChristmasEve',
        'ValentinesDay', 'PresidentsDay', 'ColumbusDay',
        'MartinLutherKingDay', 'LaborDay', 'FathersDay', 'BlackFriday',
    ]
    temp = (time_feature_1 + time_feature_2 + time_feature_3
            + time_feature_4 + time_feature_5)
    X_f1 = X[temp].values

    # Preprocess once instead of on every loop iteration.
    # NOTE(review): assumes ut.get_processed_X_A is deterministic -- confirm.
    X_processed = ut.get_processed_X_A(X_f1)

    ranks = {}

    f, pval = f_regression(X_processed, Y, center=True)
    ranks["F_regr"] = pd.Series(rank_to_dict(np.nan_to_num(f), temp))

    mine = MINE()
    mic_scores = []
    for i in range(X_processed.shape[1]):
        mine.compute_score(X_processed[:, i], Y)
        mic_scores.append(mine.mic())
    ranks["MIC"] = pd.Series(rank_to_dict(mic_scores, temp))

    # Univariate predictive power: r2 of a random forest on one feature.
    rf = ut.get_regression_model('RandomForest', 0)
    scores = []
    for i in range(X_f1.shape[1]):
        score = cross_val_score(rf, X_f1[:, i:i+1].astype(float), Y, scoring="r2",
                                cv=ShuffleSplit(len(X_f1), 3, .3), n_jobs=2)
        scores.append(round(np.mean(score), 3))
    ranks['RF'] = pd.Series(rank_to_dict(np.abs(scores), temp))

    ranks = pd.DataFrame(ranks)
    print(ranks)

    selection_feature = []
    selection_feature.extend(ranks[ranks.RF > 0.1].index.values.tolist())
    selection_feature.extend(ranks[ranks.MIC >= 0.1].index.values.tolist())
    selection_feature.extend(ranks[ranks.F_regr >= 0.1].index.values.tolist())
    selection_feature = list(set(selection_feature))  # drop duplicates
    print(selection_feature)

    path = 'Analyse/store_{}/'.format(sno)
    mkdir_p(path)
    path += 'item_{}_(pair_analyse)'.format(ino)
    ranks.to_pickle(path)
    path += '.png'

    # pandas opens its own figure when no axes are given, so the size must be
    # passed to the plot call; a separate plt.figure(figsize=...) is never
    # drawn on and stays open. Save and close exactly the figure drawn on.
    ax = ranks.plot.barh(stacked=True, figsize=(16, 26))
    ax.figure.savefig(path, bbox_inches='tight', dpi=300)
    plt.close(ax.figure)

    return ranks, selection_feature
def mic(x, y):
    """Return ``(MIC(x, y), 0.5)``; the second element is a fixed ``0.5``."""
    scorer = MINE()
    scorer.compute_score(x, y)
    result = (scorer.mic(), 0.5)
    return result
def interactionV(self, data):
    """Print the MIC between ``data`` and its element-wise square.

    The second operand is derived from the ``data`` argument; the block
    previously referenced a name ``x`` that it never defined.

    :param data: numeric array supporting ``** 2``.
    """
    from minepy import MINE
    m = MINE()
    m.compute_score(data, data**2)
    print(m.mic())
# mine.compute_score(X_Standard_T[i], X_Standard_T[10]) # mics.append(mine.mic()) # print i, mine.mic() # # for i in range(0,38): # # mine.compute_score(Xi_Standard_T[i], Xi_Standard_T[38]) # # mics.append(mine.mic()) # # print i, mine.mic() # for i in range(0,7): # mine.compute_score(Xi_Standard_T[i], Xi_Standard_T[7]) # mics.append(mine.mic()) # print i, mine.mic() # for i in range(48): mine.compute_score(X_ALL_Standard_T[i], X_ALL_Standard_T[48]) mics.append(mine.mic()) names = [] for c in allDF.columns.values: names.append(c) map = {} for i in range(48): map[names[i]] = mics[i] import operator sorted_tuple = sorted(map.items(), key=operator.itemgetter(1)) vs = [] ks = [] for k,v in sorted_tuple: ks.append(k); vs.append(v)
class TestFunctions(unittest.TestCase):
    """Check MINE statistics on noiseless constant, linear, sine and exponential data."""

    def setUp(self):
        # One estimator per test, configured the same way for all of them.
        self.mine = MINE(alpha=0.6, c=15)

    def build_const(self, n):
        """Return ``n`` points on [0, 1] paired with a constant zero response."""
        x = np.linspace(0, 1, n)
        y = np.zeros(n)
        return x, y

    def build_linear(self, n):
        """Return ``n`` points on [0, 1] paired with themselves (y = x)."""
        x = np.linspace(0, 1, n)
        return x, x

    def build_sine(self, n):
        """Return ``n`` points on [0, 1] paired with sin(8*pi*x)."""
        x = np.linspace(0, 1, n)
        return x, np.sin(8*np.pi*x)

    def build_exp(self, n):
        """Return ``n`` points on [0, 10] paired with 2**x."""
        x = np.linspace(0, 10, n)
        return x, 2**x

    def test_const(self):
        x, y = self.build_const(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 0., 4)
        assert_almost_equal(self.mine.mas(), 0., 4)
        assert_almost_equal(self.mine.mev(), 0., 4)
        assert_almost_equal(self.mine.mcn(), 2., 4)
        assert_almost_equal(self.mine.mcn_general(), 2., 4)

    # The class used to define test_linear twice with identical bodies; the
    # second definition silently replaced the first, so only one is kept.
    def test_linear(self):
        x, y = self.build_linear(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 1., 4)
        assert_almost_equal(self.mine.mas(), 0., 4)
        assert_almost_equal(self.mine.mev(), 1., 4)
        assert_almost_equal(self.mine.mcn(), 2., 4)
        assert_almost_equal(self.mine.mcn_general(), 2., 4)

    def test_sine(self):
        x, y = self.build_sine(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 1., 4)
        assert_almost_equal(self.mine.mas(), 0.875, 3)
        assert_almost_equal(self.mine.mev(), 1., 4)
        assert_almost_equal(self.mine.mcn(), 4., 4)
        assert_almost_equal(self.mine.mcn_general(), 4., 4)

    def test_exp(self):
        x, y = self.build_exp(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 1., 4)
        assert_almost_equal(self.mine.mas(), 0., 4)
        assert_almost_equal(self.mine.mev(), 1., 4)
        assert_almost_equal(self.mine.mcn(), 2., 4)
        assert_almost_equal(self.mine.mcn_general(), 2., 4)
Mdim=len(f1)
Mat=np.zeros((Mdim, Mdim))

# =============================================================================
#
#                   Compute MIC, PCC, KTau, NMIS Algorithm
#                       & Generate Correlation Matrix
#
# =============================================================================

print('Computing mutual information indices and generating correlation matrix...')
matrix_path = output_dir+'/'+'CorrMatrix_'+mla+'_'+str(Mdim)+'_'+str(GPS)+'_'+nfilename+'.txt'
for i in range(Mdim):
    for j in range(Mdim):
        # Cell (i, j) compares f1[i] with f2[j] using the measure named by
        # mla; an unrecognised mla leaves the cell at its initial 0.
        if mla == 'MIC':
            mine.compute_score(f1[i],f2[j])
            Mat[i][j] = mine.mic()
        elif mla == 'PCC':
            Mat[i][j] = pearsonr(f1[i],f2[j])[0]
        elif mla == 'KTau':
            Mat[i][j] = kendalltau(f1[i], f2[j])[0]
        elif mla == 'NMIS':
            Mat[i][j] = normalized_mutual_info_score(f1[i],f2[j])
        sys.stdout.write(".")  # one progress dot per cell
    # Append the finished row in one go (space-separated, newline-terminated)
    # instead of reopening the file for every cell; "with" guarantees the
    # handle is closed even if a write fails.
    with open(matrix_path, 'a') as g:
        g.write(' '.join(str(value) for value in Mat[i]))
        g.write('\n')
def doMICAnalysisOfInputVariables(inArr, targetArr,targetName, mic_score_threshold,input_indexes_uncorrelated_features,targetQualityMap = None):
    """Keep the input features whose MIC with the target reaches the threshold.

    Only feature indexes listed in ``input_indexes_uncorrelated_features``
    are considered. Each considered column is standardised with
    ``preprocessing.scale`` and scored against ``targetArr`` with
    ``MINE(alpha=0.6, c=15)``; statistics are reported through
    ``print_stats``. When at least one feature is selected, ``targetName`` is
    flagged in the global ``goodTargetMap``.

    :param inArr: 2-D array indexed as ``inArr[:, featureIndex]``.
    :param targetArr: target vector.
    :param targetName: name of the target (for reporting and goodTargetMap).
    :param mic_score_threshold: minimum MIC for a feature to be selected.
    :param input_indexes_uncorrelated_features: feature indexes to consider.
    :param targetQualityMap: optional list; the MIC of every considered
        feature is appended to it.
    :return: ``(selected_inArr, selected_inArr_indexes,
        selected_originalColumn_indexes)`` -- the selected columns as a 2-D
        array (one column per feature), their positions among the considered
        features, and their original column indexes.
    :raises SystemExit: when ``inArr`` has no second dimension.
    """
    goodTargetMap = getGlobalObject("goodTargetMap")
    selected_inArr = []
    selected_inArr_indexes = []
    selected_originalColumn_indexes = []

    try:
        numOfFeatures = inArr.shape[1]
    except (AttributeError, IndexError, TypeError):
        # Only "not a 2-D array" is handled here; anything else propagates.
        print("ERROR: \n%s" % (inArr,))
        # NOTE(review): exit status 0 is kept from the original; a non-zero
        # status would be more conventional for an error -- confirm.
        raise SystemExit(0)

    # Membership is tested once per feature: use a set for O(1) lookups.
    uncorrelated = set(input_indexes_uncorrelated_features)

    # k counts the features that were actually considered.
    # NOTE(review): the increment is placed at loop level (once per considered
    # feature); the original layout was ambiguous -- confirm.
    k = 0
    for featureIndex in range(numOfFeatures):
        # we will choose only uncorrelated features as input
        if featureIndex not in uncorrelated:
            continue
        x = inArr[:, featureIndex]
        x_scaled = preprocessing.scale(x)
        mine = MINE(alpha=0.6, c=15)
        mine.compute_score(x_scaled, targetArr)
        inputFeatureName = getInputParameterNameFromFeatureIndex(featureIndex)
        print_stats(mine, inputFeatureName, targetName, mic_score_threshold)

        mic_value = float(mine.mic())
        if targetQualityMap is not None:
            targetQualityMap.append(mic_value)

        if mic_value >= mic_score_threshold:
            selected_inArr.append(x)  # keep the (unscaled) input data column
            selected_inArr_indexes.append(k)
            colIdx = getColumnIndexFromFeatureIndex(featureIndex)
            selected_originalColumn_indexes.append(colIdx)
            # For anomaly detection only targets flagged here are used.
            goodTargetMap[targetName] = True
            # Single pre-formatted argument: same text on Python 2 and 3.
            print("----------------- selected:  %s %s %s" % (inputFeatureName, colIdx, k))
        k = k + 1

    selected_inArr = np.array(selected_inArr).transpose()
    return selected_inArr, selected_inArr_indexes, selected_originalColumn_indexes
#RandomForestRegressor rf = RandomForestRegressor() rf.fit(X,Y) ranks["RF"] = rank_to_dict(rf.feature_importances_, names) #f_regression f, pval = f_regression(X, Y, center=True) ranks["Corr."] = rank_to_dict(f, names) #MINE mine = MINE() mic_scores = [] for i in range(X.shape[1]): mine.compute_score(X[:,i], Y) m = mine.mic() mic_scores.append(m) ranks["MIC"] = rank_to_dict(mic_scores, names) #----statistics--out--------- r = {} for name in names: r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2) methods = sorted(ranks.keys()) ranks["Mean"] = r methods.append("Mean") print "\t%s" % "\t".join(methods)
def mic(x, y):
    """Print both inputs and return ``(MIC(x, y), 0.5)``.

    :param x: first variable passed to ``MINE.compute_score``.
    :param y: second variable passed to ``MINE.compute_score``.
    :return: tuple ``(mic, 0.5)``; the second element is always ``0.5``.
    """
    m = MINE()
    # Parenthesised single-argument form prints the same text on Python 2
    # and is valid syntax on Python 3.
    print(x)
    print(y)
    m.compute_score(x, y)
    return (m.mic(), 0.5)
def train_and_analyse(_X, _y, features):
    """Rank ``features`` with eight methods and select those with a high mean rank.

    Methods: linear regression, RidgeCV, LassoCV, randomized lasso
    ("Stability", using the alpha found by LassoCV), RFE, random forest
    importances, F-regression ("Corr.") and MIC. Every score list goes
    through ``rank_to_dict``; "Mean" is the per-feature mean over the eight
    methods rounded to two decimals.

    :param _X: 2-D feature array indexed as ``_X[:, column]``.
    :param _y: regression target.
    :param features: feature names, one per column of ``_X``.
    :return: ``(ranks, selection_feature)`` -- the rank DataFrame and the
        array of feature names whose "Mean" is greater than 0.12.
    """
    folds = cross_validation.KFold(_X.shape[0], n_folds=10, shuffle=True, random_state=1)
    ranks = {}

    linear = LinearRegression(normalize=True)
    linear.fit(_X, _y)
    ranks["Linear reg"] = rank_to_dict(np.abs(linear.coef_), features)

    ridge_cv = RidgeCV(cv=folds)
    ridge_cv.fit(_X, _y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge_cv.coef_), features)

    lasso_cv = LassoCV(cv=folds, n_jobs=2, normalize=True, tol=0.0001, max_iter=170000)
    lasso_cv.fit(_X, _y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso_cv.coef_), features)

    # Stability selection reuses the regularisation strength chosen above.
    stability = RandomizedLasso(alpha=lasso_cv.alpha_, random_state=42)
    stability.fit(_X, _y)
    ranks["Stability"] = rank_to_dict(np.abs(stability.scores_), features)

    eliminator = RFE(linear, n_features_to_select=1)
    eliminator.fit(_X, _y)
    ranks["RFE"] = rank_to_dict(np.array(eliminator.ranking_).astype(float), features, order=-1)

    forest = RandomForestRegressor(n_estimators=500)
    forest.fit(_X, _y)
    ranks["RF"] = rank_to_dict(forest.feature_importances_, features)

    f_statistic, _ = f_regression(_X, _y, center=True)
    ranks["Corr."] = rank_to_dict(np.nan_to_num(f_statistic), features)

    estimator = MINE()

    def _column_mic(column):
        estimator.compute_score(_X[:, column], _y)
        return estimator.mic()

    ranks["MIC"] = rank_to_dict(
        [_column_mic(column) for column in range(_X.shape[1])], features)

    # The mean is taken over the eight method rankings collected above.
    method_names = list(ranks.keys())
    ranks["Mean"] = {
        name: round(np.mean([ranks[method][name] for method in method_names]), 2)
        for name in features
    }

    ranks = pd.DataFrame(ranks)
    selection_feature = ranks[ranks.Mean > 0.12].index.values
    return ranks, selection_feature
# Load the `score` column of eight prediction files into `res`, in this order.
res.append(pd.read_csv("./avg_xgbs_discret_feature_5.csv").score.values)
res.append(pd.read_csv("./R_7199.csv").score.values)
res.append(pd.read_csv("./rank_feature_xgb_ensemble.csv").score.values)
res.append(pd.read_csv("./avg_xgbs_discret_feature_10.csv").score.values)
res.append(pd.read_csv("./based_on_select_rank_feature.csv").score.values)
res.append(pd.read_csv("./xgb717.csv").score.values)
res.append(pd.read_csv("./725.csv").score.values)
res.append(pd.read_csv("./svm6938.csv").score.values)
# 8x8 matrix (list of rows) of pairwise MIC between the first eight entries
# of `res`; both (i, j) and (j, i) are computed, including the diagonal.
cm = []
for i in range(8):
    tmp = []
    for j in range(8):
        m = MINE()
        m.compute_score(res[i], res[j])
        tmp.append(m.mic())
    cm.append(tmp)
import numpy as np
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    # Render `cm` as an image with a colour bar, titled `title`. Both axes
    # get eight ticks labelled from `fs`, which is read from the enclosing
    # scope (not defined in this excerpt); x labels are rotated 45 degrees.
    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(8)
    plt.xticks(tick_marks, fs, rotation=45)
    plt.yticks(tick_marks, fs)
    plt.tight_layout()
# For every generator function f in ff, collect statistics under the null
# and the alternative hypothesis.
for j, f in enumerate(ff):
    mic_null, gmic_null, r2_null = [], [], []
    mic_alt, gmic_alt, r2_alt = [], [], []
    # null hypothesis
    for k in range(1, n_null+1):
        # progress output; `i` comes from an enclosing scope not shown here
        print i, j, k
        x = np.random.rand(n)
        r = np.random.randn(n)
        # presumably f() builds y from the x and r just drawn -- TODO confirm
        y = f()
        # resimulate x for the null scenario
        x = np.random.rand(n)
        mine.compute_score(x, y)
        mic_null.append(mine.mic())
        gmic_null.append(mine.gmic(p=-1))
        # squared off-diagonal entry of the correlation matrix
        r2_null.append(np.corrcoef(x, y)[0][1]**2)
    # alternative hypothesis
    for k in range(1, n_alt+1):
        x = np.random.rand(n)
        r = np.random.randn(n)
        # here x is NOT redrawn after f(), so x and y stay paired
        y = f()
        mine.compute_score(x, y)
        mic_alt.append(mine.mic())
        gmic_alt.append(mine.gmic(p=-1))
        r2_alt.append(np.corrcoef(x, y)[0][1]**2)
    # 95th percentile of the null MIC values
    cut_mic = np.percentile(mic_null, 95)
def get_mic(self):
    """Return the MIC between ``self.x`` and ``self.y`` from a fresh ``MINE()``."""
    estimator = MINE()
    first, second = self.x, self.y
    estimator.compute_score(first, second)
    return estimator.mic()
def performMIC(transposed_list):
    """Return every pair of entries whose MIC is strictly greater than 0.6.

    Each unordered pair ``(a, b)`` with ``a < b`` is scored with
    ``MINE(alpha=0.6, c=15)``.

    :param transposed_list: indexable sequence of vectors to compare pairwise.
    :return: list of dicts with keys ``x`` and ``y`` (zero-based positions of
        the pair) and ``mic`` (the score).
    """
    size = len(transposed_list)
    strong_pairs = []
    first = 0
    while first < size - 1:
        for second in range(first + 1, size):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(transposed_list[first], transposed_list[second])
            score = estimator.mic()
            if score > 0.6:
                strong_pairs.append({'x': first, 'y': second, 'mic': score})
        first += 1
    return strong_pairs