Ejemplo n.º 1
0
    def mutual_information(self,
                           X,
                           Y,
                           title=None,
                           nbins_X=50,
                           nbins_Y=50,
                           noise_sigma='all'):
        """Print MIC, mutual information and linear-fit statistics for X vs. Y.

        Positions where either X or Y is NaN are dropped before any statistic
        is computed, so all printed numbers describe the same set of samples.

        Args:
            X, Y: numpy arrays; they are masked with a shared boolean index,
                so they are expected to have the same shape.
            title: optional heading printed before the statistics line.
            nbins_X, nbins_Y: bin counts handed to ``pyentropy.quantise``.
            noise_sigma: label interpolated into the printed line.

        Returns:
            None. The results are only printed.
        """
        no_nans_idx = np.logical_not(np.logical_or(np.isnan(X), np.isnan(Y)))
        # Boolean-mask indexing yields flat 1-D arrays of the valid samples.
        X_valid = X[no_nans_idx]
        Y_valid = Y[no_nans_idx]

        Xq, _, _ = pyentropy.quantise(X_valid, nbins_X)
        Yq, _, _ = pyentropy.quantise(Y_valid, nbins_Y)
        s = pyentropy.DiscreteSystem(Yq, (1, nbins_Y), Xq, (1, nbins_X))
        s.calculate_entropies()

        # MINE: score the NaN-free samples, consistent with the other
        # statistics (the unfiltered arrays used to be passed here).
        mine = MINE()
        mine.compute_score(X_valid, Y_valid)

        # Linear regression
        slope, intercept, r, p, stderr = \
                scipy.stats.linregress(X_valid, Y_valid)

        if title is not None:
            print(title)
        print(" MIC/MI/r^2/p/slope for %s:\t%.3f\t%.3f\t%s\t%s\t%s" %
              (noise_sigma, mine.mic(), s.I(), r**2, p, slope))
Ejemplo n.º 2
0
def mic_method(data_x, data_y, feat_labels, mic_threshold, is_split=1):
    """Select the ``mic_threshold`` best features with a chi-squared test.

    The per-feature MIC scores are printed for inspection only; the actual
    selection is done by ``SelectKBest(chi2, ...)``.

    Args:
        data_x: pandas DataFrame of features (``fillna``/``values`` are used).
        data_y: pandas object holding the target (``values`` is used).
        feat_labels: unused; kept for interface compatibility.
        mic_threshold: number of features to keep (passed as ``k``).
        is_split: when 1, only columns 30..453 are considered for selection.

    Returns:
        Tuple ``(selected_data_x, data_y)`` of numpy arrays.
    """
    # Fill missing values with 0: the scaler below requires NaN-free input.
    data_x = data_x.fillna(0).values
    # Min-max scale every column; the result is a plain ndarray.
    data_x = MinMaxScaler().fit_transform(data_x)
    data_y = data_y.values

    if is_split == 1:
        # Work on the middle slice only (presumably the columns around it
        # are one-hot encoded -- taken from the original comment).
        data_x_mid = data_x[:, 30:454]
    else:
        data_x_mid = data_x

    # MINE.compute_score takes two 1-D samples, so score column by column
    # (the whole 2-D matrix used to be passed in a single call).
    m = MINE()
    target = np.ravel(data_y)
    mic_scores = []
    for column in data_x_mid.T:
        m.compute_score(column, target)
        mic_scores.append(m.mic())
    print(mic_scores)

    # Fit and transform once; the former extra ``fit`` result was discarded.
    selected_data_x = SelectKBest(chi2, k=mic_threshold).fit_transform(
        data_x_mid, data_y)
    return selected_data_x, data_y
Ejemplo n.º 3
0
def _evaluate_single(data, target_feature):
    """Return the MIC of ``target_feature`` against every column of ``data``.

    ``data`` is indexed as ``data[:, i]``; one score per column is returned
    as a list, in column order. A single MINE(alpha=0.4, c=15) estimator is
    reused for all columns.
    """
    estimator = MINE(alpha=0.4, c=15)

    def _score(column_index):
        estimator.compute_score(target_feature, data[:, column_index])
        return estimator.mic()

    return [_score(column_index) for column_index in range(data.shape[1])]
Ejemplo n.º 4
0
def find_best_n_features_mic(n=8, out_path=''):
    """Pick the ``n`` columns of the global ``x`` with the highest MIC to ``y``.

    Reads the module-level arrays ``x`` (2-D) and ``y`` (1-D), prints the
    chosen column positions and their scores, and writes the selected
    columns plus ``y`` as the last column to
    ``out_path + 'mic_best_<n>.csv'`` without header or index.
    """
    estimator = MINE(alpha=0.6, c=15, est="mic_approx")

    def _column_mic(col):
        estimator.compute_score(x[:, col], y)
        return estimator.mic()

    scores = [_column_mic(col) for col in range(x.shape[1])]

    # Repeatedly take the current maximum, then blank it out with NaN so the
    # next pass of nanargmax finds the runner-up.
    top_positions = []
    top_scores = []
    for _ in range(n):
        winner = np.nanargmax(scores)
        top_positions.append(winner)
        top_scores.append(scores[winner])
        scores[winner] = np.nan

    print('Found', n, 'features with largest MIC, whose positions are:')
    print(top_positions)
    print()
    print('The MIC of these features are:')
    print(top_scores)
    print()

    chosen = x[:, top_positions]
    print('Shape of features selected:', chosen.shape)
    label_column = y.reshape(len(y), 1)
    table = pd.DataFrame(np.concatenate([chosen, label_column], axis=1))

    destination = out_path + 'mic_best_' + str(n) + '.csv'
    table.to_csv(destination, header=None, index=None)
Ejemplo n.º 5
0
def chooseIndependantInputVariables(inArr):
    """Return the column indexes of ``inArr`` that do not duplicate an earlier column.

    Column ``i`` is compared with every earlier column ``j < i`` using the
    MIC of the standardised columns; it is rejected when any such MIC is
    >= 0.99. Every comparison is printed.

    Args:
        inArr: 2-D array indexed as ``inArr[:, i]``.

    Returns:
        List of selected column indexes, in ascending order.
    """
    selected_input_indexes = []
    for i in range(inArr.shape[1]):
        doSelect = True
        # Both depend only on i, so compute them once per outer iteration.
        inputFeatureName1 = getInputParameterNameFromFeatureIndex(i)
        x_scaled = preprocessing.scale(inArr[:, i])
        for j in range(i):
            inputFeatureName2 = getInputParameterNameFromFeatureIndex(j)
            y_scaled = preprocessing.scale(inArr[:, j])
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(x_scaled, y_scaled)
            mic_value = mine.mic()
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Correlation between  %s %s  is  %s"
                  % (inputFeatureName1, inputFeatureName2, mic_value))
            if float(mic_value) >= 0.99:
                doSelect = False
                print("\n ***** ==> will NOT select  %s  as it correlates with  %s \n"
                      % (inputFeatureName1, inputFeatureName2))
        if doSelect:
            selected_input_indexes.append(i)

    return selected_input_indexes
Ejemplo n.º 6
0
def get_correlation(dataset, target, features=None):
    """Compute the MIC between ``target`` and each requested column.

    Args:
        dataset: pandas DataFrame, or anything ``pd.DataFrame`` accepts.
        target: label of the target column in ``dataset``.
        features: iterable of column labels to score; when empty or omitted
            every column of ``dataset`` is used.

    Returns:
        Dict with two entries: ``'numerical'`` maps each float64/int64/bool
        column to its MIC, sorted in descending order; ``'object'`` maps
        every other column to NaN.

    Raises:
        ValueError: if ``target`` is None.
    """
    if target is None:
        raise ValueError('corr() need target value')
    if not isinstance(dataset, pd.DataFrame):
        dataset = pd.DataFrame(dataset)
    if not features:
        features = set(dataset.columns)
    numerical = {}
    text = {}
    num_types = (np.dtype('float64'), np.dtype('int64'), np.dtype('bool'))
    target_values = dataset[target]
    mine = MINE()
    for col in features:
        col_dtype = dataset.dtypes[col]
        if col_dtype in num_types:
            values = dataset[col]
            if col_dtype == np.dtype('bool'):
                # Cast a local copy only; the caller's DataFrame must not be
                # modified as a side effect of scoring.
                values = values.astype(int)
            mine.compute_score(values, target_values)
            numerical[col] = mine.mic()
        else:
            text[col] = np.nan
    return {
        'numerical':
        dict(sorted(numerical.items(), key=lambda d: d[1], reverse=True)),
        'object':
        dict(sorted(text.items(), key=lambda d: d[1], reverse=True))
    }
Ejemplo n.º 7
0
def get_mic(x, y):
    """Return ``(mic, r)``: the MIC and the Pearson r of ``x`` and ``y``.

    The MIC comes from MINE(alpha=0.4, c=15, est='mic_e'); r is the
    off-diagonal entry of ``np.corrcoef(x, y)``.
    """
    correlation_matrix = np.corrcoef(x, y)
    pearson_r = correlation_matrix[0, 1]

    estimator = MINE(alpha=0.4, c=15, est='mic_e')
    estimator.compute_score(x, y)
    return estimator.mic(), pearson_r
Ejemplo n.º 8
0
def calculate_mic(df, y):
    """Return a Series mapping each column of ``df`` to its MIC with ``y``.

    ``y.values`` is used as the second sample; one default-configured MINE
    estimator is reused for every column.
    """
    estimator = MINE()

    def _mic_for(column):
        estimator.compute_score(df.loc[:, column], y.values)
        return estimator.mic()

    return pd.Series({column: _mic_for(column) for column in df.columns})
Ejemplo n.º 9
0
def _evaluate_single(data, target_feature):
    """Return the MIC of ``target_feature`` against every column of ``data``.

    ``data`` is indexed as ``data[:, i]``; one score per column is returned
    as a list, in column order. A single MINE(alpha=0.3, c=15) estimator is
    reused for all columns.
    """
    estimator = MINE(alpha=0.3, c=15)

    def _score(column_index):
        estimator.compute_score(target_feature, data[:, column_index])
        return estimator.mic()

    return [_score(column_index) for column_index in range(data.shape[1])]
Ejemplo n.º 10
0
class MaximalInformationCorrelator(Correlator, ABC):
    """
    Lag derivation for metric time series based on maximal information-based
    nonparametric exploration (MINE).
    Reference: https://minepy.readthedocs.io/en/latest/python.html

    Subclasses implement ``_compute_correlation_internal`` to read the
    desired statistic from ``self.estimator`` after a score was computed.
    """

    def __init__(self, config: dict):
        """Validate ``alpha``/``c`` from ``config`` and build the estimator.

        Defaults: alpha=0.6, c=15, est='mic_approx'.
        Raises ValueError for alpha outside (0, 1] or [4, inf) and for c <= 0.
        """
        super().__init__(config)

        alpha = ErrorChecker.key_check_and_load('alpha', config, default=0.6)
        if alpha <= 0 or 1 < alpha < 4:
            raise ValueError('Alpha should be in the range (0, 1] or [4, inf)')

        c = ErrorChecker.key_check_and_load('c', config, default=15)
        if c <= 0:
            raise ValueError('c has to be greater than 0')

        est = ErrorChecker.key_check_and_load(
            'est', config, default='mic_approx')

        self.estimator = MINE(alpha=alpha, c=c, est=est)

    @abstractmethod
    def _compute_correlation_internal(self):
        """Return the statistic of interest from ``self.estimator``."""
        pass

    def _compute_correlation(self, metrics_vals_1: pd.Series,
                             metrics_vals_2: pd.Series, lag: int):
        """Score series 1 against series 2 shifted by ``lag`` (gaps filled with 0)."""
        lagged = metrics_vals_2.shift(lag).fillna(0)
        self.estimator.compute_score(metrics_vals_1, lagged)
        return self._compute_correlation_internal()
Ejemplo n.º 11
0
	def MIC_plot(self, x, y, numRows, numCols, plotNum, x_name, y_name, filename):
		"""Draw a scatter subplot of x vs. y titled with their Pearson r and MIC.

		A new figure is created on every call and the subplot at position
		``plotNum`` of a ``numRows`` x ``numCols`` grid is drawn into it.
		``x_name``, ``y_name`` and ``filename`` are only used in the printed
		message and the subplot title.

		Returns the matplotlib axes that were drawn on.
		"""
		corr_matrix = np.corrcoef(x, y)
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print("Pearson product-moment correlation coefficients np.corrcoef(x= %s , y= %s ):  %s"
			% (x_name, y_name, corr_matrix))
		r = np.around(corr_matrix[0, 1], 1)  # Pearson product-moment correlation coefficient.
		# TODO: compute cov matrix for each one-hot encoding variable of the categorical feature with
		# MINE's Mutual Information coefficient

		plt.figure(figsize=(33,5), frameon=True)
		mine = MINE(alpha=0.6, c=15, est="mic_approx")
		mine.compute_score(x, y)
		mic = np.around(mine.mic(), 1)
		ax = plt.subplot(numRows, numCols, plotNum)
		# Pad one unit on both sides; the lower limit used to be min + 1,
		# which clipped the smallest data points out of view.
		ax.set_xlim(min(x) - 1, max(x) + 1)
		ax.set_ylim(min(y) - 1, max(y) + 1)
		ax.set_title('Pearson r=%.1f\nMIC=%.1f Features %s and %s in %s' % (r, mic, x_name, y_name, filename),fontsize=10)
		ax.set_frame_on(False)
		ax.axes.get_xaxis().set_visible(True)
		ax.axes.get_yaxis().set_visible(True)
		ax.plot(x, y, '*')
		plt.xlabel('X')
		plt.ylabel('Y')
		return ax
def mic(dataset: pd.DataFrame, labels: np.array) -> dict:
    """Return ``{column label: MIC(column, labels)}`` for every column.

    A fresh default-configured MINE estimator is created per column, and the
    column values are flattened with ``ravel`` before scoring.
    """
    def _feature_mic(column):
        estimator = MINE()
        estimator.compute_score(column.values.ravel(), labels)
        return estimator.mic()

    return {feature: _feature_mic(column)
            for feature, column in dataset.items()}
def mic(points):
    """Return the MIC between the first two columns of ``points``.

    ``points`` is transposed with ``np.transpose`` so that row 0 and row 1 of
    the result are the two samples handed to MINE.
    """
    coordinates = np.transpose(points)
    estimator = MINE()
    estimator.compute_score(coordinates[0], coordinates[1])
    return estimator.mic()
Ejemplo n.º 14
0
def mutual_infomation_rank(col_names, X, y, topK=10):
    '''
    Rank features by their MIC score against the label.

    :param col_names: feature names, list
    :param X: feature matrix, numpy 2D array
    :param y: label vector, numpy array
    :param topK: number of top-ranked rows to print
    :return: DataFrame with columns ``name`` and ``mutual_information``,
             sorted by score in descending order
    '''

    # MIC is slow to compute, so score a resampled subset of at most
    # 2000 rows.
    original_size = len(y)
    sampling_size = min(original_size, 2000)
    X, y = resample(X, y, random_state=0, n_samples=sampling_size)

    mine = MINE(alpha=0.6, c=15, est="mic_approx")

    scores = []
    for i in range(len(col_names)):
        mine.compute_score(X[:, i], y)
        scores.append(mine.mic())

    result_df = pd.DataFrame({'name': col_names, 'mutual_information': scores})

    result_df = result_df[['name', 'mutual_information'
                           ]].sort_values('mutual_information',
                                          ascending=False)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("size={m} sampling={s} features={n} top{k} rank for MINE testing:"
          .format(m=original_size, s=sampling_size, n=len(col_names), k=topK))
    print(result_df.head(topK))
    return result_df
Ejemplo n.º 15
0
def calculateCorrelationBetweenVectors(x,y):
    """Return the MIC between the sorted, standardised values of ``x`` and ``y``.

    Both inputs are sorted independently, truncated to the length of the
    shorter one and standardised with ``preprocessing.scale`` before scoring,
    so the sorted value sequences are compared rather than the original
    element pairing.

    Args:
        x, y: 1-D sequences of numbers; their lengths may differ.

    Returns:
        The MIC as a Python float.
    """
    commonSize = min(len(x), len(y))

    # Keep exactly commonSize values: slice ends are exclusive, so the
    # former ``[:commonSize - 1]`` dropped one usable sample.
    x_sorted = np.sort(x)[:commonSize]
    y_sorted = np.sort(y)[:commonSize]

    x_scaled = preprocessing.scale(x_sorted)
    y_scaled = preprocessing.scale(y_sorted)

    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(x_scaled, y_scaled)
    return float(mine.mic())
Ejemplo n.º 16
0
def McOne(data, label, r):
    """Filter the rows of ``data`` by relevance to ``label`` and mutual redundancy.

    Each row of ``data`` is scored against ``label`` with MIC; rows scoring
    below ``r`` are discarded. The survivors are ordered by descending score,
    and a row is then removed when its MIC with a better-ranked survivor is
    at least as large as its own MIC with the label.

    Returns ``data.iloc[...]`` restricted to the remaining rows, in ranked
    order.
    """
    print("McOne start...")
    rows = data.values
    row_count = data.shape[0]

    # Phase 1: relevance of every row to the label.
    relevance = [0] * row_count
    candidates = []
    for row in range(row_count):
        scorer = MINE()
        scorer.compute_score(rows[row], label)
        relevance[row] = scorer.mic()
        if relevance[row] >= r:
            candidates.append(row)

    candidates.sort(key=lambda row: relevance[row], reverse=True)

    # Phase 2: drop candidates that are redundant w.r.t. a better one.
    anchor = 0
    while anchor < len(candidates):
        probe = anchor + 1
        while probe < len(candidates):
            scorer = MINE()
            scorer.compute_score(rows[candidates[anchor]],
                                 rows[candidates[probe]])
            if scorer.mic() >= relevance[candidates[probe]]:
                # Deleting shifts the next candidate into ``probe``.
                del candidates[probe]
            else:
                probe += 1
        anchor += 1
    return data.iloc[candidates]
Ejemplo n.º 17
0
def calMIC(data):
    """Print the MIC between columns 1 and 2 of ``data`` for ``veh`` values 2..6.

    For each value the rows with ``data.veh`` equal to it are selected and
    the second and third columns (by position) are scored against each other.
    """
    for vehicle_id in range(2, 7):
        estimator = MINE(alpha=0.6, c=15)
        vehicle_rows = data[data.veh == vehicle_id]
        estimator.compute_score(vehicle_rows.iloc[:, 1],
                                vehicle_rows.iloc[:, 2])
        print("Without noise:", "MIC", estimator.mic())
Ejemplo n.º 18
0
def calculateMIC(dataFileArray, data_mark=None, neadNorm=False):
    """Print the features whose MIC with the final score is above average.

    For every file in ``dataFileArray`` (looked up under ``data_mark``, which
    falls back to ``DATA_MARK``), each feature column (index 1 onwards) is
    scored against the final scores of the listed students. Scores are keyed
    by header name, so a header seen in several files keeps its last value.
    Headers whose score exceeds the mean of all scores are printed in
    descending order.

    ``neadNorm`` applies ``normizeDataSet`` to both the scores and each
    feature column before scoring.
    """
    mic_by_header = {}
    for file_name in dataFileArray:
        if data_mark is None:
            data_mark = DATA_MARK
        records, headers = load_data_from_file(
            os.path.join(data_mark, file_name))

        final_scores = get_final_score_map()
        # The first field of every record is the key into the score map.
        targets = [final_scores[record[0]] for record in records]
        if neadNorm:
            targets = normizeDataSet(targets)

        # Compute the MIC of every feature column against the final score.
        estimator = MINE()
        for column in range(1, len(headers)):
            values = getOneColumn(records, column)
            if neadNorm:
                values = normizeDataSet(values)
            estimator.compute_score(values, targets)
            mic_by_header[headers[column]] = estimator.mic()

    ranked = sorted(mic_by_header.items(),
                    key=lambda item: item[1], reverse=True)
    cutoff = np.mean(list(mic_by_header.values()))
    for header, value in ranked:
        if value > cutoff:
            print(header, value)
Ejemplo n.º 19
0
def micfliter(data,rate):
    """
        MIC feature selection function.

        A column is marked for dropping when its MIC with any column that
        appears before it in ``data.columns`` is greater than ``rate``.

        Arguments:
          data:  pandas DataFrame of features
          rate:  float threshold on the MIC
        Return:
          List of column labels to drop
    """
    m = MINE()
    names = list(data.columns)
    arrays = [np.array(data[name]) for name in names]

    # Only pairs (earlier, later) matter -- the strict upper triangle of the
    # pairwise matrix -- so the full matrix and the ``np.bool`` mask
    # (removed from NumPy in 1.24) are not needed.
    to_drop = []
    for later, name in enumerate(names):
        for earlier in range(later):
            m.compute_score(arrays[later], arrays[earlier])
            if m.mic() > rate:
                to_drop.append(name)
                break  # one strong partner is enough to drop the column

    return to_drop
Ejemplo n.º 20
0
def MicEvaluate(dataX, dataY, name, pre_indices):
    '''
    Compute the MIC between every condition attribute (column of dataX) and
    the decision attribute (dataY), and rank the attributes by it.

    :param dataX: sequence of rows; ``dataX[0]`` fixes the number of features
    :param dataY: array reshaped to a flat vector before scoring
    :param name: array of feature names, indexed with a list of indices
    :param pre_indices: maps local column positions to original indices
    :return: (original indices in ascending-MIC order,
              array of absolute MIC values in column order,
              dict of name -> MIC in ascending-MIC order)
    '''
    labels = dataY.reshape(1, -1)[0]
    feature_count = len(dataX[0])
    print("输入特征数为:", feature_count)

    estimator = MINE(alpha=0.6, c=15)
    scores = []
    for col in range(feature_count):
        estimator.compute_score([row[col] for row in dataX], labels)
        scores.append(abs(estimator.mic()))
    print("上一层留下的每个特征的最大互信息系数", scores)

    order = np.argsort(scores)
    # Translate local positions back to the original feature indices.
    original_order = [pre_indices[position] for position in order]
    scores = np.array(scores)
    print("MIC相关系数:")

    ranked = dict(zip(name[original_order], scores[order]))
    print("特征:", ranked)
    return original_order, scores, ranked
Ejemplo n.º 21
0
def compute_MIC(x, y, alpha=0.6, c=15, all_metrics=False):
    """Return the MIC of ``x`` and ``y``.

    With ``all_metrics=True`` a tuple ``(mic, estimator)`` is returned so the
    caller can read further statistics from the fitted MINE object.
    """
    from minepy import MINE

    estimator = MINE(alpha, c)
    estimator.compute_score(x, y)
    score = estimator.mic()
    return (score, estimator) if all_metrics else score
def MIC():
    """Print the MIC of every feature column of 'new_dataset.csv' with the target.

    The data is loaded through ``MLdata``; the number of columns is printed
    first, followed by one ``index score`` line per column.
    """
    train_x, train_y = MLdata('new_dataset.csv')
    estimator = MINE()
    feature_count = train_x.shape[1]
    print(feature_count)
    for column in range(feature_count):
        estimator.compute_score(train_x[:, column], train_y)
        print(column, estimator.mic())
Ejemplo n.º 23
0
 def mine():
     """Remove from ``uncorrelated`` every column whose MIC with the labels
     is at least ``correlation_threshold``.

     Reads ``uncorrelated``, ``epigenomes``, ``labels`` and
     ``correlation_threshold`` from the enclosing scope and modifies
     ``uncorrelated`` in place.
     """
     # Iterate over a snapshot: removing items from the collection that is
     # being iterated makes the loop skip the element after each removal.
     for column in tqdm(list(uncorrelated), desc="Running MINE test",
                        dynamic_ncols=True, leave=False):
         estimator = MINE()
         estimator.compute_score(epigenomes[column].values.ravel(),
                                 labels.values.ravel())
         score = estimator.mic()
         if score >= correlation_threshold:
             uncorrelated.remove(column)
Ejemplo n.º 24
0
    def MIC(self, x, y):
        """Return the MIC of ``x`` and ``y`` using MINE(alpha=0.6, c=15, est="mic_approx")."""
        estimator = MINE(alpha=0.6, c=15, est="mic_approx")
        estimator.compute_score(x, y)
        score = estimator.mic()
        return score

    
    
        
Ejemplo n.º 25
0
def feature_scoring(X, Y):
    """Score 36 features with several methods and print a comparison table.

    Each method's scores are turned into a per-feature dict by
    ``rank_to_dict`` under the fixed names ``x1`` .. ``x36``. The mean over
    all methods is added as a final "Mean" column, and the table is printed
    tab-separated with one row per feature. Nothing is returned.

    ``X`` is a DataFrame (``.values`` is used); ``Y`` is the target.
    """
    feature_names = ["x%s" % number for number in range(1, 37)]
    ranks = {}

    X = X.values[:, :]

    linear = LinearRegression(normalize=True)
    linear.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(linear.coef_), feature_names)

    ridge_model = Ridge(alpha=7)
    ridge_model.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge_model.coef_), feature_names)

    lasso_model = Lasso(alpha=.05)
    lasso_model.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso_model.coef_), feature_names)

    stability = RandomizedLasso(alpha=0.04)
    stability.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(stability.scores_),
                                      feature_names)

    # Stop the search when 5 features are left (they will get equal scores).
    eliminator = RFE(linear, n_features_to_select=5)
    eliminator.fit(X, Y)
    ranks["RFE"] = rank_to_dict(map(float, eliminator.ranking_),
                                feature_names, order=-1)

    forest = RandomForestRegressor()
    forest.fit(X, Y)
    ranks["RF"] = rank_to_dict(forest.feature_importances_, feature_names)

    f_stat, _ = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f_stat, feature_names)

    print('startMIC')
    estimator = MINE()
    mic_scores = []
    for column in range(X.shape[1]):
        estimator.compute_score(X[:, column], Y)
        mic_scores.append(estimator.mic())
        print(column)
    ranks["MIC"] = rank_to_dict(mic_scores, feature_names)

    print('finish MIc')

    # Average over the methods collected so far ("Mean" itself is added
    # only afterwards, so it is not part of its own average).
    mean_rank = {}
    for name in feature_names:
        per_method = [ranks[method][name] for method in ranks]
        mean_rank[name] = round(np.mean(per_method), 2)
    methods = sorted(ranks)
    ranks["Mean"] = mean_rank
    methods.append("Mean")

    print("\t%s" % "\t".join(methods))
    for name in feature_names:
        cells = [str(ranks[method][name]) for method in methods]
        print("%s\t%s" % (name, "\t".join(cells)))
Ejemplo n.º 26
0
def mic(x, y):
    """
    Score ``x`` against ``y`` with a default-configured MINE estimator.

    :param x: first sample
    :param y: second sample
    :return: tuple ``(mic, 0.5)``; the second element is the constant 0.5
    """
    estimator = MINE()
    estimator.compute_score(x, y)
    score = estimator.mic()
    return (score, 0.5)
Ejemplo n.º 27
0
def mic_hq(X, y, cut=0.2):
    """Flag the columns of ``X`` whose MIC with ``y`` is below ``cut``.

    Returns a numpy array with one entry per column: True where the column's
    MIC is strictly less than ``cut``.
    """
    from minepy import MINE

    estimator = MINE()
    below_cut = []
    for column in range(X.shape[1]):
        estimator.compute_score(X[:, column], y)
        below_cut.append(estimator.mic() < cut)
    return np.array(below_cut)
Ejemplo n.º 28
0
 def _entropy_select(self):
     """
     Mutual-information-based selection.

     Scores every column of ``self.x`` against ``self.y`` with MIC and hands
     the resulting score array to ``self._get_top_k_ids``.
     """
     m = MINE()
     feature_columns = self.x.T
     # One score per feature column. The array used to be sized by
     # ``self.sample_num``, which is unrelated to the number of columns
     # iterated below and could over- or under-allocate it.
     mic_array = np.zeros(len(feature_columns))
     for i, x in enumerate(feature_columns):
         m.compute_score(x, self.y)
         mic_array[i] = m.mic()
     self._get_top_k_ids(mic_array)
Ejemplo n.º 29
0
def MIC(X, y):
    """Return a list with the MIC of each column of ``X`` against ``y``.

    A fresh default-configured MINE estimator is created for every column.
    """
    def _column_mic(column):
        estimator = MINE()
        estimator.compute_score(X[:, column], y)
        return estimator.mic()

    return [_column_mic(column) for column in range(X.shape[1])]
Ejemplo n.º 30
0
def toolkit_mic(arr0, arr1, alpha=0.6, c=15):
    """Return the MIC between ``arr0`` and ``arr1``.

    Args:
        arr0, arr1: array-likes converted with ``np.array`` before scoring.
        alpha, c: MINE parameters, forwarded to the estimator.

    Returns:
        The value of ``MINE.mic()`` for the two samples.
    """
    np_temp0 = np.array(arr0)
    np_temp1 = np.array(arr1)

    # Forward the caller's alpha/c; they used to be ignored in favour of the
    # hard-coded defaults.
    mine = MINE(alpha=alpha, c=c, est="mic_approx")
    mine.compute_score(np_temp0, np_temp1)

    return mine.mic()
Ejemplo n.º 31
0
def main():
    """Estimate MIC and TIC for every pair of data columns in ./run*/COL* files.

    Lines containing '#' are skipped. Rows whose first field is below
    10000.0 are ignored, and reading of a file stops at the first row whose
    first field is above 50000.0. For every pair of columns (the first field
    excluded) the statistics are averaged over 10 random subsamples of up to
    10000 rows. The root MPI rank prints each result and appends it to
    ./minedata.csv.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    root = 0

    collist = []
    for colname in glob.glob("./run*/COL*"):
        # Context manager so every input file is closed again.
        with open(colname) as colfile:
            for line in colfile:
                if "#" in line:
                    continue
                fields = line.split()
                if not fields:
                    continue  # blank line: nothing to parse
                t = float(fields[0])
                if t < 10000.0:
                    continue
                elif 50000.0 < t:
                    break
                collist.append(fields)

    if not collist:
        return  # no usable rows; nothing to analyse

    # random.sample raises if asked for more items than are available.
    sample_size = min(10000, len(collist))
    n_fields = len(collist[0])

    for i in range(n_fields - 1):
        for j in range(i + 1, n_fields - 1):
            miclist = []
            ticlist = []
            for _ in range(10):
                colpart = random.sample(collist, sample_size)
                # Field 0 was used for the range filter above; data columns
                # start at 1.
                x = np.array([a[i+1] for a in colpart], dtype=float)
                y = np.array([a[j+1] for a in colpart], dtype=float)
                mine = MINE(est="mic_e")
                mine.compute_score(x,y)
                miclist.append(mine.mic())
                ticlist.append(mine.tic())

            if rank == root:
                mean_mic = np.mean(miclist)
                mean_tic = np.mean(ticlist)
                print("%s,%s, %s, %s"%(i,j,mean_mic,mean_tic), flush = True)
                with open("./minedata.csv", "a") as wf:
                    wf.write("%s, %s, %s, %s\n"%(i,j,mean_mic,mean_tic))
Ejemplo n.º 32
0
def MIC(features, labels):
    """Return a list with the MIC of each column of ``features`` against ``labels``.

    ``labels`` is flattened first; a single default-configured MINE estimator
    is reused for all columns.
    """
    estimator = MINE()
    flat_labels = labels.flatten()

    def _column_mic(column):
        estimator.compute_score(features[:, column], flat_labels)
        return estimator.mic()

    return [_column_mic(column) for column in range(features.shape[1])]
Ejemplo n.º 33
0
    def execute(self, symbol):
        """
        Find pairs of attribute-value series of ``symbol`` whose MIC (rounded
        to 2 decimals) is greater than ``self.minMic``.

        :param symbol: the symbol in which we are looking for correlations
        :type symbol: :class:`netzob.Common.Models.Vocabulary.AbstractField.AbstractField`
        :return: list of dicts, one per relation found, with the keys
                 ``id``, ``relation_type``, ``x_fields``, ``x_attribute``,
                 ``y_fields``, ``y_attribute``, ``mic`` and ``pearson``
        """

        (attributeValues_headers, attributeValues) = self._generateAttributeValuesForSymbol(symbol)
        symbolResults = []

        # MINE computation of each field's combination
        for i, values_x in enumerate(attributeValues[:-1]):
            # start=i + 1 makes j an absolute index, so that
            # attributeValues_headers[j] is the header of values_y (the
            # slice-relative index used to select the wrong header).
            for j, values_y in enumerate(attributeValues[i + 1:], start=i + 1):
                mine = MINE(alpha=0.6, c=15)
                mine.compute_score(numpy.array(values_x), numpy.array(values_y))
                mic = round(mine.mic(), 2)
                if mic > float(self.minMic):
                    # We add the relation to the results
                    (x_fields, x_attribute) = attributeValues_headers[i]
                    (y_fields, y_attribute) = attributeValues_headers[j]
                    # The relation should not apply on the same field
                    if len(x_fields) == 1 and len(y_fields) == 1 and x_fields[0].id == y_fields[0].id:
                        continue
                    pearson = numpy.corrcoef(values_x, values_y)[0, 1]
                    if not numpy.isnan(pearson):
                        pearson = round(pearson, 2)
                    relation_type = self._findRelationType(x_attribute, y_attribute)
                    self._debug_mine_stats(mine)
                    self._logger.debug(
                        "Correlation found between '"
                        + str(x_fields)
                        + ":"
                        + x_attribute
                        + "' and '"
                        + str(y_fields)
                        + ":"
                        + y_attribute
                        + "'"
                    )
                    self._logger.debug("  MIC score: " + str(mic))
                    self._logger.debug("  Pearson score: " + str(pearson))
                    id_relation = str(uuid.uuid4())
                    symbolResults.append(
                        {
                            "id": id_relation,
                            "relation_type": relation_type,
                            "x_fields": x_fields,
                            "x_attribute": x_attribute,
                            "y_fields": y_fields,
                            "y_attribute": y_attribute,
                            "mic": mic,
                            "pearson": pearson,
                        }
                    )
        return symbolResults
Ejemplo n.º 34
0
def mic(X, Y):
    """Return ``(mic, None)`` for ``X`` and ``Y`` after dropping incomplete pairs.

    Pairs are cleaned with ``remove_pairs_with_a_missing`` first. The process
    exits with an error message if minepy cannot be imported.
    """
    cleaned_x, cleaned_y = remove_pairs_with_a_missing(X, Y)
    try:
        from minepy import MINE
    except ImportError:
        sys.exit("CRITICAL ERROR:2 Unable to import minepy package."
                 " Please check your install.")
    estimator = MINE(alpha=0.6, c=15)
    estimator.compute_score(cleaned_x, cleaned_y)
    return estimator.mic(), None
Ejemplo n.º 35
0
def mine_features(data,features):
    """Print the MIC of every pair of ``features`` whose MIC exceeds 0.10.

    Args:
        data: mapping from feature key to an object with ``.values``
            (e.g. a pandas DataFrame indexed by column label).
        features: iterable of feature keys; it is not modified.

    Returns:
        None. Results are only printed.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('...')
    # Work on a private copy and pair each feature with the ones after it.
    # Removing items from ``features`` while looping over it skipped every
    # other feature and emptied the caller's list.
    candidates = list(features)
    for position, X_hat_idx in enumerate(candidates):
        X_hat = data[X_hat_idx].values
        for xi_idx in candidates[position + 1:]:
            m = MINE()
            xi = data[xi_idx].values
            m.compute_score(X_hat,xi)
            I_X_hat_xi = m.mic()
            if I_X_hat_xi>0.10:
                print('I({X_hat_idx},{xi_idx}): {I_X_hat_xi}'.format(X_hat_idx=X_hat_idx,xi_idx=xi_idx,I_X_hat_xi=I_X_hat_xi))
Ejemplo n.º 36
0
def calcMICReg(df, target, col):
    """Return ``{col: MIC(df[col], target)}``.

    A column of dtype ``category`` is first encoded by replacing each value
    with the mean of the ``'_target_variable_'`` column within its group;
    any other column is scored as is.
    """
    estimator = MINE()
    values = df[col].values
    if df[col].dtype.name == "category":
        group_means = df.groupby(by=[col])['_target_variable_'].mean().to_dict()
        values = [group_means[level] for level in values]
    estimator.compute_score(values, target)

    return {col: estimator.mic()}
Ejemplo n.º 37
0
def mysubplot(x, y, numRows, numCols, plotNum,
              xlim=(-4, 4), ylim=(-4, 4)):
    """Draw a bare scatter subplot of ``x`` vs. ``y`` titled with Pearson r and MIC.

    Both statistics are rounded to one decimal. The subplot has no frame,
    no visible axes and no ticks. Returns the axes.
    """
    pearson = np.around(np.corrcoef(x, y)[0, 1], 1)

    estimator = MINE(alpha=0.6, c=15)
    estimator.compute_score(x, y)
    mic_value = np.around(estimator.mic(), 1)

    axes = plt.subplot(numRows, numCols, plotNum, xlim=xlim, ylim=ylim)
    axes.set_title('Pearson r=%.1f\nMIC=%.1f' % (pearson, mic_value),
                   fontsize=10)
    axes.set_frame_on(False)
    for axis in (axes.axes.get_xaxis(), axes.axes.get_yaxis()):
        axis.set_visible(False)
    axes.plot(x, y, ',')
    axes.set_xticks([])
    axes.set_yticks([])
    return axes
def perform_mic_1p(p_sequences, p, cutoff=0.5, out_folder=''):
    """Score every pair of positions within one set of sequences.

    The sequences are transposed so that each record holds one position
    across all sequences. Pairs whose MIC exceeds ``cutoff`` are collected as
    dicts (``x``, ``y``, ``p1``, ``p2``, ``weight``), with 1-based position
    labels prefixed by ``p`` and the weight formatted to 3 decimals. The list
    is passed to ``write_mics_to_csv`` and returned.
    """
    positions = transpose(array([list(z) for z in p_sequences])).tolist()
    mic_scores = []
    for first, first_record in enumerate(positions[:-1]):
        for second in range(first + 1, len(positions)):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(first_record, positions[second])
            score = estimator.mic()
            if score > float(cutoff):
                mic_scores.append({
                    'x': p+'_'+str(first+1),
                    'y': p+'_'+str(second+1),
                    'p1': p,
                    'p2': p,
                    'weight': format(score, '.3f'),
                })
    write_mics_to_csv(mics=mic_scores, p1=p, p2=p, cutoff=cutoff, out_folder=out_folder)
    return mic_scores
Ejemplo n.º 39
0
  def select_feature(self, data, label, threshold=0.7):
    """
    Perform feature selection by maximum information coefficient that can capture both linear and non-linear relationships.

    ``data`` is iterated for its column labels and indexed as ``data[col]``;
    a column is kept when its MIC with ``label`` is greater than
    ``threshold``. Progress is printed for every column.

    Returns the list of selected column labels, in iteration order.
    """
    selected = []

    from minepy import MINE
    mine = MINE()

    for i, col in enumerate(data):
      # Single-argument print(...) behaves the same on Python 2 and 3.
      print('feature selection: %d/%d %s' % (i, data.shape[1], col))
      mine.compute_score(data[col], label)
      if mine.mic() > threshold:
        selected.append(col)

    print('%d out of %d features were selected' % (len(selected), data.shape[1]))

    return selected
Ejemplo n.º 40
0
def get_corrcoef(X):
    """Compute MIC between feature 0 and features 1..19 on a 5% row subsample.

    The rows are taken from the test part of a single ``ShuffleSplit``. The
    scores are written symmetrically into a matrix initialised with ones,
    which is saved to ``feat_sim_pcc_2.csv`` under ``CODE_PATH``. Progress
    and timing are printed; nothing is returned.
    """
    splitter = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.05, random_state=0)
    for _, held_out in splitter:
        X = X[np.array(held_out)]
        break

    # After transposing, each row is one feature.
    X = X.transpose()
    feature_count = X.shape[0]
    pcc = np.ones((feature_count, feature_count))
    estimator = MINE()
    started = time()
    reference = 0
    for other in range(1, 20):
        estimator.compute_score(X[reference], X[other])
        score = estimator.mic()
        pcc[other, reference] = score
        pcc[reference, other] = score
        print(reference, other, pcc[reference, other], time()-started)
    output_file = os.path.join(CODE_PATH, 'feat_sim_pcc_2.csv')
    np.savetxt(output_file, pcc, fmt='%.3f', delimiter=',')
    print('Done with computing PCC,', 'using', time()-started, 's')
Ejemplo n.º 41
0
def perform_mic_2p(p1_sequences, p2_sequences, p1, p2, cutoff=0.5):
    """Score every position of ``p1_sequences`` against every position of ``p2_sequences``.

    Both sequence sets are transposed so that each record holds one position
    across all sequences. Pairs whose MIC exceeds ``cutoff`` are returned as
    dicts (``x``, ``y``, ``p1``, ``p2``, ``weight``), with 1-based position
    labels and the raw MIC as weight.
    """
    def _by_position(sequences):
        return transpose(array([list(z) for z in sequences])).tolist()

    first_positions = _by_position(p1_sequences)
    second_positions = _by_position(p2_sequences)

    mic_scores = []
    for idx1, record1 in enumerate(first_positions):
        for idx2, record2 in enumerate(second_positions):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(record1, record2)
            score = estimator.mic()
            if score > float(cutoff):
                mic_scores.append({
                    'x': p1+'_'+str(idx1+1),
                    'y': p2+'_'+str(idx2+1),
                    'p1': p1,
                    'p2': p2,
                    'weight': score,
                })

    return mic_scores
def perform_mic_2p(p1_sequences, p2_sequences, p1, p2, cutoff=0.5):
    """Score every position of ``p1_sequences`` against every position of ``p2_sequences``.

    Both sequence sets are transposed so that each record holds one position
    across all sequences. Pairs whose MIC exceeds ``cutoff`` are collected as
    dicts (``x``, ``y``, ``p1``, ``p2``, ``weight``), with 1-based position
    labels and the weight formatted to 3 decimals. The list is passed to
    ``write_mics_to_csv`` and returned.
    """
    def _by_position(sequences):
        return transpose(array([list(z) for z in sequences])).tolist()

    mic_scores = []
    first_positions = _by_position(p1_sequences)
    second_positions = _by_position(p2_sequences)

    for idx1, record1 in enumerate(first_positions):
        for idx2, record2 in enumerate(second_positions):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(record1, record2)
            score = estimator.mic()
            if score > float(cutoff):
                mic_scores.append({
                    'x': p1+'_'+str(idx1+1),
                    'y': p2+'_'+str(idx2+1),
                    'p1': p1,
                    'p2': p2,
                    'weight': format(score, '.3f'),
                })

    write_mics_to_csv(mics=mic_scores, p1=p1, p2=p2, cutoff=cutoff)
    return mic_scores
Ejemplo n.º 43
0
    def mutual_information(self, X, Y, title=None, nbins_X=50, nbins_Y=50,
            noise_sigma='all'):
        """Print MIC, mutual information and linear-fit statistics for X vs. Y.

        Positions where either X or Y is NaN are dropped before any statistic
        is computed, so all printed numbers describe the same set of samples.

        Args:
            X, Y: numpy arrays; they are masked with a shared boolean index,
                so they are expected to have the same shape.
            title: optional heading printed before the statistics line.
            nbins_X, nbins_Y: bin counts handed to ``pyentropy.quantise``.
            noise_sigma: label interpolated into the printed line.

        Returns:
            None. The results are only printed.
        """
        no_nans_idx = np.logical_not(np.logical_or(np.isnan(X), np.isnan(Y)))
        # Boolean-mask indexing yields flat 1-D arrays of the valid samples.
        X_valid = X[no_nans_idx]
        Y_valid = Y[no_nans_idx]

        Xq, _, _ = pyentropy.quantise(X_valid, nbins_X)
        Yq, _, _ = pyentropy.quantise(Y_valid, nbins_Y)
        s = pyentropy.DiscreteSystem(Yq, (1, nbins_Y), Xq, (1, nbins_X))
        s.calculate_entropies()

        # MINE: score the NaN-free samples, consistent with the other
        # statistics (the unfiltered arrays used to be passed here).
        mine = MINE()
        mine.compute_score(X_valid, Y_valid)

        # Linear regression
        slope, intercept, r, p, stderr = \
                scipy.stats.linregress(X_valid, Y_valid)

        if title is not None:
            print(title)
        print(" MIC/MI/r^2/p/slope for %s:\t%.3f\t%.3f\t%s\t%s\t%s" %
                (noise_sigma, mine.mic(), s.I(), r**2, p, slope))
Ejemplo n.º 44
0
    def fit(self,X,y):
        """Greedily select features by MIC relevance minus weighted redundancy.

        The first feature is the one with the highest MIC with ``y``. Every
        further round adds the remaining feature ``xi`` that maximises
        ``MIC(xi, y) - beta * sum(MIC(xi, xj) for xj already selected)``.

        Reads ``self.n`` (number of features to select; half of the features
        when None), ``self.beta`` and ``self.verbose``.
        Sets ``self.phi`` (selected column indexes, in selection order) and
        ``self.features`` (remaining column indexes).

        Args:
            X: 2-D array indexed as ``X[:, i]``.
            y: target sample.
        """
        n = self.n
        beta = self.beta
        verbose = self.verbose
        n_features = X.shape[1]
        if n is None:
            # Half of the *features*; the sample count X.shape[0] used to be
            # taken here by mistake.
            n = int(n_features / 2)
        # Never ask for more features than exist, or the loop below would
        # run out of candidates.
        n = min(n, n_features)

        features = np.arange(n_features).tolist()

        # I(xi, y) does not change between rounds: compute it once.
        relevance = {}
        best_mi = -np.inf
        X_hat = 0
        for xi in features:
            m = MINE()
            m.compute_score(X[:,xi],y)
            relevance[xi] = m.mic()
            if best_mi < relevance[xi]:
                # Track the running maximum; without this update the last
                # feature always won.
                best_mi = relevance[xi]
                X_hat = xi
        phi = [X_hat]
        features.remove(X_hat)

        while len(phi)<n:
            mi_scores = np.zeros(len(features))
            for xi_idx,xi in enumerate(features):
                sum_mi_xi_xj = 0
                for xj in phi:
                    # I(xi, xj): redundancy with an already selected feature
                    m = MINE()
                    m.compute_score(X[:,xi],X[:,xj])
                    sum_mi_xi_xj += m.mic()
                mi_scores[xi_idx] = relevance[xi] - beta*sum_mi_xi_xj
                if verbose>=2:
                    # Single-argument print(...) works on Python 2 and 3.
                    print("mi_scores for xi:{xi}, xj:{xj} is {mi_scores}".format(xi=xi,xj=xj,mi_scores=mi_scores[xi_idx]))

            X_hat = np.argmax(mi_scores)
            if verbose==1:
                # Note: this is the position within ``features``, printed
                # before it is mapped to the column index below.
                print("X_hat is {X_hat}".format(X_hat=X_hat))
            X_hat = features[X_hat]
            phi.append(X_hat)
            features.remove(X_hat)
        self.phi = phi
        self.features = features
 def get_mic(self):
     """Return the MIC between ``self.x`` and ``self.y`` (default MINE settings)."""
     estimator = MINE()
     estimator.compute_score(self.x, self.y)
     score = estimator.mic()
     return score
# NOTE(review): this fragment starts mid-script; `rfe`, `ranks`, `names`,
# `X`, `Y` and `rank_to_dict` are expected to be defined earlier.
# RFE ranking, passed to rank_to_dict with order=-1.
ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)

# Random forest: score features by the fitted feature_importances_.
rf = RandomForestRegressor()
rf.fit(X,Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

# Univariate regression test: score features by their F statistic.
f, pval  = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)

# MINE: score every column of X by its MIC with Y.
mine = MINE()
mic_scores = []
for i in range(X.shape[1]):
    mine.compute_score(X[:,i], Y)
    m = mine.mic()
    mic_scores.append(m)

ranks["MIC"] = rank_to_dict(mic_scores, names)

# Summary: mean score per feature over all methods collected so far.
r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name] 
                             for method in ranks.keys()]), 2)

# "Mean" is appended after sorting so that it stays the last column.
methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")
Ejemplo n.º 47
0
def train_and_analyse(_X, _y, features):
    """Rank features with several methods and keep those scoring well on average.

    The scores of each method are passed through ``rank_to_dict`` and stored
    under the method's label.  A ``"Mean"`` entry holds, for every feature,
    the mean over all methods, rounded to two decimals.

    Args:
        _X: 2-D input array, indexed as ``_X[:, i]``.
        _y: target values handed to every estimator.
        features: feature names passed to ``rank_to_dict`` with each score
            vector (presumably one name per column of ``_X``, in order).

    Returns:
        ``(ranks, selection_feature)``: a DataFrame with one column per
        method plus ``"Mean"``, and the index values of the rows whose
        ``Mean`` is greater than 0.12.
    """
    X, Y = _X, _y
    cv_l = cross_validation.KFold(X.shape[0], n_folds=10,
                                  shuffle=True, random_state=1)
    ranks = {}

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features)

    ridge = RidgeCV(cv=cv_l)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features)

    # The cross-validated Lasso supplies the alpha used by RandomizedLasso.
    lasso = LassoCV(cv=cv_l, n_jobs=2, normalize=True, tol=0.0001, max_iter=170000)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), features)

    rlasso = RandomizedLasso(alpha=lasso.alpha_, random_state=42)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), features)

    # Eliminate down to a single feature so ranking_ orders all of them.
    rfe = RFE(lr, n_features_to_select=1)
    rfe.fit(X, Y)
    ranks["RFE"] = rank_to_dict(np.array(rfe.ranking_).astype(float), features, order=-1)

    rf = RandomForestRegressor(n_estimators=500)
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, features)

    # Only the first array returned by f_regression is ranked; NaNs become 0.
    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(np.nan_to_num(f), features)

    # One MIC score per column, reusing a single estimator.
    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        mic_scores.append(mine.mic())
    ranks["MIC"] = rank_to_dict(mic_scores, features)

    # Build the mean completely before storing it, so that "Mean" is not
    # itself one of the averaged methods.
    mean_rank = {}
    for name in features:
        mean_rank[name] = round(
            np.mean([ranks[method][name] for method in ranks]), 2)
    ranks["Mean"] = mean_rank

    ranks = pd.DataFrame(ranks)

    selection_feature = ranks[ranks.Mean > 0.12].index.values

    return ranks, selection_feature
Ejemplo n.º 48
0
    f2=f_input2.T

Mdim = len(f1)
Mat = np.zeros((Mdim, Mdim))

# =============================================================================
#
#                    Compute MIC, PCC, KTau, NMIS Algorithm
#                      & Generate Correlation Matrix
#
# =============================================================================
print('Computing mutual information indices and generating correlation matrix...')

# The output path does not change inside the loops, so build it once.
corr_matrix_path = (output_dir + '/' + 'CorrMatrix_' + mla + '_' + str(Mdim)
                    + '_' + str(GPS) + '_' + nfilename + '.txt')

for i in range(Mdim):
    for j in range(Mdim):
        # Fill cell (i, j) with the measure selected by `mla`; an
        # unrecognised value leaves the cell at its initial 0.
        if mla == 'MIC':
            mine.compute_score(f1[i], f2[j])
            Mat[i][j] = mine.mic()
        elif mla == 'PCC':
            Mat[i][j] = pearsonr(f1[i], f2[j])[0]
        elif mla == 'KTau':
            Mat[i][j] = kendalltau(f1[i], f2[j])[0]
        elif mla == 'NMIS':
            Mat[i][j] = normalized_mutual_info_score(f1[i], f2[j])
        sys.stdout.write(".")
        # Append every cell as soon as it is computed so that partial results
        # survive an interruption; the with-block closes the handle each time
        # instead of leaking one open file per cell.
        with open(corr_matrix_path, 'a') as g:
            g.write(str(Mat[i][j]))
            # Cells of a row are space separated; the last one ends the line.
            g.write('\n' if j == Mdim - 1 else ' ')
Ejemplo n.º 49
0
class TestFunctions(unittest.TestCase):
    """Check the MINE statistics on noiseless functional relationships.

    Every test builds 1000 points of a known function, scores them with a
    ``MINE(alpha=0.6, c=15)`` estimator and compares mic, mas, mev, mcn and
    mcn_general against fixed reference values.
    """

    def setUp(self):
        # A fresh estimator for every test.
        self.mine = MINE(alpha=0.6, c=15)

    def build_const(self, n):
        """Return ``n`` points of the constant function y = 0 on [0, 1]."""
        x = np.linspace(0, 1, n)
        y = np.zeros(n)
        return x, y

    def build_linear(self, n):
        """Return ``n`` points of the identity y = x on [0, 1]."""
        x = np.linspace(0, 1, n)
        return x, x

    def build_sine(self, n):
        """Return ``n`` points of y = sin(8*pi*x) on [0, 1]."""
        x = np.linspace(0, 1, n)
        return x, np.sin(8*np.pi*x)

    def build_exp(self, n):
        """Return ``n`` points of y = 2**x on [0, 10]."""
        x = np.linspace(0, 10, n)
        return x, 2**x

    def test_const(self):
        x, y = self.build_const(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 0., 4)
        assert_almost_equal(self.mine.mas(), 0., 4)
        assert_almost_equal(self.mine.mev(), 0., 4)
        assert_almost_equal(self.mine.mcn(), 2., 4)
        assert_almost_equal(self.mine.mcn_general(), 2., 4)

    # The original class defined test_linear twice with identical bodies; the
    # second definition silently replaced the first, so a single copy is kept.
    def test_linear(self):
        x, y = self.build_linear(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 1., 4)
        assert_almost_equal(self.mine.mas(), 0., 4)
        assert_almost_equal(self.mine.mev(), 1., 4)
        assert_almost_equal(self.mine.mcn(), 2., 4)
        assert_almost_equal(self.mine.mcn_general(), 2., 4)

    def test_sine(self):
        x, y = self.build_sine(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 1., 4)
        # mas is compared to 3 decimals here, unlike the other statistics.
        assert_almost_equal(self.mine.mas(), 0.875, 3)
        assert_almost_equal(self.mine.mev(), 1., 4)
        assert_almost_equal(self.mine.mcn(), 4., 4)
        assert_almost_equal(self.mine.mcn_general(), 4., 4)

    def test_exp(self):
        x, y = self.build_exp(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 1., 4)
        assert_almost_equal(self.mine.mas(), 0., 4)
        assert_almost_equal(self.mine.mev(), 1., 4)
        assert_almost_equal(self.mine.mcn(), 2., 4)
        assert_almost_equal(self.mine.mcn_general(), 2., 4)
Ejemplo n.º 50
0
    # For every candidate function f, collect statistics under the null
    # (x independent of y) and under the alternative (y generated from x).
    # `i` is the index of an enclosing loop that is not part of this excerpt.
    for j, f in enumerate(ff):
        print "Noise: %d, function: %d" % (i, j)

        # Per-function result lists, reset for every (i, j) combination.
        mic_approx_null, mic_e_null, tic_e_null, r2_null = [], [], [], []
        mic_approx_alt, mic_e_alt, tic_e_alt, r2_alt = [], [], [], []

        # null hypothesis
        for k in range(1, n_null+1):
            # NOTE(review): f() takes no arguments, so it presumably reads
            # the x and r drawn just above by name -- confirm where ff is built.
            x = np.random.rand(n)
            r = np.random.randn(n)
            y = f()

            # resimulate x for the null scenario
            # (the new x is drawn independently of the y computed above)
            x = np.random.rand(n)

            mine_approx.compute_score(x, y)
            mine_e.compute_score(x, y)

            mic_approx_null.append(mine_approx.mic())
            mic_e_null.append(mine_e.mic())
            tic_e_null.append(mine_e.tic())
            # squared Pearson correlation between x and y
            r2_null.append(np.corrcoef(x, y)[0][1]**2)

        # alternative hypothesis
        for k in range(1, n_alt+1):
            # Same draw as above, but x is kept, so y stays tied to it.
            x = np.random.rand(n)
            r = np.random.randn(n)
            y = f()

            mine_approx.compute_score(x, y)
            mine_e.compute_score(x, y)
Ejemplo n.º 51
0
def train_and_analyse(_X, _y, sno, ino):
    """Rank the calendar features of one store/item pair, then save and plot them.

    Only the calendar/holiday columns listed below are analysed.  Three
    scores are computed per feature and passed through ``rank_to_dict``:

    * ``F_regr`` -- univariate ``f_regression`` on the processed matrix,
    * ``MIC``    -- maximal information coefficient on the processed matrix,
    * ``RF``     -- mean cross-validated r2 of the model returned by
      ``ut.get_regression_model('RandomForest', 0)``, fitted on that single
      raw column.

    The ranking table is printed, pickled to
    ``Analyse/store_<sno>/item_<ino>_(pair_analyse)`` and drawn as a stacked
    horizontal bar chart saved next to it with a ``.png`` suffix.

    Args:
        _X: DataFrame containing at least the columns listed below; it is
            copied, not modified.
        _y: target values.
        sno: store identifier used in the output directory name.
        ino: item identifier used in the output file name.

    Returns:
        ``(ranks, selection_feature)``: the ranking DataFrame and the list of
        feature names with ``RF > 0.1``, ``MIC >= 0.1`` or ``F_regr >= 0.1``
        (duplicates removed, order not defined).
    """
    X = _X.copy()
    Y = _y

    selection_feature = []

    time_feature_1 = ['date2j']
    time_feature_2 = ['day', 'month', 'year']
    time_feature_3 = [
        'is_2012',
        'is_2013',
        'is_2014',
        'fall',
        'winter',
        'spring',
        'summer',
    ]
    time_feature_4 = [
        'weekday',
        'is_weekend',
        'is_holiday',
        'is_holiday_weekday',
        'is_holiday_weekend',
    ]
    time_feature_5 = [
        'MemorialDay',
        'MothersDay',
        'BlackFridayM3',
        'BlackFriday1',
        'NewYearsDay',
        'IndependenceDay',
        'VeteransDay',
        'BlackFriday2',
        'NewYearsEve',
        'BlackFriday3',
        'ChristmasDay',
        'BlackFridayM2',
        'ThanksgivingDay',
        'Halloween',
        'EasterSunday',
        'ChristmasEve',
        'ValentinesDay',
        'PresidentsDay',
        'ColumbusDay',
        'MartinLutherKingDay',
        'LaborDay',
        'FathersDay',
        'BlackFriday',
    ]

    temp = (time_feature_1 + time_feature_2 + time_feature_3
            + time_feature_4 + time_feature_5)
    X_f1 = X[temp].values

    # The processed matrix used to be rebuilt for f_regression and twice per
    # feature inside the MIC loop; build it once.
    # NOTE(review): assumes ut.get_processed_X_A is a pure function of its
    # argument (no in-place changes to X_f1) -- confirm.
    X_processed = ut.get_processed_X_A(X_f1)

    ranks = {}

    # Only the first array returned by f_regression is ranked; NaNs become 0.
    f, _ = f_regression(X_processed, Y, center=True)
    ranks["F_regr"] = pd.Series(rank_to_dict(np.nan_to_num(f), temp))

    # One MIC score per processed column, reusing a single estimator.
    mine = MINE()
    mic_scores = []
    for i in range(X_processed.shape[1]):
        mine.compute_score(X_processed[:, i], Y)
        mic_scores.append(mine.mic())
    ranks["MIC"] = pd.Series(rank_to_dict(mic_scores, temp))

    # Score every raw column on its own; i:i+1 keeps the slice 2-D.
    rf = ut.get_regression_model('RandomForest', 0)
    scores = []
    for i in range(X_f1.shape[1]):
        score = cross_val_score(rf, X_f1[:, i:i+1].astype(float), Y,
                                scoring="r2",
                                cv=ShuffleSplit(len(X_f1), 3, .3), n_jobs=2)
        scores.append(round(np.mean(score), 3))
    ranks['RF'] = pd.Series(rank_to_dict(np.abs(scores), temp))

    ranks = pd.DataFrame(ranks)
    print(ranks)

    # A feature is kept when any of the three rankings passes its cut-off;
    # note that RF uses a strict comparison while the other two do not.
    selection_feature.extend(ranks[ranks.RF > 0.1].index.values.tolist())
    selection_feature.extend(ranks[ranks.MIC >= 0.1].index.values.tolist())
    selection_feature.extend(ranks[ranks.F_regr >= 0.1].index.values.tolist())
    selection_feature = list(set(selection_feature))
    print(selection_feature)

    path = 'Analyse/store_{}/'.format(sno)
    mkdir_p(path)
    path += 'item_{}_(pair_analyse)'.format(ino)
    ranks.to_pickle(path)

    path += '.png'
    p.clf()
    p.cla()
    plt.figure(figsize=(16, 26))
    ranks.plot.barh(stacked=True)
    p.savefig(path, bbox_inches='tight', dpi=300)
    plt.close()

    return ranks, selection_feature
Ejemplo n.º 52
0
def mic(x, y):
    """Return ``(MIC(x, y), 0.5)``.

    NOTE(review): the constant 0.5 is presumably a placeholder p-value for
    callers that expect a ``(score, pvalue)`` pair -- confirm.
    """
    estimator = MINE()
    estimator.compute_score(x, y)
    score = estimator.mic()
    return (score, 0.5)
Ejemplo n.º 53
0
# for i in [7,9]:
#     mine.compute_score(X_Standard_T[i], X_Standard_T[10])
#     mics.append(mine.mic())
#     print i, mine.mic()
# # for i in range(0,38):
# #     mine.compute_score(Xi_Standard_T[i], Xi_Standard_T[38])
# #     mics.append(mine.mic())
# #     print i, mine.mic()
# for i in range(0,7):
#     mine.compute_score(Xi_Standard_T[i], Xi_Standard_T[7])
#     mics.append(mine.mic())
#     print i, mine.mic()
#

# MIC of each of the first 48 rows of X_ALL_Standard_T against row 48.
# NOTE(review): presumably rows are features and row 48 is the target; this
# also assumes `mics` is empty at this point -- confirm.
for i in range(48):
    mine.compute_score(X_ALL_Standard_T[i], X_ALL_Standard_T[48])
    mics.append(mine.mic())

# Copy the column names of allDF into a plain list.
names = []
for c in allDF.columns.values: names.append(c)

# Column name -> MIC score for the first 48 columns.
# NOTE(review): `map` shadows the built-in of the same name for the rest of
# the module.
map = {}
for i in range(48):
    map[names[i]] = mics[i]

import operator
# (name, score) pairs sorted by ascending score.
sorted_tuple = sorted(map.items(), key=operator.itemgetter(1))

vs = []
ks = []
for k,v in sorted_tuple:
Ejemplo n.º 54
0
# Load the `score` column of every prediction file, in a fixed order.
res = []
for csv_path in (
    "./avg_xgbs_discret_feature_5.csv",
    "./R_7199.csv",
    "./rank_feature_xgb_ensemble.csv",
    "./avg_xgbs_discret_feature_10.csv",
    "./based_on_select_rank_feature.csv",
    "./xgb717.csv",
    "./725.csv",
    "./svm6938.csv",
):
    res.append(pd.read_csv(csv_path).score.values)

# cm[i][j] holds the MIC between the scores of file i and file j; every
# ordered pair, including i == j, is scored with its own estimator.
cm = []
for i in range(8):
    tmp = []
    for j in range(8):
        m = MINE()
        m.compute_score(res[i], res[j])
        tmp.append(m.mic())
    cm.append(tmp)


import numpy as np
import matplotlib.pyplot as plt


def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    """Show ``cm`` as an image with a title and a colour bar via pyplot.

    Both axes get 8 ticks labelled with the module-level sequence ``fs``;
    the x labels are rotated by 45 degrees.
    """
    plt.imshow(cm, cmap=cmap, interpolation="nearest")
    plt.title(title)
    plt.colorbar()
    positions = np.arange(8)
    plt.xticks(positions, fs, rotation=45)
    plt.yticks(positions, fs)
Ejemplo n.º 55
0
def mic(x, y):
    """Print both inputs, then return ``(MIC(x, y), 0.5)``.

    NOTE(review): the constant 0.5 is presumably a placeholder p-value for
    callers that expect a ``(score, pvalue)`` pair -- confirm.
    """
    m = MINE()
    # Single-argument print calls produce the same output on Python 2 and 3.
    print(x)
    print(y)
    m.compute_score(x, y)
    return (m.mic(), 0.5)
Ejemplo n.º 56
0
# For every noise level i and candidate function f, collect MIC, GMIC and
# squared Pearson correlation under the null (x independent of y) and under
# the alternative (y generated from x).
for i in range(1, n_noise+1):
    for j, f in enumerate(ff):
        # Result lists are reset for every (i, j) combination.
        mic_null, gmic_null, r2_null = [], [], []
        mic_alt, gmic_alt, r2_alt = [], [], []

        # null hypothesis
        for k in range(1, n_null+1):
            print i, j, k
            # NOTE(review): f() takes no arguments, so it presumably reads
            # the x and r drawn just below by name -- confirm where ff is built.
            x = np.random.rand(n)
            r = np.random.randn(n)
            y = f()
            
            # resimulate x for the null scenario
            # (the new x is drawn independently of the y computed above)
            x = np.random.rand(n)

            mine.compute_score(x, y)
            mic_null.append(mine.mic())
            gmic_null.append(mine.gmic(p=-1))
            # squared Pearson correlation between x and y
            r2_null.append(np.corrcoef(x, y)[0][1]**2)

        # alternative hypothesis
        for k in range(1, n_alt+1):
            # Same draw as above, but x is kept, so y stays tied to it.
            x = np.random.rand(n)
            r = np.random.randn(n)
            y = f()

            mine.compute_score(x, y)
            mic_alt.append(mine.mic())
            gmic_alt.append(mine.gmic(p=-1))
            r2_alt.append(np.corrcoef(x, y)[0][1]**2)
# Pearson correlation of x with a noisy copy of itself, for a noise standard
# deviation of 1 and of 10 (x and size are defined earlier in the script).
print "Lower noise", pearsonr(x, x + np.random.normal(0, 1, size))
print "Higher noise", pearsonr(x, x + np.random.normal(0, 10, size))

# Obvious drawback: as a feature-ranking mechanism Pearson correlation is only sensitive to linear relationships. Even when two variables have a one-to-one correspondence, it can be close to 0.
a = np.random.uniform(-1, 1, 100000)   # uniform(low, high, size) random numbers
print pearsonr(a, a**2)[0]


# 1.2 Mutual information and maximal information coefficient (MIC), range [0,1]
# Mutual information is inconvenient to use directly for feature selection. MIC first searches for an optimal discretisation,
# then turns the mutual information value into a measure whose values lie in [0,1]. minepy provides MIC.

from minepy import MINE  # third-party MIC implementation
m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x**2)
print m.mic()


# 1.3 Distance correlation, range [0,1]
# Distance correlation was designed to overcome the weakness of the Pearson correlation. In the x and x^2 example, even if the Pearson correlation is 0,
# we cannot conclude that the two variables are independent (they may be non-linearly related); but if the distance correlation is 0, we can say they are independent.
import numpy as np

def dist(x, y):
    """Return the matrix of absolute differences ``|x[i] - y[j]|``.

    Intended for 1-D arrays only: ``x`` is turned into a column so that the
    subtraction broadcasts against ``y``.
    """
    column = x[:, np.newaxis]
    return np.absolute(column - y)
    

def d_n(x):
    # Pairwise absolute differences of x with itself.
    # NOTE(review): nothing is returned as written; the rest of this function
    # appears to be cut off in this excerpt -- confirm against the original.
    d = dist(x, x)
Ejemplo n.º 58
0
def doMICAnalysisOfInputVariables(inArr, targetArr, targetName, mic_score_threshold, input_indexes_uncorrelated_features, targetQualityMap=None):
    """Keep the input columns whose MIC against the target reaches a threshold.

    Only feature indexes contained in ``input_indexes_uncorrelated_features``
    are considered.  Each such column is standardised with
    ``preprocessing.scale`` and scored against ``targetArr`` with
    ``MINE(alpha=0.6, c=15)``; the unscaled column is what gets kept.

    Args:
        inArr: 2-D array, one feature per column.
        targetArr: target values scored against every column.
        targetName: name of the target, passed to ``print_stats`` and used as
            the key set to ``True`` in the global ``goodTargetMap`` whenever
            a feature is selected.
        mic_score_threshold: inclusive lower bound on MIC for selection.
        input_indexes_uncorrelated_features: container of eligible feature
            indexes.
        targetQualityMap: optional list; the MIC of every evaluated feature
            is appended to it.

    Returns:
        ``(selected_inArr, selected_inArr_indexes,
        selected_originalColumn_indexes)``: the selected columns as a
        transposed array, their positions within that array, and their
        indexes as given by ``getColumnIndexFromFeatureIndex``.
    """
    goodTargetMap = getGlobalObject("goodTargetMap")

    selected_inArr = []
    selected_inArr_indexes = []
    selected_originalColumn_indexes = []

    try:
        numOfFeatures = inArr.shape[1]
    except (AttributeError, IndexError, TypeError):
        # inArr is not a 2-D array.  The formatted string reproduces the
        # output of the former Python 2 print statement exactly.
        print("ERROR: \n%s" % (inArr,))
        # NOTE(review): exits with status 0 even though this is an error
        # path; kept as is in case callers rely on it -- confirm.
        exit(0)

    k = 0  # position of the next selected column within selected_inArr
    for featureIndex in range(numOfFeatures):
        # we will choose only uncorrelated features as input
        if featureIndex not in input_indexes_uncorrelated_features:
            continue

        x = inArr[:, featureIndex]
        x_scaled = preprocessing.scale(x)
        mine = MINE(alpha=0.6, c=15)
        mine.compute_score(x_scaled, targetArr)
        inputFeatureName = getInputParameterNameFromFeatureIndex(featureIndex)
        print_stats(mine, inputFeatureName, targetName, mic_score_threshold)

        # Evaluate the statistic once and reuse it below.
        mic_value = float(mine.mic())
        if targetQualityMap is not None:
            targetQualityMap.append(mic_value)

        if mic_value >= mic_score_threshold:
            selected_inArr.append(x)  # keep the input data column
            selected_inArr_indexes.append(k)  # index of that column in the selection
            colIdx = getColumnIndexFromFeatureIndex(featureIndex)
            selected_originalColumn_indexes.append(colIdx)  # original column index
            # now add the target itself to goodTargetMap. For anomaly
            # detection we will only use these targets
            goodTargetMap[targetName] = True
            # Two spaces after the colon match the Python 2 print output.
            print("----------------- selected:  %s %s %s" % (inputFeatureName, colIdx, k))
            k = k + 1

    selected_inArr = np.array(selected_inArr).transpose()
    return selected_inArr, selected_inArr_indexes, selected_originalColumn_indexes
Ejemplo n.º 59
0
 def interactionV(self, data):
     """Print the MIC between ``data`` and ``x**2``.

     NOTE(review): ``x`` is neither a parameter nor a local, so it has to
     resolve to a module-level name at call time -- confirm that ``data``
     was not intended instead.
     """
     from minepy import MINE
     estimator = MINE()
     estimator.compute_score(data, x**2)
     score = estimator.mic()
     print(score)
Ejemplo n.º 60
-1
def performMIC(transposed_list, threshold=0.6):
    """Return every pair of series whose MIC exceeds ``threshold``.

    Args:
        transposed_list: sequence of series; each unordered pair is scored
            once with ``MINE(alpha=0.6, c=15)``.
        threshold: exclusive lower bound on MIC for a pair to be reported.
            Defaults to 0.6, the value that used to be hard-coded.

    Returns:
        A list of dicts ``{'x': i, 'y': j, 'mic': score}`` with ``i < j``,
        ordered by ascending ``i`` and then ``j``.  Empty when fewer than two
        series are given.
    """
    from itertools import combinations

    mic_scores = []
    # combinations() yields (i, j) with i < j, in the same order as the
    # nested index loops it replaces.
    for first, second in combinations(range(len(transposed_list)), 2):
        mine = MINE(alpha=0.6, c=15)
        mine.compute_score(transposed_list[first], transposed_list[second])
        score = mine.mic()  # evaluated once, reused for test and result
        if score > threshold:
            mic_scores.append({'x': first, 'y': second, 'mic': score})
    return mic_scores