コード例 #1
0
def McOne(data, label, r):
    """MIC-based feature filter.

    Keeps the features (rows of `data`) whose MIC with `label` is at least
    `r`, then walks the survivors in descending relevance order and drops
    any feature that is more strongly related to an already-kept feature
    than to the label (redundancy elimination).

    Returns the selected rows of `data`.
    """
    print("McOne start...")
    values = data.values
    num_features = data.shape[0]
    mic_with_label = [0] * num_features

    # relevance pass: MIC of every feature against the label
    kept = []
    for idx in range(num_features):
        estimator = MINE()
        estimator.compute_score(values[idx], label)
        mic_with_label[idx] = estimator.mic()
        if mic_with_label[idx] >= r:
            kept.append(idx)

    # strongest features first so they act as redundancy anchors
    kept.sort(key=lambda f: mic_with_label[f], reverse=True)

    # redundancy pass: drop a weaker feature when its MIC with an anchor
    # exceeds its own MIC with the label
    anchor = 0
    while anchor < len(kept):
        probe = anchor + 1
        while probe < len(kept):
            estimator = MINE()
            estimator.compute_score(values[kept[anchor]], values[kept[probe]])
            if estimator.mic() >= mic_with_label[kept[probe]]:
                del kept[probe]
            else:
                probe += 1
        anchor += 1
    return data.iloc[kept]
コード例 #2
0
    def fit(self, X, y):
        """mRMR-style greedy feature selection using MIC.

        Selects `self.n` features (default: half of the feature count) by
        repeatedly adding the feature maximizing
        I(xi, y) - beta * sum_{xj in phi} I(xi, xj).

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        y : ndarray of shape (n_samples,)

        Stores the selected indices in ``self.phi`` and the remaining
        indices in ``self.features``.
        """
        n = self.n
        beta = self.beta
        verbose = self.verbose
        if n is None:
            # bug fix: was X.shape[0] (sample count); the comment and the
            # selection loop both need half of the *features*
            n = int(X.shape[1] / 2)

        features = np.arange(X.shape[1]).tolist()
        best_mi = -np.inf
        X_hat = 0
        for xi in features:
            m = MINE()
            m.compute_score(X[:, xi], y)
            # compute I(xi, y) and keep the arg-max feature
            mi_xi_y = m.mic()
            if best_mi < mi_xi_y:
                best_mi = mi_xi_y  # bug fix: best_mi was never updated,
                                   # so the "best" feature was just the last
                X_hat = xi
        phi = [X_hat]
        features.remove(X_hat)
        # greedily grow phi, penalizing redundancy with already-chosen features
        while len(phi) < n:
            mi_scores = np.zeros(len(features))
            for xi_idx, xi in enumerate(features):
                m = MINE()
                m.compute_score(X[:, xi], y)
                # relevance term I(xi, y)
                mi_xi_y = m.mic()
                sum_mi_xi_xj = 0
                for xj in phi:
                    # redundancy term I(xi, xj)
                    m = MINE()
                    m.compute_score(X[:, xi], X[:, xj])
                    sum_mi_xi_xj += m.mic()
                mi_scores[xi_idx] = mi_xi_y - beta * sum_mi_xi_xj
                if verbose >= 2:
                    # bug fix: Python 2 print statement -> print() function
                    print("mi_scores for xi:{xi}, xj:{xj} is {mi_scores}".format(
                        xi=xi, xj=xj, mi_scores=mi_scores[xi_idx]))

            X_hat = np.argmax(mi_scores)
            if verbose == 1:
                print("X_hat is {X_hat}".format(X_hat=X_hat))
            X_hat = features[X_hat]
            phi.append(X_hat)
            features.remove(X_hat)
        self.phi = phi
        self.features = features
コード例 #3
0
ファイル: rat.py プロジェクト: mcreddy91/Network-Classifier
def _evaluate_single(data, target_feature):
    """Return a list with the MIC between `target_feature` and each column
    of `data`."""
    mine = MINE(alpha=0.3, c=15)
    scores = []
    for col in range(data.shape[1]):
        mine.compute_score(target_feature, data[:, col])
        scores.append(mine.mic())
    return scores
コード例 #4
0
def calculate_mic(df, y):
    """Compute the MIC between every column of `df` and the target `y`.

    Returns a pandas Series indexed by column name."""
    estimator = MINE()
    scores = {}
    for name in df.columns:
        estimator.compute_score(df.loc[:, name], y.values)
        scores[name] = estimator.mic()
    return pd.Series(scores)
コード例 #5
0
	def MIC_plot(self, x, y, numRows, numCols, plotNum, x_name, y_name, filename):
		"""Scatter-plot x against y annotated with Pearson r and the MIC.

		Draws into subplot `plotNum` of a `numRows` x `numCols` grid and
		returns the axes.  MIC additionally captures non-linear association,
		which matters when categorical features are part of the model
		(otherwise Pearson/Kendall/Spearman would suffice)."""
		# bug fix: Python 2 print statement -> print() function
		print("Pearson product-moment correlation coefficients np.corrcoef(x=", x_name, ", y=", y_name, "): ", np.corrcoef(x, y))
		r = np.around(np.corrcoef(x, y)[0, 1], 1)  # Pearson product-moment correlation coefficients.
		# TODO: compute cov matrix for each one-hot encoding variable of the
		# categorical feature with MINE's Mutual Information coefficient

		plt.figure(figsize=(33, 5), frameon=True)
		mine = MINE(alpha=0.6, c=15, est="mic_approx")
		mine.compute_score(x, y)
		mic = np.around(mine.mic(), 1)
		ax = plt.subplot(numRows, numCols, plotNum)
		ax.set_xlim(xmin=min(x) + 1, xmax=max(x) + 1)
		ax.set_ylim(ymin=min(y) + 1, ymax=max(y) + 1)
		ax.set_title('Pearson r=%.1f\nMIC=%.1f Features %s and %s in %s' % (r, mic, x_name, y_name, filename), fontsize=10)
		ax.set_frame_on(False)
		ax.axes.get_xaxis().set_visible(True)
		ax.axes.get_yaxis().set_visible(True)
		ax.plot(x, y, '*')
		plt.xlabel('X')
		plt.ylabel('Y')
		return ax
コード例 #6
0
def mic(dataset: pd.DataFrame, labels: np.array) -> dict:
    """Return {feature_name: MIC(feature, labels)} for every column of
    `dataset`."""
    result = {}
    for name, column in dataset.items():
        estimator = MINE()
        estimator.compute_score(column.values.ravel(), labels)
        result[name] = estimator.mic()
    return result
コード例 #7
0
def find_best_n_features_mic(n=8, out_path=''):
    """Select the `n` features of the global matrix `x` with the largest MIC
    against the global target `y` and write them, with the label appended as
    the last column, to '<out_path>mic_best_<n>.csv'."""
    # compute the MIC of every feature column against the target
    estimator = MINE(alpha=0.6, c=15, est="mic_approx")
    scores = []
    for col in range(x.shape[1]):
        estimator.compute_score(x[:, col], y)
        scores.append(estimator.mic())

    # pick the n largest scores, masking each winner with NaN
    best_positions = []
    best_scores = []
    for _ in range(n):
        winner = np.nanargmax(scores)
        best_positions.append(winner)
        best_scores.append(copy.deepcopy(scores[winner]))
        scores[winner] = np.nan

    print('Found', n, 'features with largest MIC, whose positions are:')
    print(best_positions)
    print()
    print('The MIC of these features are:')
    print(best_scores)
    print()

    selected = x[:, best_positions]
    print('Shape of features selected:', selected.shape)
    selected_with_label = pd.DataFrame(
        np.concatenate([selected, y.reshape(len(y), 1)], axis=1))

    out_path = out_path + 'mic_best_' + str(n) + '.csv'
    selected_with_label.to_csv(out_path, header=None, index=None)
コード例 #8
0
def get_mic(x, y):
    """Return (MIC, Pearson r) for the vector pair (x, y)."""
    pearson_r = np.corrcoef(x, y)[0, 1]
    estimator = MINE(alpha=0.4, c=15, est='mic_e')
    estimator.compute_score(x, y)
    return estimator.mic(), pearson_r
コード例 #9
0
ファイル: Preprocess.py プロジェクト: wang-mumu/open-code
def micfliter(data, rate):
    """MIC feature selection.

    Builds the pairwise MIC matrix of the columns of `data`, then marks a
    column for dropping when its MIC with any earlier column exceeds `rate`.

    Arguments:
        data: pandas DataFrame of numeric features.
        rate: float in range (0, 1); redundancy threshold.
    Return:
        list of column names to drop.
    """
    m = MINE()
    micmatrix = []
    for colx in data.columns:
        micResult = []
        for coly in data.columns:
            m.compute_score(np.array(data[coly]), np.array(data[colx]))
            micResult.append(m.mic())
        micmatrix.append(micResult)

    micmatrix = pd.DataFrame(micmatrix, columns=data.columns)
    # keep only the strict upper triangle so each pair is considered once;
    # bug fix: np.bool was removed in NumPy 1.24 -- use the builtin bool
    upper = micmatrix.where(np.triu(np.ones(micmatrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > rate)]

    return to_drop
コード例 #10
0
def mutual_infomation_rank(col_names, X, y, topK=10):
    '''
    Feature importance ranking by mutual information (MIC).

    :param col_names: feature names, list
    :param X: feature matrix, numpy 2D array
    :param y: label vector, numpy array
    :param topK: number of top features to print
    :return: DataFrame of features sorted by mutual information, descending
    '''

    # MIC is slow to compute, so subsample before scoring
    original_size = len(y)
    sampling_size = 2000 if original_size > 2000 else original_size
    X, y = resample(X, y, random_state=0, n_samples=sampling_size)

    mine = MINE(alpha=0.6, c=15, est="mic_approx")

    scores = []
    for i in range(0, len(col_names)):
        mine.compute_score(X[:, i], y)
        scores.append(mine.mic())

    result_df = pd.DataFrame({'name': col_names, 'mutual_information': scores})

    result_df = result_df[['name', 'mutual_information'
                           ]].sort_values('mutual_information',
                                          ascending=False)

    # bug fix: Python 2 print statements -> print() function
    print("size={m} sampling={s} features={n} top{k} rank for MINE testing:"
          .format(m=original_size, s=sampling_size, n=len(col_names), k=topK))
    print(result_df.head(topK))
    return result_df
コード例 #11
0
def mic_method(data_x, data_y, feat_labels, mic_threshold, is_split=1):
    """Feature selection pipeline: fill NaNs, min-max scale, optionally slice
    out the non-one-hot columns, then keep the `mic_threshold` best features
    by the chi2 test (intended for classification problems).

    Returns (selected_data_x, data_y) as ndarrays.
    """
    # fill missing values (mean-imputation alternative kept for reference)
    # data_x = data_x.fillna(data_x.mean())
    data_x = data_x.fillna(0)
    data_x = data_x.values
    # min-max scaling; must run after NaN filling, and yields an ndarray
    scaler = MinMaxScaler()
    data_x = scaler.fit_transform(data_x)
    # turn the labelled DataFrame into a plain ndarray for the model
    data_y = data_y.values

    if is_split == 1:
        # columns [:30] and [454:] are one-hot encoded and are excluded here
        data_x_mid = data_x[:, 30:454]
    else:
        data_x_mid = data_x

    # Select the K best features.
    # NOTE(review): the original also called MINE().compute_score(data_x_mid,
    # data_y) here, but minepy's compute_score requires 1-D inputs, so that
    # call raised on any 2-D matrix and the function never returned; removed.
    # bug fix: SelectKBest was fitted twice; fit once and reuse the selector.
    selector = SelectKBest(chi2, k=mic_threshold).fit(data_x_mid, data_y)
    selected_data_x = selector.transform(data_x_mid)
    return selected_data_x, data_y
コード例 #12
0
    def mutual_information(self,
                           X,
                           Y,
                           title=None,
                           nbins_X=50,
                           nbins_Y=50,
                           noise_sigma='all'):
        """Print association statistics between X and Y.

        Reports the MIC (minepy), the mutual information I (pyentropy, after
        quantising X and Y into nbins_X/nbins_Y levels), and linear-regression
        r^2 / p / slope.  `title`, if given, is printed first; `noise_sigma`
        is only echoed in the output line.
        """
        #import pdb; pdb.set_trace()
        # mask of positions where neither X nor Y is NaN
        no_nans_idx = np.logical_not(np.logical_or(np.isnan(X), np.isnan(Y)))
        Xq, _, _ = pyentropy.quantise(X[no_nans_idx], nbins_X)
        Yq, _, _ = pyentropy.quantise(Y[no_nans_idx], nbins_Y)
        s = pyentropy.DiscreteSystem(Yq, (1, nbins_Y), Xq, (1, nbins_X))
        s.calculate_entropies()

        # MINE -- NOTE(review): computed on the raw flattened arrays, *not*
        # the NaN-filtered ones; confirm MIC should see unfiltered data here.
        mine = MINE()
        mine.compute_score(X.flatten(), Y.flatten())

        # Linear regression on the NaN-filtered data
        slope, intercept, r, p, stderr = \
                scipy.stats.linregress(X[no_nans_idx], Y[no_nans_idx])

        #import pdb; pdb.set_trace()
        if title is not None:
            print(title)
        print(" MIC/MI/r^2/p/slope for %s:\t%.3f\t%.3f\t%s\t%s\t%s" %
              (noise_sigma, mine.mic(), s.I(), r**2, p, slope))
def mic(points):
    """MIC between the two coordinate columns of an (n, 2) point array."""
    columns = np.transpose(points)
    estimator = MINE()
    estimator.compute_score(columns[0], columns[1])
    score = estimator.mic()
    del columns
    return score
コード例 #14
0
def MicEvaluate(dataX, dataY, name, pre_indices):
    '''
    Compute the MIC between each conditional attribute (feature) and the
    decision attribute (label).

    :param dataX: 2-D feature array (samples x features)
    :param dataY: label array; reshaped to 1-D internally
    :param name: array of feature names, indexable by the original indices
    :param pre_indices: mapping from current to original feature indices
    :return: (original indices sorted by ascending MIC, MIC array,
              {feature name: MIC} dict)
    '''
    dataY = dataY.reshape(1, -1)[0]
    nFeatures = len(dataX[0])
    print("输入特征数为:", nFeatures)
    # bug fix: "[] * nFeatures" is just [] -- the multiplication was a no-op
    coorArray = []
    mine = MINE(alpha=0.6, c=15)
    for i in range(0, nFeatures):
        column = [x[i] for x in dataX]
        mine.compute_score(column, dataY)
        coorArray.append(abs(mine.mic()))
    print("上一层留下的每个特征的最大互信息系数", coorArray)
    coorIndex = np.argsort(coorArray)
    # map the sorted positions back to the original feature indices
    coorIndex_ = [pre_indices[i] for i in coorIndex]
    coorArray = np.array(coorArray)
    print("MIC相关系数:")

    print("特征:", dict(zip(name[coorIndex_], coorArray[coorIndex])))
    name_coorArray = dict(zip(name[coorIndex_], coorArray[coorIndex]))
    return coorIndex_, coorArray, name_coorArray
コード例 #15
0
def calMIC(data):
    """Print the MIC between mileage (col 1) and weight (col 2) for the
    rows of each vehicle id 2..6."""
    for veh_id in range(2, 7):
        estimator = MINE(alpha=0.6, c=15)
        subset = data[data.veh == veh_id]
        miles = subset.iloc[:, 1]
        weight = subset.iloc[:, 2]
        estimator.compute_score(miles, weight)
        print("Without noise:", "MIC", estimator.mic())
コード例 #16
0
def calculateMIC(dataFileArray, data_mark=None, neadNorm=False):
    """For each data file, compute the MIC between every feature column and
    the students' final scores, then print the features whose MIC exceeds
    the mean MIC, in descending order."""
    mic_map = {}
    for dataFileName in dataFileArray:
        if data_mark is None:
            data_mark = DATA_MARK
        _fileName = os.path.join(data_mark, dataFileName)
        student_data, headerArray = load_data_from_file(_fileName)

        _score_map = get_final_score_map()
        _score_array = [_score_map[record[0]] for record in student_data]

        featureCount = len(headerArray) - 1

        if neadNorm:
            _score_array = normizeDataSet(_score_array)

        # MIC of every feature column against the score column
        m = MINE()
        for index in range(1, featureCount + 1):
            dataArray = getOneColumn(student_data, index)
            if neadNorm:
                dataArray = normizeDataSet(dataArray)
            m.compute_score(dataArray, _score_array)
            mic_map[headerArray[index]] = m.mic()

    sorted_list = sorted(mic_map.items(), key=lambda item: item[1], reverse=True)
    threhold = np.mean(list(mic_map.values()))
    for header, value in sorted_list:
        if value > threhold:
            print(header, value)
コード例 #17
0
def get_correlation(dataset, target, features=None):
    """MIC of each feature column against the `target` column.

    :param dataset: DataFrame (or anything convertible to one).
    :param target: name of the target column inside `dataset`.
    :param features: optional iterable of column names; defaults to all.
    :return: dict with 'numerical' (name -> MIC, descending) and 'object'
             (non-numeric names -> NaN) sections.
    :raises ValueError: if `target` is None.
    """
    if target is None:
        raise ValueError('corr() need target value')
    if not isinstance(dataset, pd.DataFrame):
        dataset = pd.DataFrame(dataset)
    if not features:
        # bug fix: the default was a mutable `set([])`; use None and build here
        features = set(dataset.columns)
    numerical = {}
    text = {}
    num_types = (np.dtype('float64'), np.dtype('int64'), np.dtype('bool'))
    target = dataset[target]
    mine = MINE()
    for col in features:
        if dataset.dtypes[col] in num_types:
            # bug fix: compare dtypes with ==, not `is` -- identity of dtype
            # objects is an implementation detail
            if dataset.dtypes[col] == np.dtype('bool'):
                dataset[col] = dataset[col].astype(int, copy=False)
            mine.compute_score(dataset[col], target)
            numerical[col] = mine.mic()
        else:
            text[col] = np.nan
    return {
        'numerical':
        dict(sorted(numerical.items(), key=lambda d: d[1], reverse=True)),
        'object':
        dict(sorted(text.items(), key=lambda d: d[1], reverse=True))
    }
コード例 #18
0
def compute_MIC(x, y, alpha=0.6, c=15, all_metrics=False):
    """Return MIC(x, y); when `all_metrics` is True also return the fitted
    MINE estimator so further statistics can be read from it."""
    from minepy import MINE
    estimator = MINE(alpha, c)
    estimator.compute_score(x, y)
    return (estimator.mic(), estimator) if all_metrics else estimator.mic()
コード例 #19
0
    def MIC(self, x, y):
        """Maximal information coefficient of x and y (mic_approx estimator)."""
        estimator = MINE(alpha=0.6, c=15, est="mic_approx")
        estimator.compute_score(x, y)
        return estimator.mic()
コード例 #20
0
 def mine():
     """Remove from `uncorrelated` every column whose MIC with the labels
     reaches `correlation_threshold`."""
     # bug fix: the original removed items from `uncorrelated` while
     # iterating it directly, which skips the element following each
     # removal (or raises for set-like containers); iterate a snapshot.
     for column in tqdm(list(uncorrelated), desc="Running MINE test",
                        dynamic_ncols=True, leave=False):
         estimator = MINE()
         estimator.compute_score(epigenomes[column].values.ravel(),
                                 labels.values.ravel())
         if estimator.mic() >= correlation_threshold:
             uncorrelated.remove(column)
コード例 #21
0
def MIC():
    """Print the MIC between each training feature and the label for the
    dataset loaded from 'new_dataset.csv'."""
    train_x, train_y = MLdata('new_dataset.csv')
    estimator = MINE()
    n_features = train_x.shape[1]
    print(n_features)
    for idx in range(n_features):
        estimator.compute_score(train_x[:, idx], train_y)
        print(idx, estimator.mic())
コード例 #22
0
ファイル: predict.py プロジェクト: wilsonact/predict
def feature_scoring(X, Y):
    """Score features x1..x36 with several selectors (linear models,
    stability selection, RFE, random forest, F-test, MIC), print a
    per-method ranking table, and append a "Mean" column averaging all
    methods.

    :param X: DataFrame of features (converted to an ndarray internally)
    :param Y: target vector
    """
    names = ["x%s" % i for i in range(1, 37)]
    ranks = {}

    X = X.values[:, :]
    # NOTE(review): `normalize=` was removed from LinearRegression and
    # RandomizedLasso was removed from scikit-learn entirely; this function
    # only runs against an old sklearn pin.
    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

    # stop the search when 5 features are left (they will get equal scores)
    rfe = RFE(lr, n_features_to_select=5)
    rfe.fit(X, Y)
    # bug fix: on Python 3, map() returns a lazy iterator; materialize it
    ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)

    rf = RandomForestRegressor()
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f, names)

    print('startMIC')
    mine = MINE()
    mic_scores = []

    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        mic_scores.append(mine.mic())
        print(i)
    ranks["MIC"] = rank_to_dict(mic_scores, names)

    print('finish MIc')

    # average every method's rank per feature into a "Mean" pseudo-method
    r = {}
    for name in names:
        r[name] = round(
            np.mean([ranks[method][name] for method in ranks.keys()]), 2)
    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    print("\t%s" % "\t".join(methods))
    for name in names:
        print("%s\t%s" % (name, "\t".join(
            map(str, [ranks[method][name] for method in methods]))))
コード例 #23
0
def mic(x, y):
    """
    :param x: first variable
    :param y: second variable
    :return: (MIC of x and y, constant 0.5 placeholder)
    """
    estimator = MINE()
    estimator.compute_score(x, y)
    return estimator.mic(), 0.5
コード例 #24
0
def mic_hq(X, y, cut=0.2):
    """Boolean mask over the columns of X: True where the column's MIC
    with y falls below `cut`."""
    from minepy import MINE
    estimator = MINE()
    n_features = X.shape[1]
    mask = np.zeros(n_features, dtype=bool)
    for col in range(n_features):
        estimator.compute_score(X[:, col], y)
        mask[col] = estimator.mic() < cut
    return mask
コード例 #25
0
 def _entropy_select(self):
     """
     Mutual-information (MIC) feature selection: score every column of
     self.x against self.y, then keep the top-k ids.
     """
     # NOTE(review): the score array is sized by self.sample_num but is
     # filled once per *column* of self.x (one score per feature) --
     # confirm that sample_num really equals the feature count here.
     m = MINE()
     mic_array = np.zeros(self.sample_num)
     for i, x in enumerate(self.x.T):
         m.compute_score(x, self.y)
         mic_array[i] = m.mic()
     self._get_top_k_ids(mic_array)
コード例 #26
0
def toolkit_mic(arr0, arr1, alpha=0.6, c=15):
    """Return the MIC between the two sequences.

    :param arr0: first sequence (array-like)
    :param arr1: second sequence (array-like)
    :param alpha: MINE alpha parameter (grid-size exponent)
    :param c: MINE c parameter (clumps factor)
    """
    np_temp0 = np.array(arr0)
    np_temp1 = np.array(arr1)

    # bug fix: alpha and c were hard-coded to 0.6/15, silently ignoring
    # the function's arguments
    mine = MINE(alpha=alpha, c=c, est="mic_approx")
    mine.compute_score(np_temp0, np_temp1)

    return mine.mic()
コード例 #27
0
def MIC(X, y):
    """Return a list with the MIC of each column of X against y."""
    scores = []
    for col in range(X.shape[1]):
        estimator = MINE()
        estimator.compute_score(X[:, col], y)
        scores.append(estimator.mic())
    return scores
コード例 #28
0
ファイル: calcMINE.py プロジェクト: YukiMitsuta/shs4py
def main():
    """
    Compute pairwise MIC/TIC between COLVAR columns gathered from
    ./run*/COL* files (time window 10000-50000), averaging each pair's
    statistics over 10 random subsamples of 10,000 rows, and append the
    results to ./minedata.csv (MPI rank 0 only prints/writes).
    """
    #mine = MINE()
    #hsic_lasso = HSICLasso()
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    root = 0

    # collect rows whose time stamp t lies in [10000, 50000]
    collist = []
    for colname in glob.glob("./run*/COL*"):
        for line in open(colname):
            if "#" in line:
                continue
            line = line.split()
            t = float(line[0])
            if t < 10000.0:
                continue
            elif 50000.0 < t:
                # files are time-ordered, so stop at the window's end
                break
            collist.append(line)
        #break
    #MINEs, TICs =  pstats(collist)
    #print(TICs)
    # score every column pair (column 0 is the time stamp, hence the +1)
    for i in range(len(collist[0])-1):
        for j in range(i+1, len(collist[0])-1):
        #for j in range(i+1, i + 5):
            miclist = []
            ticlist = []
            # average over 10 random subsamples to tame the cost of MINE
            for _ in range(10):
            #if True:
                colpart = random.sample(collist, 10000)
                #colpart = collist
                x = np.array([a[i+1] for a in colpart], dtype=float)
                y = np.array([a[j+1] for a in colpart], dtype=float)
                #xy = np.array([x,y])
                #mine = MINE()
                mine = MINE(est="mic_e")
                mine.compute_score(x,y)
                miclist.append(mine.mic())
                ticlist.append(mine.tic())
            #miclist = comm.gather(mine.mic(), root=0)
            #ticlist = comm.gather(mine.tic(), root=0)

            #hsic_lasso.input(xy, np.array([0,1]))
            #hsic_lasso.input(np.array([[1, 1, 1], [2, 2, 2]]), np.array([0, 1]))
            #hsic_lasso.regression(5)
            #hsic_lasso.classification(10)
            #print(hsic_lasso.dump())
            if rank == root:
                print("%s,%s, %s, %s"%(i,j,np.mean(miclist),np.mean(ticlist)), flush = True)
                with open("./minedata.csv", "a") as wf:
                    wf.write("%s, %s, %s, %s\n"%(i,j,np.mean(miclist),np.mean(ticlist)))
コード例 #29
0
ファイル: distance.py プロジェクト: khemlalnirmalkar/halla
def mic(X, Y):
    """MIC between X and Y after dropping pairs containing a missing value.

    Returns (mic, None); the second slot is a placeholder for a p-value."""
    clean_X, clean_Y = remove_pairs_with_a_missing(X, Y)
    try:
        import minepy
        from minepy import MINE
    except ImportError:
        sys.exit("CRITICAL ERROR:2 Unable to import minepy package." + 
            " Please check your install.")
    estimator = MINE(alpha=0.6, c=15)
    estimator.compute_score(clean_X, clean_Y)
    return estimator.mic(), None
コード例 #30
0
def MIC(features, labels):
    """Return a list with the MIC of every column of `features` against
    the flattened `labels`."""
    estimator = MINE()
    flat_labels = labels.flatten()
    scores = []
    for col in range(features.shape[1]):
        estimator.compute_score(features[:, col], flat_labels)
        scores.append(estimator.mic())
    return scores