Code example #1
File: micAnalysis.py Project: subrata4096/regression
def chooseIndependantInputVariables(inArr):
    selected_input_indexes = []
    for i in range(inArr.shape[1]):
        doSelect = True
        for j in range(i):

            # Subrata: the "break" below is commented out, so only independant inputs are selected;
            # re-enable it to skip the filter and keep every input
            #break

            x = inArr[:,i]
            y = inArr[:,j]
            inputFeatureName1 = getInputParameterNameFromFeatureIndex(i)
            inputFeatureName2 = getInputParameterNameFromFeatureIndex(j)
            x_scaled = preprocessing.scale(x)
            y_scaled = preprocessing.scale(y)
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(x_scaled, y_scaled)
            print "Correlation between ", inputFeatureName1, inputFeatureName2, " is ", mine.mic()
            if(float(mine.mic()) >= 0.99):
                doSelect = False
                print "\n ***** ==> will NOT select ", inputFeatureName1, " as it correlates with ", inputFeatureName2, "\n"
        #end for
        if(doSelect):
            selected_input_indexes.append(i)

    return selected_input_indexes
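Below is a minimal, self-contained sketch of the same MIC-based redundancy filter, for readers who want to run the idea outside the project. It is illustrative only: the project's helpers (getInputParameterNameFromFeatureIndex and friends) are replaced by bare column indexes, and each candidate is compared against the columns already kept rather than against every earlier column.

import numpy as np
from minepy import MINE
from sklearn import preprocessing

def select_uncorrelated_columns(arr, mic_threshold=0.99):
    # keep a column only if its MIC with every previously kept column stays below the threshold
    selected = []
    for i in range(arr.shape[1]):
        keep = True
        for j in selected:
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(preprocessing.scale(arr[:, i]),
                               preprocessing.scale(arr[:, j]))
            if mine.mic() >= mic_threshold:
                keep = False
                break
        if keep:
            selected.append(i)
    return selected

rng = np.random.RandomState(0)
a = rng.rand(200)
data = np.column_stack([a, 2 * a + 3, rng.rand(200)])  # column 1 is a linear copy of column 0
print(select_uncorrelated_columns(data))  # expected: [0, 2]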
Code example #2
def calculateCorrelationBetweenVectors(x, y):
    #x = scipy.array([-0.65499887,  2.34644428, 3.0])
    #y = scipy.array([-1.46049758,  3.86537321, 21.0])
    #The Pearson correlation coefficient measures the linear relationship between two datasets.
    #Strictly speaking, Pearson correlation requires that each dataset be normally distributed.
    #Like other correlation coefficients, this one varies between -1 and +1, with 0 implying no correlation.
    #Correlations of -1 or +1 imply an exact linear relationship.

    #The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets.
    #The p-values are not entirely reliable but are probably reasonable for datasets larger than 500 or so.
    #corr, p_value = pearsonr(x, y)

    #truncate both vectors to the common length so MINE sees arrays of equal size
    commonSize = min(len(x), len(y))

    #NOTE: sorting x and y independently breaks their pairing, so the MIC below
    #compares the two sorted profiles (distribution shapes), not the point-by-point
    #relationship of the original vectors
    x_sorted = np.sort(x)[:commonSize]
    y_sorted = np.sort(y)[:commonSize]

    x_scaled = preprocessing.scale(x_sorted)
    y_scaled = preprocessing.scale(y_sorted)

    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(x_scaled, y_scaled)
    corr = float(mine.mic())
    return corr
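A hedged illustration of the caveat flagged in the NOTE above (not from the original project): because the vectors are sorted independently, two unrelated samples drawn from similar distributions still score a near-perfect MIC, while their unsorted, paired values do not.

import numpy as np
from minepy import MINE
from sklearn import preprocessing

rng = np.random.RandomState(0)
x = rng.rand(500)
y = rng.rand(500)  # independent of x
mine = MINE(alpha=0.6, c=15)
mine.compute_score(preprocessing.scale(np.sort(x)), preprocessing.scale(np.sort(y)))
print(mine.mic())  # close to 1: both sorted profiles are near-uniform ramps
mine.compute_score(preprocessing.scale(x), preprocessing.scale(y))
print(mine.mic())  # small: the paired values are actually independent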
Code example #3
File: rat.py Project: adrinjalali/Network-Classifier
def _evaluate_single(data, target_feature):
    mine = MINE(alpha=0.4, c=15)
    MICs = list()
    for i in range(data.shape[1]):
        mine.compute_score(target_feature, data[:, i])
        MICs.append(mine.mic())
    return MICs
Code example #4
    def execute(self, symbol):
        """
        :param symbol: the symbol in which we are looking for correlations
        :type symbol: :class:`netzob.Common.Models.Vocabulary.AbstractField.AbstractField`
        """

        (attributeValues_headers, attributeValues) = self._generateAttributeValuesForSymbol(symbol)
        symbolResults = []

        # MINE computation of each field's combination
        for i, values_x in enumerate(attributeValues[:-1]):
            for j, values_y in enumerate(attributeValues[i + 1 :]):
                mine = MINE(alpha=0.6, c=15)
                mine.compute_score(numpy.array(values_x), numpy.array(values_y))
                mic = round(mine.mic(), 2)
                if mic > float(self.minMic):
                    # We add the relation to the results
                    (x_fields, x_attribute) = attributeValues_headers[i]
                    # j counts from 0 within the slice that starts at i + 1
                    (y_fields, y_attribute) = attributeValues_headers[i + 1 + j]
                    # The relation should not apply on the same field
                    if len(x_fields) == 1 and len(y_fields) == 1 and x_fields[0].id == y_fields[0].id:
                        continue
                    pearson = numpy.corrcoef(values_x, values_y)[0, 1]
                    if not numpy.isnan(pearson):
                        pearson = round(pearson, 2)
                    relation_type = self._findRelationType(x_attribute, y_attribute)
                    self._debug_mine_stats(mine)
                    self._logger.debug(
                        "Correlation found between '"
                        + str(x_fields)
                        + ":"
                        + x_attribute
                        + "' and '"
                        + str(y_fields)
                        + ":"
                        + y_attribute
                        + "'"
                    )
                    self._logger.debug("  MIC score: " + str(mic))
                    self._logger.debug("  Pearson score: " + str(pearson))
                    id_relation = str(uuid.uuid4())
                    symbolResults.append(
                        {
                            "id": id_relation,
                            "relation_type": relation_type,
                            "x_fields": x_fields,
                            "x_attribute": x_attribute,
                            "y_fields": y_fields,
                            "y_attribute": y_attribute,
                            "mic": mic,
                            "pearson": pearson,
                        }
                    )
        return symbolResults
Code example #5
File: MINE.py Project: wawltor/Preudential
def mine_features(data, features):
    print '...'
    # iterate over index pairs instead of mutating the list mid-loop (which skips elements),
    # so each unordered pair of features is scored exactly once
    for pos, X_hat_idx in enumerate(features):
        for xi_idx in features[pos + 1:]:
            m = MINE()
            X_hat = data[X_hat_idx].values
            xi = data[xi_idx].values
            m.compute_score(X_hat, xi)
            I_X_hat_xi = m.mic()
            if I_X_hat_xi > 0.10:
                print 'I({X_hat_idx},{xi_idx}): {I_X_hat_xi}'.format(X_hat_idx=X_hat_idx, xi_idx=xi_idx, I_X_hat_xi=I_X_hat_xi)
Code example #6
def calcMICReg(df, target, col):
    """
    Return the MIC between one column of df and the regression target;
    categorical columns are target-mean encoded first (see the branch below).
    """
    m = MINE()
    if df[col].dtype.name == "category":
        g = df.groupby(by=[col])['_target_variable_'].mean()
        g = g.to_dict()
        X = df[col].values
        X = [g[x] for x in X]
    else:
        X = df[col].values
    m.compute_score(X, target)

    return {col: m.mic()}
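The category branch above target-encodes the column (each level is replaced by the mean of the target over that level) because MINE needs numeric input. A tiny self-contained illustration of that encoding step, with made-up data and the same _target_variable_ column name:

import pandas as pd
from minepy import MINE

df = pd.DataFrame({
    "color": pd.Categorical(["r", "g", "r", "b", "g", "b"]),
    "_target_variable_": [1.0, 2.0, 1.2, 3.0, 2.1, 2.9],
})
g = df.groupby(by=["color"])['_target_variable_'].mean().to_dict()
encoded = [g[v] for v in df["color"]]  # each category becomes its target mean
m = MINE()
m.compute_score(encoded, df["_target_variable_"].values)
print(m.mic())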
Code example #7
def perform_mic_1p(p_sequences, p, cutoff=0.5, out_folder=''):
    p_sequences_t = transpose(array([list(z) for z in p_sequences])).tolist()
    mic_scores = []
    for counter1 in range(0, len(p_sequences_t) - 1):
        for counter2 in range(counter1 + 1, len(p_sequences_t)):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(p_sequences_t[counter1], p_sequences_t[counter2])
            if (mine.mic() > float(cutoff)):
                mic_score = {}
                mic_score['x'] = p+'_'+str(counter1+1)
                mic_score['y'] = p+'_'+str(counter2+1)
                mic_score['p1'] = p
                mic_score['p2'] = p
                mic_score['weight'] = format(mine.mic(), '.3f')
                mic_scores.append(mic_score)
    write_mics_to_csv(mics=mic_scores, p1=p, p2=p, cutoff=cutoff, out_folder=out_folder)
    return mic_scores
Code example #8
File: sec_minepy.py Project: zhmz90/Daily
def mysubplot(x, y, numRows, numCols, plotNum,
              xlim=(-4, 4), ylim=(-4, 4)):

    r = np.around(np.corrcoef(x, y)[0, 1], 1)
    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(x, y)
    mic = np.around(mine.mic(), 1)
    ax = plt.subplot(numRows, numCols, plotNum,
                     xlim=xlim, ylim=ylim)
    ax.set_title('Pearson r=%.1f\nMIC=%.1f' % (r, mic),fontsize=10)
    ax.set_frame_on(False)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    ax.plot(x, y, ',')
    ax.set_xticks([])
    ax.set_yticks([])
    return ax
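A hypothetical driver for mysubplot (assumed, not part of the original file, and relying on the imports the excerpt omits: numpy, matplotlib.pyplot, and minepy.MINE). It places a linear and a nonlinear relationship side by side; Pearson's r only rewards the first, while MIC rewards both.

import numpy as np
import matplotlib.pyplot as plt
from minepy import MINE

np.random.seed(0)
n = 500
x = np.random.normal(0, 1, n)
mysubplot(x, x + np.random.normal(0, 1, n), 1, 2, 1)                # linear: r and MIC both high
mysubplot(x, np.cos(4 * x) + np.random.normal(0, 0.2, n), 1, 2, 2)  # nonlinear: r near 0, MIC clearly higher
plt.show()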
Code example #9
File: model.py Project: heyuhere/ml-exercise
  def select_feature(self, data, label, threshold=0.7):
    """
    Perform feature selection by maximum information coefficient that can capture both linear and non-linear relationships.
    """
    selected = []

    from minepy import MINE
    mine = MINE()

    for i, col in enumerate(data):
      print 'feature selection: %d/%d %s' % (i, data.shape[1], col)
      mine.compute_score(data[col], label)
      if mine.mic() > threshold:
        selected.append(col)

    print '%d out of %d features were selected' % (len(selected), data.shape[1])

    return selected
Code example #10
def get_corrcoef(X):
    div = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.05, random_state=0)
    for train, test in div:
        X = X[np.array(test)]
        break

    X = X.transpose()
    pcc = np.ones((X.shape[0], X.shape[0]))
    m = MINE()
    # feat_groups = [[0], [1, 2, 3], [4, 5, 7, 8, 9, 10], [6],
    #                list(range(11, 24)), list(range(24, 29)), list(range(29, 34))]
    t = time()
    for i in range(0, 1):
        for j in range(1, 20):
            m.compute_score(X[i], X[j])
            pcc[i, j] = pcc[j, i] = m.mic()  # np.corrcoef(X[i], X[j])[0, 1]
            print(i, j, pcc[i, j], time()-t)
    np.savetxt(os.path.join(CODE_PATH, 'feat_sim_pcc_2.csv'), pcc, fmt='%.3f', delimiter=',')
    print('Done with computing PCC,', 'using', time()-t, 's')
Code example #11
def perform_mic_2p(p1_sequences, p2_sequences, p1, p2, cutoff=0.5):
    mic_scores = []
    p1_sequences_t = transpose(array([list(z) for z in p1_sequences])).tolist()
    p2_sequences_t = transpose(array([list(z) for z in p2_sequences])).tolist()

    for idx1, record1 in enumerate(p1_sequences_t):
        for idx2, record2 in enumerate(p2_sequences_t):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(record1, record2)
            if (mine.mic() > float(cutoff)):
                mic_score = {}
                mic_score['x'] = p1+'_'+str(idx1+1)
                mic_score['y'] = p2+'_'+str(idx2+1)
                mic_score['p1'] = p1
                mic_score['p2'] = p2
                mic_score['weight'] = mine.mic()
                mic_scores.append(mic_score)

    #print('computed ', len(mic_scores), ' mics for ', p1, p2, 'for cutoff ', cutoff)
    return mic_scores
Code example #12
def perform_mic_2p(p1_sequences, p2_sequences, p1, p2, cutoff=0.5):
    mic_scores = []
    p1_sequences_t = transpose(array([list(z) for z in p1_sequences])).tolist()
    p2_sequences_t = transpose(array([list(z) for z in p2_sequences])).tolist()

    for idx1, record1 in enumerate(p1_sequences_t):
        for idx2, record2 in enumerate(p2_sequences_t):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(record1, record2)
            if (mine.mic() > float(cutoff)):
                mic_score = {}
                mic_score['x'] = p1+'_'+str(idx1+1)
                mic_score['y'] = p2+'_'+str(idx2+1)
                mic_score['p1'] = p1
                mic_score['p2'] = p2
                mic_score['weight'] = format(mine.mic(), '.3f')
                mic_scores.append(mic_score)

    write_mics_to_csv(mics=mic_scores, p1=p1, p2=p2, cutoff=cutoff)
    return mic_scores
Code example #13
File: base.py Project: MattNolanLab/ei-attractor
    def mutual_information(self, X, Y, title=None, nbins_X=50, nbins_Y=50,
            noise_sigma='all'):
        #import pdb; pdb.set_trace()
        no_nans_idx = np.logical_not(np.logical_or(np.isnan(X), np.isnan(Y)))
        Xq, _, _ = pyentropy.quantise(X[no_nans_idx], nbins_X)
        Yq, _, _ = pyentropy.quantise(Y[no_nans_idx], nbins_Y)
        s = pyentropy.DiscreteSystem(Yq, (1, nbins_Y), Xq, (1, nbins_X))
        s.calculate_entropies()

        # MINE
        mine = MINE()
        mine.compute_score(X.flatten(), Y.flatten())

        # Linear regression
        slope, intercept, r, p, stderr = \
                scipy.stats.linregress(X[no_nans_idx], Y[no_nans_idx])

        #import pdb; pdb.set_trace()
        if title is not None:
            print(title)
        print(" MIC/MI/r^2/p/slope for %s:\t%.3f\t%.3f\t%s\t%s\t%s" %
                (noise_sigma, mine.mic(), s.I(), r**2, p, slope))
Code example #14
def run_feature_selection():
    X, Y = get_dataset()
    X = np.array(X)
    Y = np.array(Y)
    # print len(X[0])
    # names = ["x%s" % i for i in range(1, 8)]
    names = [
        'Age', 'Sex', 'Sleep quality', 'Sleep latency', 'Sleep time',
        'Sleep efficiency', 'Sleep disorder', 'Hypnagogue',
        'Daytime dysfunction'
    ]
    # names = ['Sex', 'Sleep quality', 'Sleep latency', 'Sleep time', 'Sleep efficiency', 'Sleep disorder', 'Hypnagogue', 'Daytime dysfunction']
    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

    # stop the search when 5 features are left (they will get equal scores)
    rfe = RFE(ridge, n_features_to_select=5)
    rfe.fit(X, Y)
    ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)

    # rf = RandomForestRegressor()
    # rf.fit(X, Y)
    # ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    # f, pval = f_regression(X, Y, center=True)
    # # print len(f),len(names)
    # ranks["Corr."] = rank_to_dict(f, names)
    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        m = mine.mic()
        mic_scores.append(m)

    ranks["MIC"] = rank_to_dict(mic_scores, names)

    r = {}
    for name in names:
        r[name] = round(
            np.mean([ranks[method][name] for method in ranks.keys()]), 2)

    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    print("\t%s" % "\t".join(methods))
    for name in names:
        print("%s\t%s" % (name, "\t".join(
            map(str, [ranks[method][name] for method in methods]))))
    return ranks
Code example #15
def MIC(data):
    mine = MINE(alpha=0.6, c=15)  # alpha bounds the grid resolution: m*n < B, with B = n^alpha
    data_mic = MIC_matirx(data, mine)
    print(data_mic)
    return data_mic
Code example #16
def mic(x, y):
	m = MINE()
	print x
	print y
	m.compute_score(x, y)
	return (m.mic(), 0.5)
Code example #17
 def interactionV(self, data):
     from minepy import MINE
     m = MINE()
     # square the input series itself (the original referenced an undefined `x`)
     m.compute_score(data, data**2)
     print(m.mic())
Code example #18
from minepy import MINE
import numpy as np

m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x ** 2)
print(m.mic())  # 1.0000000000000009

import numpy as np
from scipy.stats import pearsonr

np.random.seed(0)
size = 300
x = np.random.normal(0, 1, size)
print("Lower noise", pearsonr(x, x + np.random.normal(0, 1, size)))
print("Higher noise", pearsonr(x, x + np.random.normal(0, 10, size)))
'''
Lower noise (0.7182483686213841, 7.32401731299835e-49)
Higher noise (0.057964292079338155, 0.3170099388532475)
'''
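For contrast (not part of the original snippet), applying pearsonr to a symmetric nonlinear relationship shows the gap MIC closes: r collapses toward zero even though the dependence is exact, while the MIC computed above for x versus x ** 2 is essentially 1.

x = np.random.uniform(-1, 1, 10000)
print("Quadratic", pearsonr(x, x ** 2))  # r is near 0 despite an exact functional dependence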
Code example #19
 def get_mic(self):
     m = MINE()
     m.compute_score(self.x, self.y)
     return m.mic()
Code example #20
File: featureSelect.py Project: qq563902455/IJCAI_18
def micCompute(x, y):
    m = MINE()
    m.compute_score(x, y)
    return m.mic()
Code example #21
 def calculate_mic(self, x, y):
     mine = MINE()
     mine.compute_score(x, y)
     score = mine.mic() / len(np.unique(x))
     return score
Code example #22
File: feature.py Project: XiaoqingWang/almsc
#!/usr/bin/env python

from datetime import datetime
from datetime import timedelta
import numpy as np
from minepy import MINE
from base import N_SERIES_DAYS, DECAY_RATE, TIME_FORMAT
from database import connect
from extract import getBorder, get_n_days, get_n_artists, get_n_series, getSeries, getSeriesRange

_mine = MINE()


def genFeatureDefination(name):
    db = connect()
    cursor = db.cursor()
    sql = 'alter table mars_tianchi_features drop column %s' % name
    try:
        cursor.execute(sql)
    except Exception, e:
        print 'ignore drop column error !!!'
    sql = 'alter table mars_tianchi_features add column (%s float)' % name
    cursor.execute(sql)

    beginXTrain = getBorder(isBegin=True, isX=True, isTrain=True)
    endXTrain = getBorder(isBegin=False, isX=True, isTrain=True)
    beginXTest = getBorder(isBegin=True, isX=True, isTrain=False)
    endXTest = getBorder(isBegin=False, isX=True, isTrain=False)
    beginYTrain = getBorder(isBegin=True, isX=False, isTrain=True)
    endYTrain = getBorder(isBegin=False, isX=False, isTrain=True)
    n_X_days = get_n_days(isX=True, isTrain=True)
Code example #23
# The first row is a description and should be deleted.
data = np.delete(data, 0, axis=0)

# total_heat = data[:, 908]
total_elec = data[:, 907]
# heat_areas = data[:, 829]

# consumption per area as output Y
# Y = np.divide(total_heat, heat_areas)
Y = total_elec
# print min(Y)
# print min(heat_areas)
# print data[0, 908], data[2, 829]
index = []

fo = open('MIC_results.txt', 'w')
for i in range(1, len(data[0])):
    X = data[:, i]
    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(X, Y)
    # index.append(mine.mic())
    fo.write(str(mine.mic()) + '\n')
    print (i+1, mine.mic())

fo.close()
Code example #24
# -*- coding: utf-8 -*-
"""
Created on Thu Apr  4 18:17:29 2019

@author: Chiaki
"""
import sys
sys.path.append("D:\Python\current")

import numpy as np
import pandas as pd
import copy as npcopy
from minepy import MINE
mine = MINE(alpha=0.6, c=15, est='mic_approx')

import time

class MyPSO(object):
    """
    pop_size: swarm (population) size
    factor_size: dimensionality of a solution
    wmax, wmin: value range of the inertia weight w
    c1, c2: learning factors
    iter: maximum number of iterations
    """
    def __init__(self,pop_size,factor_size,wmax,wmin,c1,c2,iter,data):
        self.pop_size = pop_size
        self.factor_size = factor_size
        self.wmax = wmax
        self.wmin = wmin
        self.c1 = c1
Code example #25
# Financial indicators and time series
# ==============================================================
import talib
import arch
import statsmodels
import patsy


# Machine learning and deep learning
# ==============================================================
# pip install minepy
from minepy import MINE                        # computes correlation
import numpy as np
import numexpr
m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x**2)
#print m.mic()

import pybrain
import sklearn

# pip install heamy
import heamy                                   # for model blending/stacking (used together with sklearn)

# pip install lightgbm
import lightgbm

import xgboost
Code example #26
#stop the search when 5 features are left (they will get equal scores)
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X,Y)
ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)

#RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X,Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

#f_regression
f, pval  = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)

#MINE
mine = MINE()
mic_scores = []
for i in range(X.shape[1]):
    mine.compute_score(X[:,i], Y)
    m = mine.mic()
    mic_scores.append(m)

ranks["MIC"] = rank_to_dict(mic_scores, names)

#----statistics--out---------
r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name] 
                             for method in ranks.keys()]), 2)

methods = sorted(ranks.keys())
Code example #27
xgb_7844 = pd.read_csv('xgb_7844.csv')
svm_771 = pd.read_csv('svm_771.csv')
xgb_787 = pd.read_csv('xgb_787.csv')

fs = ['xgb_7844', 'svm_771', 'xgb_787']

res = []
res.append(pd.read_csv('xgb_7844.csv').score.values)
res.append(pd.read_csv('svm_771.csv').score.values)
res.append(pd.read_csv('xgb_787.csv').score.values)

cm = []
for i in range(3):
    tmp = []
    for j in range(3):
        m = MINE()
        m.compute_score(res[i], res[j])
        tmp.append(m.mic())
    cm.append(tmp)

import numpy as np
import matplotlib.pyplot as plt


def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(3)
    plt.xticks(tick_marks, fs, rotation=45)
    plt.yticks(tick_marks, fs)
Code example #28
def train_and_analyse(_X, _y, features):
	X = _X
	Y = _y
	cv_l = cross_validation.KFold(X.shape[0], n_folds=10,
								shuffle=True, random_state=1)
	ranks = {}

	lr = LinearRegression(normalize=True)
	lr.fit(X, Y)
	ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features)
	

	ridge = RidgeCV(cv=cv_l)
	ridge.fit(X, Y)
	ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features)
	
	# Run the RandomizedLasso: we use paths going down to .1*alpha_max
	# to avoid exploring the regime in which very noisy variables enter
	# the model
	lasso = LassoCV(cv=cv_l, n_jobs=2, normalize=True, tol=0.0001, max_iter=170000)
	lasso.fit(X, Y)
	ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), features)
	
	rlasso = RandomizedLasso(alpha=lasso.alpha_, random_state=42)
	rlasso.fit(X, Y)
	ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), features)
	
	rfe = RFE(lr, n_features_to_select=1)
	rfe.fit(X,Y)
	ranks["RFE"] = rank_to_dict(np.array(rfe.ranking_).astype(float), features, order=-1)

	rf = RandomForestRegressor(n_estimators=500)
	rf.fit(X,Y)
	ranks["RF"] = rank_to_dict(rf.feature_importances_, features)

	f, pval  = f_regression(X, Y, center=True)
	ranks["Corr."] = rank_to_dict(np.nan_to_num(f), features)

	mine = MINE()
	mic_scores = []
	for i in range(X.shape[1]):
	   mine.compute_score(X[:,i], Y)
	   m = mine.mic()
	   mic_scores.append(m)
	
	ranks["MIC"] = rank_to_dict(mic_scores, features) 

	r = {}
	for name in features:
	    r[name] = round(np.mean([ranks[method][name] 
	                             for method in ranks.keys()]), 2)
	 
	methods = sorted(ranks.keys())
	ranks["Mean"] = r
	methods.append("Mean")
	
	ranks = pd.DataFrame(ranks)

	selection_feature = ranks[ranks.Mean > 0.12].index.values

	return ranks, selection_feature
Code example #29
def MIC(a, b):  # return the MIC between a and b
    mine = MINE()
    mine.compute_score(a, b)
    MIC = mine.mic()
    # print('MIC=',MIC)
    return MIC
Code example #30
File: data_analyse.py Project: Jun321/kaggle_project
def train_and_analyse(_X, _y, sno, ino):
	X = _X.copy()
	Y = _y
	features = X.columns.values
	cv_l = cross_validation.KFold(X.shape[0], n_folds=5,
								shuffle=True, random_state=1)
	ranks_linear = {}
	ranks_nonlinear= {}
	ranks_path = {}
	ranks = {}

	selection_feature = []

	time_feature_1 = [
					'date2j'
					]
	time_feature_2 = [
					'day',
					'month',
					'year'
					]

	time_feature_3 = [
					'is_2012', 
					'is_2013', 
					'is_2014',
					'fall', 
					'winter', 
					'spring',
					'summer'
					]

	time_feature_4 = [
					'weekday',
					'is_weekend', 
					'is_holiday', 
					'is_holiday_weekday', 
					'is_holiday_weekend',
					]

	time_feature_5 = [
					'MemorialDay', 
					'MothersDay', 
					'BlackFridayM3',
					'BlackFriday1', 
					'NewYearsDay', 
					'IndependenceDay', 
					'VeteransDay',
					'BlackFriday2', 
					'NewYearsEve', 
					'BlackFriday3', 
					'ChristmasDay',
					'BlackFridayM2', 
					'ThanksgivingDay', 
					'Halloween', 
					'EasterSunday',
					'ChristmasEve', 
					'ValentinesDay', 
					'PresidentsDay', 
					'ColumbusDay',
					'MartinLutherKingDay', 
					'LaborDay', 
					'FathersDay', 
					'BlackFriday'
					]

	weather_feature =  [
					'high_precip', 
					'preciptotal', 
					'snowfall', 
					'high_snow',
					'avgspeed', 
					'windy', 
					'temp_missing', 
					'tavg', 
					'hot', 
					'cold', 
					'frigid',
					'thunder', 
					'snowcode', 
					'raincode'
					]
	temp = time_feature_1 + time_feature_2 + time_feature_3 + time_feature_4 + time_feature_5
	X_f1 = X[temp].values
	# lr = LinearRegression(normalize=True)
	# lr.fit(X, Y)
	# ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features)
	
	f, pval  = f_regression(ut.get_processed_X_A(X_f1), Y, center=True)
	ranks["F_regr"] = pd.Series(rank_to_dict(np.nan_to_num(f), temp))
	# print('asd')
	# mi = mutual_info_regression(ut.get_processed_X_A(X_f1), Y)
	# mi /= np.max(mi)
	# ranks['MI'] = Pd.Series()

	mine = MINE()
	mic_scores = []
	for i in range(ut.get_processed_X_A(X_f1).shape[1]):
	   mine.compute_score(ut.get_processed_X_A(X_f1)[:,i], Y)
	   m = mine.mic()
	   mic_scores.append(m)
	
	ranks["MIC"] = pd.Series(rank_to_dict(mic_scores, temp))
	


	# ridge.fit(X, Y)
	# ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features)
	
	# Run the RandomizedLasso: we use paths going down to .1*alpha_max
	# to avoid exploring the regime in which very noisy variables enter
	# the model
	# rlasso = RandomizedLasso(alpha='bic', normalize=True)
	# rlasso.fit(X_f1, Y)
	# ranks_linear["Stability"] = pd.Series(rlasso.scores_)

	# alpha_grid, scores_path = lasso_stability_path(X_f1, Y, random_state=42,
 #                                                   eps=0.00005, n_grid=500)
	# for alpha, score in zip(alpha_grid, scores_path.T):
	# 	ranks_path[alpha] = score
	# ranks_path = pd.DataFrame(ranks_path).transpose()
	# ranks_path.columns = temp
	# plt.figure()
	# ranks_path.plot()
	# plt.show()
	# selection_feature.extend(ranks_linear[ranks_linear.F_regr > 0.1].index.values.tolist())
	# selection_feature.extend(ranks_linear[ranks_linear.MIC > 0.1].index.values.tolist())
	# selection_feature.extend(ranks_linear[ranks_linear.Stability > 0.1].index.values.tolist())
#-------------------------------

	# rf = RandomForestRegressor(n_estimators=150, max_depth=4, n_jobs=4, random_state=1)
	rf = ut.get_regression_model('RandomForest', 0)
	scores = []
	for i in range(X_f1.shape[1]):
	 score = cross_val_score(rf, X_f1[:, i:i+1].astype(float), Y, scoring="r2", cv=ShuffleSplit(len(X_f1), 3, .3), n_jobs=2)
	 scores.append(round(np.mean(score), 3))

	ranks['RF'] = pd.Series(rank_to_dict(np.abs(scores), temp)) 

	ranks = pd.DataFrame(ranks)
	print(ranks)
	selection_feature.extend(ranks[ranks.RF > 0.1].index.values.tolist())
	selection_feature.extend(ranks[ranks.MIC >= 0.1].index.values.tolist())
	selection_feature.extend(ranks[ranks.F_regr >= 0.1].index.values.tolist())
#-------------------------------
	selection_feature = list(set(selection_feature))
	print(selection_feature)
	# ridge = RidgeCV(cv=cv_l)
	# rfe = RFE(ridge, n_features_to_select=1)
	# rfe.fit(X[selection_feature],Y)
	# ranks["RFE"] = pd.Series(rank_to_dict(np.array(rfe.ranking_).astype(float), selection_feature, order=1))
	# ranks = pd.DataFrame(ranks)
	# print(ranks)
	# r = {}
	# for name in features:
	#     r[name] = round(np.mean([ranks[method][name] 
	#                              for method in ranks.keys()]), 2)
	 
	# methods = sorted(ranks.keys())
	# ranks["Mean"] = r
	# methods.append("Mean")

	path = 'Analyse/store_{}/'.format(sno)
	mkdir_p(path)
	path += 'item_{}_(pair_analyse)'.format(ino)
	ranks.to_pickle(path)

	path += '.png'
	p.clf()
	p.cla()
	plt.figure(figsize=(16, 26))
	ranks.plot.barh(stacked=True)
	p.savefig(path, bbox_inches='tight', dpi=300)
	plt.close()

	return ranks, selection_feature
Code example #31
# and all POS samples are placed together before the NEG ones
X = np.array(df.values).T
# y, len(classes) bits,
# with 1 representing 'POS' and 0 representing 'NEG'
y = np.array([1] * pos_num + [0] * neg_num)

# data pre-processing
X = preprocessing.normalize(X)

# check availability of the output path
if not os.path.exists(output_path):
    os.mkdir(output_path)

# MIC calculation
if ft_num_limit > 500:  # cut down the feature number to below 500 by MIC calculation
    mine = MINE()
    mic_scores = []
    for i in range(ft_num_limit):
        mine.compute_score(X[:, i], y)
        mic_scores.append(mine.mic())
    top_fts_mic = sorted(list(zip(range(ft_num_limit), mic_scores)),
                         key=operator.itemgetter(1),
                         reverse=True)
    top_mic_pos = [x[0] for x in top_fts_mic[0:initial_ft_num]]
else:
    top_mic_pos = list(range(initial_ft_num))

# preprocessing end, record the time cost
preprocess_time = time.time()

# =========================================
Code example #32
import pandas as pd
import numpy as np
from minepy import MINE

df1 = pd.read_csv("/home/kei/document/experiments/ICTH2019/SY/UniSY2.csv")
df2 = pd.read_csv("/home/kei/document/experiments/ICTH2019/SK/UniSK1.csv")
#columns = ["LSholderX","LSholderY","LSholderZ"]
column = "LSholderX"
# take the column as a Series: compute_score needs 1-D numeric arrays,
# and a plain list would have no .shape for the print below
X = df1[column].values
Y = df2[column].values
mine = MINE()
print(X.shape)
mine.compute_score(X, Y)
print(mine.mic())
Code example #33
    def demo8():
        np.random.seed(0)
        size = 750
        X = np.random.uniform(0, 1, (size, 14))
        Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5) ** 2 + 10 * X[:, 3]
             + 5 * X[:, 4] + np.random.normal(0, 1))
        X[:, 10:] = X[:, :4] + np.random.normal(0, .025, (size, 4))

        names = ["x%s" % i for i in range(1, 15)]

        ranks = {}

        def rank_to_dict(ranks, names, order=1):
            minmax = MinMaxScaler()
            ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
            ranks = map(lambda x: round(x, 2), ranks)
            return dict(zip(names, ranks))

        lr = LinearRegression(normalize=True)
        lr.fit(X, Y)
        ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

        ridge = Ridge(alpha=7)
        ridge.fit(X, Y)
        ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

        lasso = Lasso(alpha=.05)
        lasso.fit(X, Y)
        ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)
        #
        # rlasso = RandomizedLasso(alpha=0.04)
        # rlasso.fit(X, Y)
        # ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

        # stop the search when 5 features are left (they will get equal scores)
        rfe = RFE(lr, n_features_to_select=5)
        rfe.fit(X, Y)
        ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)

        rf = RandomForestRegressor()
        rf.fit(X, Y)
        ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

        f, pval = f_regression(X, Y, center=True)
        ranks["Corr."] = rank_to_dict(f, names)

        mine = MINE()
        mic_scores = []
        for i in range(X.shape[1]):
            mine.compute_score(X[:, i], Y)
            m = mine.mic()
            mic_scores.append(m)

        ranks["MIC"] = rank_to_dict(mic_scores, names)

        r = {}
        for name in names:
            r[name] = round(np.mean([ranks[method][name]
                                     for method in ranks.keys()]), 2)

        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")

        print("\t%s" % "\t".join(methods))
        for name in names:
            print("%s\t%s" % (name, "\t".join(map(str, [ranks[method][name] for method in methods]))))
Code example #34
    def utilize_selection_method(self, options):
        self.parse_options(options)
        normalize_feature = self.normalize_feature(self.data_feature)
        feature_amount = len(self.data_feature[0])
        selection_result = {}
        logging.info('     Supervised Feature Selection : Start')
        if self.options['p'] == 1:
            widget = ['Calculating Pearson Correlation  : ', pb.Percentage(), ' ', pb.Bar(marker=pb.RotatingMarker()),
                      ' ', pb.ETA()]
            timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
            pearson_corr = []
            for n in range(0, feature_amount):
                tmp_pearson = pearsonr(normalize_feature[:, n], self.data_label)
                pearson_corr.append([abs(tmp_pearson[0]), n+1])
                timer.update(n)
            timer.finish()
            selection_result['pearson-correlation'] = sorted(pearson_corr, reverse=True)

        if self.options['r'] == 1:
            widget = ['Calculating Random Forest        : ', pb.Percentage(), ' ', pb.Bar(marker=pb.RotatingMarker()),
                      ' ', pb.ETA()]
            timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
            rf = RandomForestRegressor(n_estimators=20, max_depth=4)
            #rf.fit(normalize_feature, self.data_label)
            #rf_feature_score = rf.feature_importances_
            random_forest = []
            for n in range(0, feature_amount):
                score = cross_val_score(rf, normalize_feature[:, n:n+1], self.data_label, scoring="r2",
                                        cv=ShuffleSplit(len(normalize_feature), 3, .3))
                random_forest.append([round(np.mean(score), 3), n+1])
                #random_forest.append([rf_feature_score[n], n+1])
                timer.update(n)
            timer.finish()
            selection_result['random-forest'] = sorted(random_forest, reverse=True)

        if self.options['m'] == 1:
            widget = ['Calculating Mutual Information   : ', pb.Percentage(), ' ', pb.Bar(marker=pb.RotatingMarker()),
                      ' ', pb.ETA()]
            timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
            mutual_information = []
            mine = MINE()
            for n in range(0, feature_amount):
                mine.compute_score(normalize_feature[:, n], self.data_label)
                mutual_information.append([mine.mic(), n+1])
                timer.update(n)
            timer.finish()
            selection_result['mutual-information'] = sorted(mutual_information, reverse=True)

        if self.options['c'] == 1:
            widget = ['Calculating Chi Square           : ', pb.Percentage(), ' ', pb.Bar(marker=pb.RotatingMarker()),
                      ' ', pb.ETA()]
            timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
            chi_square = []
            compute_chi2 = chi2(normalize_feature, self.data_label)[0]
            for n in range(0, feature_amount):
                chi_square.append([compute_chi2[n], n+1])
                timer.update(n)
            timer.finish()
            selection_result['chi-square'] = sorted(chi_square, reverse=False)

        if self.options['k'] == 1:
            widget = ['Calculating Kendall Correlation  : ', pb.Percentage(), ' ', pb.Bar(marker=pb.RotatingMarker()),
                      ' ', pb.ETA()]
            timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
            kendall_correlation = []
            for n in range(0, feature_amount):
                tmp_kendall = kendalltau(normalize_feature[:, n], self.data_label)
                kendall_correlation.append([tmp_kendall[0], n+1])
                timer.update(n)
            timer.finish()
            selection_result['kendall-correlation'] = sorted(kendall_correlation, reverse=True)

        if self.options['s'] == 1:
            widget = ['Calculating Spearman Correlation : ', pb.Percentage(), ' ', pb.Bar(marker=pb.RotatingMarker()),
                      ' ', pb.ETA()]
            timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
            spearman_corr = []
            for n in range(0, feature_amount):
                tmp_spearman = spearmanr(normalize_feature[:, n], self.data_label)
                spearman_corr.append([abs(tmp_spearman[0]), n+1])
                timer.update(n)
            timer.finish()
            selection_result['spearman-correlation'] = sorted(spearman_corr, reverse=True)

        if self.options['f'] == 1:
            logging.info('   -----Calculating Fisher score---- ')
            f_score = fisher_score.fisher_score(normalize_feature, self.data_label)
            fisher = []
            for n in range(0, feature_amount):
                fisher.append([f_score[n], n+1])
            selection_result['fisher-score'] = sorted(fisher, reverse=True)
            logging.info('   -----Calculating Fisher score---- ==> Done')
        return selection_result
Code example #35
def maximal_information_coeffcient():
    m = MINE()
    x = np.random.uniform(-1, 1, 1000)
    m.compute_score(x, x**2 + 2)
    print m.mic()
Code example #36
rlasso.fit(X, Y)
ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

#stop the search when 5 features are left (they will get equal scores)
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X, Y)
ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)

rf = RandomForestRegressor()
rf.fit(X, Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

f, pval = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)

mine = MINE()
mic_scores = []

for i in range(X.shape[1]):
    mine.compute_score(X[:, i], Y)
    m = mine.mic()
    mic_scores.append(m)

ranks["MIC"] = rank_to_dict(mic_scores, names)
r = {}

for name in names:
    r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]),
                    2)

methods = sorted(ranks.keys())
Code example #37
File: mine.py Project: Cold7/MINEPY
parser = argparse.ArgumentParser()
parser.add_argument(
    '-s1',
    '--serie1',
    nargs='+',
    help='<Required> first series of numbers (usage: -s1 1 43 25 0)',
    required=True)
parser.add_argument(
    '-s2',
    '--serie2',
    nargs='+',
    help='<Required> second series of numbers (usage: -s2 1 43 25 0)',
    required=True)
parser.add_argument(
    "-a",
    "--alpha",
    help="float (0,1.0] the exponent in B(n) n^alpha (default 0.6)",
    default="0.6")
parser.add_argument(
    "-c",
    "--clumps",
    help=
    "float (> 0) determines how many more clumps there will be than columns in every partition. Default value is 15, meaning that when trying to draw x grid lines on the x-axis, the algorithm will start with at most 15*x clumps (default 15)",
    default="15")
args = parser.parse_args()

x = [float(v) for v in args.serie1]  # argparse yields strings; MINE needs numbers
y = [float(v) for v in args.serie2]
mine = MINE(alpha=float(args.alpha), c=float(args.clumps))
mine.compute_score(x, y)

print_stats(mine)
Code example #38
File: micAnalysis.py Project: subrata4096/regression
def doMICAnalysisOfInputVariables(inArr, targetArr, targetName, mic_score_threshold, input_indexes_uncorrelated_features, targetQualityMap=None):
    goodTargetMap = getGlobalObject("goodTargetMap")

    selected_inArr = []
    selected_inArr_indexes = []
    selected_originalColumn_indexes = []

    inColMap = getGlobalObject("inputColumnIndexToNameMapFromFile") #keys are column indexes, values are names

    numOfFeatures = 0
    try:
        numOfFeatures = inArr.shape[1]
    except:
        print "ERROR: \n", inArr
        exit(0)
    k = 0
    for featureIndex in range(numOfFeatures):
        # choose only the uncorrelated features as input
        if(featureIndex not in input_indexes_uncorrelated_features):
            continue

        x = inArr[:,featureIndex]
        x_scaled = preprocessing.scale(x)
        mine = MINE(alpha=0.6, c=15)
        mine.compute_score(x_scaled, targetArr)
        inputFeatureName = getInputParameterNameFromFeatureIndex(featureIndex)
        print_stats(mine, inputFeatureName, targetName, mic_score_threshold)
        if(targetQualityMap != None):
            targetQualityMap.append(float(mine.mic()))
        if(float(mine.mic()) >= mic_score_threshold):
            selected_inArr.append(x) #keep the input data column
            selected_inArr_indexes.append(k) #keep the index corresponding to that column
            colIdx = getColumnIndexFromFeatureIndex(featureIndex)
            selected_originalColumn_indexes.append(colIdx) #keep the original column index for that column
            #also mark the target itself as good; anomaly detection will only use these targets
            goodTargetMap[targetName] = True
            print "----------------- selected: ", inputFeatureName, colIdx, k
            k = k + 1

    selected_inArr = np.array(selected_inArr).transpose()
    return selected_inArr, selected_inArr_indexes, selected_originalColumn_indexes
Code example #39
File: fea_engineer.py Project: paradisees/test
def score_calculate(flag):
    # rows: the feature-selection algorithms; columns: the feature names
    algorithm = {}
    if flag=='whole':
        tmp_sta,tmp_rf,tmp_gbdt,tmp_extra={},{},{},{}
        for n in range(10):
            #stability
            rlasso = RandomizedLasso(random_state=n)
            rlasso.fit(data, mark)
            tmp_sta = add(tmp_sta,rank_to_dict(np.abs(rlasso.scores_), names,cv=True))

            #rf
            rf = RandomForestClassifier(random_state=n)
            rf.fit(data, mark)
            tmp_rf = add(tmp_rf,rank_to_dict(rf.feature_importances_, names,cv=True))

            #GBDT
            gbdt=GradientBoostingClassifier(random_state=n)
            gbdt.fit(data, mark)
            tmp_gbdt = add(tmp_gbdt, rank_to_dict(gbdt.feature_importances_, names, cv=True))

            #Extra
            model = ExtraTreesClassifier(random_state=n)
            model.fit(data, mark)
            tmp_extra = add(tmp_extra, rank_to_dict(model.feature_importances_, names, cv=True))

        algorithm["stability"],algorithm["RF"],algorithm["GBDT"],algorithm["Extra"] \
            = tmp_sta,tmp_rf,tmp_gbdt,tmp_extra
        #MIC
        mine = MINE()
        mic_scores = []
        res=[]
        for i in range(len(data[0])):
            for num in data:
                res.append(num[i])
            mine.compute_score(res, mark)
            m = mine.mic()
            mic_scores.append(m)
            res = []
        algorithm["MIC"] = rank_to_dict(mic_scores, names)

        #linear regression
        lr = LinearRegression(normalize=True)
        lr.fit(data, mark)
        algorithm["Linear"] = rank_to_dict(np.abs(lr.coef_), names)

        #ridge
        ridgecv = RidgeCV()
        ridgecv.fit(data, mark)
        #print(ridgecv.alpha_)
        ridge = Ridge(alpha=ridgecv.alpha_)
        ridge.fit(data, mark)
        algorithm["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

        #lasso
        lassocv = LassoCV()
        lassocv.fit(data, mark)
        #print(lassocv.alpha_)
        lasso = Lasso(alpha=lassocv.alpha_)
        lasso.fit(data, mark)
        algorithm["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

        #rfe
        log=LogisticRegression()
        rfe = RFE(log, n_features_to_select=10)
        rfe.fit(data, mark)
        algorithm["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)
        '''
        #F-test
        f, pval = f_classif(data, mark)
        algorithm["Corr"] = rank_to_dict(f, names)
        '''
    elif flag=='extra':
        model = ExtraTreesClassifier()
        model.fit(data, mark)
        algorithm["Extra"] = rank_to_dict(model.feature_importances_, names)
    elif flag=='gbdt':
        gbdt = GradientBoostingClassifier()
        gbdt.fit(data, mark)
        algorithm["GBDT"] = rank_to_dict(gbdt.feature_importances_, names)
    elif flag=='rf':
        rf = RandomForestClassifier()
        rf.fit(data, mark)
        algorithm["RF"] = rank_to_dict(rf.feature_importances_, names)
    r = {}
    for name in names:
        r[name] = round(np.mean([algorithm[method][name] for method in algorithm.keys()]), 4)
    methods = sorted(algorithm.keys())
    algorithm["Mean"] = r
    methods.append("Mean")

    content=[]
    for name in names:
        content.append([algorithm[method][name] for method in methods])
    fea_matrix = pd.DataFrame(content,index=names)
    fea_matrix.to_csv('/Users/hhy/Desktop/fea_importance_'+flag+'.csv',encoding='utf-8-sig',header=methods)
    return algorithm
Code example #40
File: AxProf.py Project: uiuc-arc/AxProf
def selectInputFeatures(configs,
                        inputGenerator,
                        igparams,
                        tunedFeatures,
                        error_function,
                        runner,
                        num_runs=5):

    permutation = [1, 2, 3, 4, 5]

    print('Starting the input feature selection process')
    # Build a set of configs that only change input parameters.
    # For the remaining parameters chose at random
    newConfigs = {}
    for key in configs.keys():
        if key in tunedFeatures:
            newConfigs[key] = configs[key]
        else:
            newConfigs[key] = [random.choice(configs[key])]
    configList = extractAllConfigs(newConfigs)

    tot_runs = num_runs * len(permutation) + num_runs * len(configList)
    print('Requires {} executions'.format(tot_runs))

    # Does permuting the data affect accuracy?
    result_set = []
    perm_feat = False
    tmp_config = random.choice(configList)._asdict()
    for perm in permutation:
        configIGParams = igparams(tmp_config, 0)
        inputData = inputGenerator(*configIGParams)
        new_inputs = inputData.copy()
        random.shuffle(new_inputs)
        writeDataToFile(new_inputs, "_axprof_temp_input")

        # Averaging over a set of runs.
        error_tot = 0
        for run in range(num_runs):
            results = runner("_axprof_temp_input", tmp_config)
            error_tot += error_function(new_inputs, results['acc'])
            sys.stdout.write('.')
            sys.stdout.flush()
        result_set.append(error_tot / num_runs)
    mine = MINE()
    mine.compute_score(permutation, result_set)
    perm_mic = mine.mic()
    if perm_mic > 0.9:
        perm_feat = True

    # Testing the other features
    result_set = {}
    for config in configList:
        # Setting the number to low value for now
        for input_num in range(5):
            configIGParams = igparams(config._asdict(), input_num)
            inputData = inputGenerator(*configIGParams)
            # write and score the freshly generated input with this loop's config
            # (not `new_inputs`/`tmp_config` left over from the permutation test above)
            writeDataToFile(inputData, "_axprof_temp_input")
            error_tot = 0
            for run in range(num_runs):
                sys.stdout.write('.')
                sys.stdout.flush()
                results = runner("_axprof_temp_input", config._asdict())
                error_tot += error_function(inputData, results['acc'])
            result_set[config] = error_tot / num_runs

    sys.stdout.write('\n')
    sys.stdout.flush()

    mics = {}
    for key in tunedFeatures:
        agg_y = {}
        for config in result_set:
            config_dict = config._asdict()
            if config_dict[key] in agg_y:
                agg_y[config_dict[key]].append(result_set[config])
            else:
                agg_y[config_dict[key]] = [result_set[config]]

        unique_x = list(agg_y.keys())
        y = []
        for x in unique_x:
            y.append(np.mean(agg_y[x]))
        mine = MINE()
        mine.compute_score(unique_x, y)
        mics[key] = mine.mic()

    # Removing the variations in features that are not important
    for key in tunedFeatures:
        if mics[key] < 0.9:
            current = configs[key]
            configs[key] = [random.choice(current)]

    # Printing the report
    print('----------------------------------------')
    print('The results of input feature selection: ')
    print("Permuting the input: (MIC: {})".format(perm_mic))
    for key in tunedFeatures:
        print("{}: (MIC: {})".format(key, mics[key]))
    print("Updated config list: ", configs)
    print('----------------------------------------')
    return configs
Code example #41
# feature reduction
# when I dummied the dataframe it exploded into over 16,000 features, which has no value on a data set this small
# I thought about the simplest way to reduce the features and keep what I needed, and below is what I came up with
dummyfraud_df.drop(list(dummyfraud_df.filter(regex = 'nameOrig')), axis = 1, inplace = True)
dummyfraud_df.drop(list(dummyfraud_df.filter(regex = 'nameDest')), axis = 1, inplace = True)

#  feature selection for building the model
print("######################## Below is my Pearson Cor ########################")
print(dummyfraud_df.corr(method='pearson')["isFraud"].sort_values())

# As I raise the sample size it becomes clear that the 'amount' feature has a high positive correlation with the isFraud feature;
# the challenge is that the other features do not seem to be highly correlated. I want to try the MIC process
# I used on the hotel data

print("######################## Below is my MIC Analysis ########################")
mine = MINE(alpha=0.6, c=15)
fraud_columns = list(dummyfraud_df)
print(fraud_columns)
for features in fraud_columns:
    mine.compute_score(dummyfraud_df[features], dummyfraud_df["isFraud"])
    print("The MIC for feature " + str(features) + " is " + str(mine.mic()))

# # Step 21.	To split the dataset into features and target variables, first create a variable for the feature columns
feature_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']
# Set X equal to the feature columns
X = dummyfraud_df[feature_cols]
# Set Y equal to the target variable
y = dummyfraud_df.isFraud
# Using the train_test_split() function, split the data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# number of samples in each set
Code example #42
File: MICAnalysis.py Project: Ernestyj/PyProj
dfi = readWSDIndexFile(baseDir, stockCodes[i], startYear, number)
dfi['R'] = R
print np.shape(df), np.shape(dfi)

allDF = pd.concat([df, dfi], axis=1)

scaler = preprocessing.MinMaxScaler()
X_Standard = scaler.fit_transform(df)
X_Standard_T = np.transpose(X_Standard)
Xi_Standard = scaler.fit_transform(dfi)
Xi_Standard_T = np.transpose(Xi_Standard)
X_ALL_Standard = scaler.fit_transform(allDF)
X_ALL_Standard_T = np.transpose(X_ALL_Standard)
print np.shape(X_ALL_Standard_T)

mine = MINE(alpha=0.6, c=15, est="mic_approx")
mics = []
# mine.compute_score(df['Close'].values, df['R'].values); print mine.mic()
# # for i in range(0,10):
# #     mine.compute_score(X_Standard_T[i], X_Standard_T[10])
# #     mics.append(mine.mic())
# #     print i, mine.mic()
# for i in [7,9]:
#     mine.compute_score(X_Standard_T[i], X_Standard_T[10])
#     mics.append(mine.mic())
#     print i, mine.mic()
# # for i in range(0,38):
# #     mine.compute_score(Xi_Standard_T[i], Xi_Standard_T[38])
# #     mics.append(mine.mic())
# #     print i, mine.mic()
# for i in range(0,7):
Code example #43
    n_selected = n

    x_selected = x[:, J >= J_sort[n_features - n_selected]]
    return x_selected, J >= J_sort[n_features - n_selected]


n_selected = 20  # adjustable: e.g. 1, 5, 10, 20, 50, 100
x_fisher_selected, fisher_boollist = fisher(x_train, y_train, n_selected)

logreg = linear_model.LogisticRegression(solver='lbfgs', max_iter=3000)
logreg.fit(x_fisher_selected, y_train)
x_fisher_test = x_test[:, fisher_boollist]
print("基于类间类内距离特征选择后,利用logstics回归的准确率是:", logreg.score(x_fisher_test, y_test))
logreg.fit(x_train, y_train)
print("直接利用logstics回归的准确率是:", logreg.score(x_test, y_test))
m = MINE()

mic = []
n_features = x_raw.shape[1]
for i in range(x_raw.shape[1]):
    x = x_train[:, i]
    m.compute_score(x, y_train)
    mic.append(m.mic())
mic_sorted = sorted(mic)
mic = np.array(mic)
x_mic_selected = x_train[:, mic >= mic_sorted[n_features - n_selected]]
mic_boollist = mic >= mic_sorted[n_features - n_selected]
x_mic_test = x_test[:, mic_boollist]
logreg.fit(x_mic_selected, y_train)
print("基于最大信息系数特征选择后,利用logstics回归的准确率是:", logreg.score(x_mic_test, y_test))
num = 0
Code example #44
File: select_mic.py Project: wawltor/Preudential
    def fit(self,X,y):
        # initialize phi and feature set
        # if number of features is not set, half of the features will be selected
        n = self.n
        beta = self.beta
        verbose = self.verbose
        if n is None:
            n = int(X.shape[1] / 2)  # half of the features (columns), as the comment above says

        features = np.arange(X.shape[1]).tolist()
        best_mi = -np.inf
        X_hat = 0
        for xi in features:
            m = MINE()
            m.compute_score(X[:, xi], y)
            # compute I(xi, y) and keep the xi with the highest score
            mi_xi_y = m.mic()
            if best_mi < mi_xi_y:
                best_mi = mi_xi_y  # remember the best score seen so far
                X_hat = xi
        phi = [X_hat]
        features.remove(X_hat)
        # get paris for elements in phi and features
        while len(phi)<n:
            mi_scores = np.zeros(len(features))
            for xi_idx,xi in enumerate(features):
                m = MINE()
                m.compute_score(X[:,xi],y)
                #compute I(xi,y)
                mi_xi_y = m.mic()
                sum_mi_xi_xj = 0
                for xj in phi:
                    # compute I(xi,xj) and save for further evaluation
                    m = MINE()
                    m.compute_score(X[:,xi],X[:,xj])
                    mi_xi_xj = m.mic()
                    sum_mi_xi_xj+=mi_xi_xj
                mi_scores[xi_idx] = mi_xi_y - beta*sum_mi_xi_xj
                if verbose>=2:
                    print "mi_scores for xi:{xi}, xj:{xj} is {mi_scores}".format(xi=xi,xj=xj,mi_scores=mi_scores[xi_idx])

            X_hat = np.argmax(mi_scores)
            if verbose==1:
                print "X_hat is {X_hat}".format(X_hat=X_hat)
            X_hat = features[X_hat]
            phi.append(X_hat)
            features.remove(X_hat)
        self.phi = phi
        self.features = features
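The fit method above is a greedy mRMR-style selector with MIC standing in for mutual information: a candidate xi is scored as MIC(xi, y) minus beta times the summed MIC between xi and every feature already in phi. A compact standalone sketch of the same criterion on synthetic data (illustrative; not the project's class):

import numpy as np
from minepy import MINE

def mic(a, b):
    m = MINE()
    m.compute_score(a, b)
    return m.mic()

def mrmr_mic(X, y, n, beta=1.0):
    # greedy mRMR: relevance to y minus beta * redundancy with the chosen set
    remaining = list(range(X.shape[1]))
    chosen = [max(remaining, key=lambda i: mic(X[:, i], y))]
    remaining.remove(chosen[0])
    while remaining and len(chosen) < n:
        scores = {i: mic(X[:, i], y) - beta * sum(mic(X[:, i], X[:, j]) for j in chosen)
                  for i in remaining}
        best = max(scores, key=scores.get)
        chosen.append(best)
        remaining.remove(best)
    return chosen

rng = np.random.RandomState(1)
X = rng.rand(300, 5)
X[:, 3] = X[:, 0]                       # a redundant copy that the penalty should reject
y = X[:, 0] + X[:, 1] + 0.1 * rng.randn(300)
print(mrmr_mic(X, y, n=2))              # expected: [0, 1], with the copy (3) penalized away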
Code example #45
rf = RandomForestClassifier()
rf.fit(data, mark)
algorithm["RF"] = rank_to_dict(rf.feature_importances_, names)

#GBDT
gbdt = GradientBoostingClassifier()
gbdt.fit(data, mark)
algorithm["GBDT"] = rank_to_dict(gbdt.feature_importances_, names)

#Extra
model = ExtraTreesClassifier()
model.fit(data, mark)
algorithm["Extra"] = rank_to_dict(model.feature_importances_, names)

#MIC
mine = MINE()
mic_scores = []
res = []
for i in range(len(data[0])):
    for num in data:
        res.append(num[i])
    mine.compute_score(res, mark)
    m = mine.mic()
    mic_scores.append(m)
    res = []
algorithm["MIC"] = rank_to_dict(mic_scores, names)

#linear regression
lr = LinearRegression(normalize=True)
lr.fit(data, mark)
algorithm["Linear"] = rank_to_dict(np.abs(lr.coef_), names)
Code example #46
File: mic.py Project: yoiooi/mic
from minepy import MINE
from scipy.stats import pearsonr, spearmanr
import nlcor
import numpy as np
import matplotlib.pyplot as plt
import time

mine = MINE()

n = 10000
m = 100  # 1% noise
start = time.time()  # record the start time

# x and y are 1-D float arrays
"""
Generate data at random, then check the correlations.
Reference - https://datascienceschool.net/view-notebook/ff367da95afc43ed8ae6ec30efc0fb9f/
"""
plt.figure(figsize=(8, 6))

#plt.subplot(231)
x1 = np.random.uniform(-50, 50, n)
#x1 = np.random.uniform(-50, 50, n)
y1 = 2 * x1**2 + np.random.uniform(-50, 50, n)
#plt.scatter(x1, y1)
#mine.compute_score(x1, y1)
#print("random - x1, y1", mine.mic())
#plt.title("MIC={0:0.3f}".format(mine.mic()))

#plt.subplot(232)
x2 = np.random.uniform(-50, 50, n)
Code example #47
res = []
res.append(pd.read_csv("./avg_xgbs_discret_feature_5.csv").score.values)
res.append(pd.read_csv("./R_7199.csv").score.values)
res.append(pd.read_csv("./rank_feature_xgb_ensemble.csv").score.values)
res.append(pd.read_csv("./avg_xgbs_discret_feature_10.csv").score.values)
res.append(pd.read_csv("./based_on_select_rank_feature.csv").score.values)
res.append(pd.read_csv("./xgb717.csv").score.values)
res.append(pd.read_csv("./725.csv").score.values)
res.append(pd.read_csv("./svm6938.csv").score.values)

cm = []
for i in range(8):
    tmp = []
    for j in range(8):
        m = MINE()
        m.compute_score(res[i], res[j])
        tmp.append(m.mic())
    cm.append(tmp)


import numpy as np
import matplotlib.pyplot as plt


def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(8)
    plt.xticks(tick_marks, fs, rotation=45)
Code example #48
File: CorrMatrixPipeStats.py Project: chiewoo/CAGMon
    logfiles = glob.glob(log_dir + '/*')  # use the log_dir variable, not the literal string 'log_dir'
    print "Directory exists:", log_dir
    for f in logfiles:
        os.remove(f)
    print "Removing all log files..."
else:
    print "Creating directory:", log_dir
    makedirs(log_dir)

sys.stdout = Log(sys.stdout, output_dir+'/'+mla+'_Report_'+str(GPS)+'_'+nfilename+'.log')

profile_filename=log_dir+'/'+'Profiling_'+mla+'_'+str(GPS)+'_'+nfilename+'.result'
prof=hotshot.Profile(profile_filename)
prof.start()

mine = MINE(alpha=0.6, c=15)

f_input1=np.loadtxt(input_file1+'.txt')
f_input2=np.loadtxt(input_file2+'.txt')

# orient each input so that the shorter axis (the channels) comes first
if len(f_input1) < len(f_input1.T):
    f1=f_input1
else:
    f1=f_input1.T
if len(f_input2) < len(f_input2.T):
    f2=f_input2
else:
    f2=f_input2.T

Mdim = len(f1)
Mat = np.zeros((Mdim, Mdim))
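The snippet breaks off after allocating the matrix; a plausible continuation (an assumption, not the original script) fills it with the pairwise MIC between channels of the two inputs:

for i in range(Mdim):
    for j in range(Mdim):
        mine.compute_score(f1[i], f2[j])
        Mat[i, j] = mine.mic()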
Code example #49
0
def MIC(x, y):
    """Maximal information coefficient between two 1-D arrays."""
    mine = MINE(alpha=0.6, c=5)
    mine.compute_score(x, y)
    return mine.mic()
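A quick illustrative check of the helper (exact values vary with the sample and with the alpha/c settings):

import numpy as np

x = np.random.uniform(-1, 1, 1000)
print(MIC(x, x**2))                            # high: MIC picks up the nonlinear dependence
print(MIC(x, np.random.uniform(-1, 1, 1000)))  # low for independent samples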
Code example #50
0
File: power.py  Project: Astrych/minepy
def f5():
    return np.sin(16*np.pi*x) + noise * (i/n_noise) * r

def f6():
    return x**0.25 + noise * (i/n_noise) * r   # 0.25 rather than 1/4 guards against integer division

def f7():
    return (2*np.random.binomial(1, 0.5, n)-1) * (np.sqrt(1-(2*x-1)**2)) \
         + (noise/4) * (i/n_noise) * r

def f8():
    return (x > 0.5) + noise * 5 * (i/n_noise) * r


ff = [f1, f2, f3, f4, f5, f6, f7, f8]
mine = MINE(alpha=mine_alpha, c=mine_c)
mic_power = np.empty((len(ff), n_noise))
gmic_power = np.empty((len(ff), n_noise))
r2_power = np.empty((len(ff), n_noise))
np.random.seed(0)
for i in range(1, n_noise+1):
    for j, f in enumerate(ff):
        mic_null, gmic_null, r2_null = [], [], []
        mic_alt, gmic_alt, r2_alt = [], [], []

        # null hypothesis
        for k in range(1, n_null+1):
            print(i, j, k)
            x = np.random.rand(n)
            r = np.random.randn(n)
            y = f()
Code example #51
0
                        x='y',
                        data=df,
                        order=df.sort_values('y', ascending=False).x.to_list(),
                        color='blue')
    for p in splot.patches:
        splot.annotate(format(p.get_width(), '.3f'),
                       (p.get_width(), p.get_y() + p.get_height() / 2.),
                       ha='center',
                       va='center',
                       xytext=(15, 0),
                       textcoords='offset points')


# MIC, TIC, dcor and RDC helpers
from minepy import MINE
m = MINE()


def mic(x, y):
    m.compute_score(x, y)
    return m.mic()


def tic(x, y):
    m.compute_score(x, y)
    return m.tic(norm=True)


from scipy.spatial.distance import pdist, squareform

Code example #52
0
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)  # 0.5 is a dummy p-value; MIC itself has none
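The fixed 0.5 stands in for a p-value so each result matches the (score, p-value) pair that scikit-learn's univariate selectors expect. A sketch of the wiring, where mic_score_func is an assumed adapter rather than part of the original:

import numpy as np
from sklearn.feature_selection import SelectKBest

def mic_score_func(X, y):
    scores, pvalues = zip(*[mic(X[:, i], y) for i in range(X.shape[1])])
    return np.array(scores), np.array(pvalues)

selector = SelectKBest(mic_score_func, k=2)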
Code example #53
0
File: mine_py.py  Project: zhmz90/Daily
import numpy as np
from minepy import MINE

def print_stats(mine):
    print("MIC", mine.mic())                   # strength of the association
    print("MAS", mine.mas())                   # maximum asymmetry score (non-monotonicity)
    print("MEV", mine.mev())                   # maximum edge value (closeness to a function)
    print("MCN (eps=0)", mine.mcn(0))          # minimum cell number (grid complexity)
    print("MCN (eps=1-MIC)", mine.mcn_general())

x = np.linspace(0, 1, 10)
y = np.sin(2*x) + x
print(x)
print(y)
mine = MINE(alpha=0.6, c=15)
mine.compute_score(x, y)

print "Without noise:"
print_stats(mine)
print

np.random.seed(0)
y += np.random.uniform(-1, 1, x.shape[0])  # add some noise
mine.compute_score(x, y)

print "With noise:"
print_stats(mine)
Code example #54
0
File: power.py  Project: minepy/minepy
def f5():
    return np.sin(16*np.pi*x) + noise * (i/n_noise) * r

def f6():
    return x**0.25 + noise * (i/n_noise) * r   # 0.25 rather than 1/4 guards against integer division

def f7():
    return (2*np.random.binomial(1, 0.5, n)-1) * (np.sqrt(1-(2*x-1)**2)) \
         + (noise/4) * (i/n_noise) * r

def f8():
    return (x > 0.5) + noise * 5 * (i/n_noise) * r


ff = [f1, f2, f3, f4, f5, f6, f7, f8]
mine_approx = MINE(alpha=mine_alpha, c=mine_c, est="mic_approx")
mine_e = MINE(alpha=mine_alpha, c=mine_c, est="mic_e")

mic_approx_power = np.empty((len(ff), n_noise))
mic_e_power = np.empty((len(ff), n_noise))
tic_e_power = np.empty((len(ff), n_noise))
r2_power = np.empty((len(ff), n_noise))

np.random.seed(0)
for i in range(1, n_noise+1):
    for j, f in enumerate(ff):
        print "Noise: %d, function: %d" % (i, j)

        mic_approx_null, mic_e_null, tic_e_null, r2_null = [], [], [], []
        mic_approx_alt, mic_e_alt, tic_e_alt, r2_alt = [], [], [], []
Code example #55
0
import numpy as np
from scipy.stats import pearsonr

size = 300
x = np.random.normal(0, 1, size)  # normal(mean, stdev, size): Gaussian samples
print("Lower noise", pearsonr(x, x + np.random.normal(0, 1, size)))
print("Higher noise", pearsonr(x, x + np.random.normal(0, 10, size)))

# Obvious drawback: as a feature-ranking mechanism, Pearson correlation is only
# sensitive to linear relationships; even when two variables have a one-to-one
# relationship, it can be close to 0.
a = np.random.uniform(-1, 1, 100000)   # uniform(low, high, size): uniform samples
print(pearsonr(a, a**2)[0])


# 1.2 Mutual information and the maximal information coefficient (MIC), range [0, 1].
# Mutual information is awkward to use directly for feature selection; MIC first
# searches for an optimal discretization, then turns the mutual information into a
# score on [0, 1]. minepy provides the MIC implementation.

from minepy import MINE
m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x**2)
print(m.mic())


# 1.3 Distance correlation, range [0, 1].
# Distance correlation was designed to overcome the weakness of the Pearson
# coefficient: in the x vs. x**2 example, a Pearson coefficient of 0 does not let
# us conclude that the variables are independent (they may be nonlinearly
# related), but a distance correlation of 0 does imply independence.

def dist(x, y):
    # 1-D only: matrix of pairwise absolute differences
    return np.abs(x[:, None] - y)
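The snippet breaks off here; for completeness, a minimal distance-correlation sketch built on the dist helper above, following the standard Székely definition (this completion is an assumption, not the original file):

def dcor(x, y):
    # Double-centre the pairwise-distance matrices, then average the products.
    a, b = dist(x, x), dist(y, y)
    A = a - a.mean(axis=0) - a.mean(axis=1)[:, None] + a.mean()
    B = b - b.mean(axis=0) - b.mean(axis=1)[:, None] + b.mean()
    dcov2_xy = (A * B).mean()
    dcov2_xx = (A * A).mean()
    dcov2_yy = (B * B).mean()
    return np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx * dcov2_yy))

z = np.random.uniform(-1, 1, 1000)
print(dcor(z, z**2))   # clearly non-zero, unlike the Pearson r for the same pair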
    
Code example #56
0
def maximal_information_coefficient(self):
    mine = MINE()
    mine.compute_score(self.var1, self.var2)
    m = mine.mic()
    return m
Code example #57
0
def MICvalue():
    choice_user = request.get_json()  # the selection sent by the front-end user
    flag = True
    # data = request.get_json() #bytes
    # print(data)
    choice0 = {}
    choice1 = {}
    # choice[0]['db'] = data[0][db]
    # choice[0]['col'] = data[0][col]
    # choice[0]['field'] = data[0][field]
    # choice[1]['db'] = data[1][db]
    # choice[1]['col'] = data[1][col]
    # choice[1]['field'] = data[1][field]
    choice0['db'] = choice_user[0][0]
    choice0['col'] = choice_user[0][1]
    choice0['field'] = choice_user[0][2]
    choice1['db'] = choice_user[1][0]
    choice1['col'] = choice_user[1][1]
    choice1['field'] = choice_user[1][2]
    print("choice0", choice0)
    print("choice1", choice1)
    # choice0['db'] = 'EpidemicData'
    # choice0['col'] = '上海'
    # choice0['field'] = '新增确诊'
    # choice1['db'] = 'EpidemicData'
    # choice1['col'] = '河北'
    # choice1['field'] = '新增确诊'
    # print(choice0)
    # print(choice1)

    # fetch the data
    # client = MongoClient("10.72.100.5",8027,username='******',password='******')
    client = MongoClient("10.72.100.5", 8027)
    db = client.admin
    db.authenticate("double", "double")
    conn = MongoClient(host='mongodb://10.72.100.5:8027/' + 'admin',
                       username='******',
                       password='******')
    database = conn[choice0['db']]
    collection0 = database[choice0['col']]
    results0 = collection0.find({}, {
        choice0['field']: 1,
        "_id": 0
    }).sort("_id", pymongo.ASCENDING)  # 按照_id排序
    collection1 = database[choice1['col']]
    results1 = collection1.find({}, {
        choice1['field']: 1,
        "_id": 0
    }).sort("_id", pymongo.ASCENDING)  # 按照_id排序
    # 1表示显示此字段,0表示不显示此字段,默认会显示_id
    rawdata0 = []
    rawdata1 = []
    for result in results0:
        rawdata0.append(result[choice0['field']])
    for result in results1:
        rawdata1.append(result[choice1['field']])

    # clean the data
    for i in range(len(rawdata0) - 1, -1, -1):  # assumes rawdata0 and rawdata1 have the same length
        if rawdata0[i] and rawdata1[i]:
            try:  # convert numeric strings to floats
                rawdata0[i] = float(rawdata0[i])
                rawdata1[i] = float(rawdata1[i])
            except ValueError:
                flag = False  # a non-numeric field exists
        else:
            del rawdata0[i]
            del rawdata1[i]

    print("rawdata0", rawdata0)
    print("rawdata1", rawdata1)
    # compute the MIC
    m = MINE()
    if rawdata0:  # only if rawdata0 and rawdata1 are non-empty
        if flag:
            # map the data onto the [0, 1] interval
            min_max_scaler = MinMaxScaler()
            data1_std = min_max_scaler.fit_transform(
                np.array(rawdata0).reshape(-1, 1))
            data2_std = min_max_scaler.fit_transform(
                np.array(rawdata1).reshape(-1, 1))
            data1 = data1_std.reshape(1, -1)[0]
            data2 = data2_std.reshape(1, -1)[0]
            m.compute_score(data1, data2)
            return json.dumps(m.mic())
        else:
            return "Please select numeric fields"
    else:
        return "The two selected fields have no matching data"
Code example #58
0
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)  # 0.5 is a dummy p-value, as in code example #52
Code example #59
0
File: feature_eval.py  Project: cmangla/tptp-features
# %%
D = []
for theorem, strategy_results in strategy_evaluation_training_data.items():
    scores = [(1.0/strategy_results[str(i)][1]) if strategy_results[str(i)][0] else MINIMUM_SCORE for i in range(NUM_STRATEGIES)]
    k = tuple(theorem.split('/'))
    try:
        D.append((problem_features.loc[[k]].iloc[0].tolist(), scores))
    except Exception as e:
        print(e, type(e))

# %%
DFX = np.array([d[0] for d in D])
DFY = [np.array([d[1][j] for d in D]) for j in range(NUM_STRATEGIES)]

mine = MINE()
mics = []
for j in range(NUM_STRATEGIES):
    for i in range(DFX.shape[1]):
        mine.compute_score(DFX[:, i],DFY[j])
        mics.append((features[i], j, mine.mic()))
    #mics.append((features[i], mine.mic(), abs(pearsonr(DFX[:, i], DFY[0])[0])))


# %%
import csv
with open('data/scores.csv', mode='w') as f:
    cw = csv.writer(f)
    cw.writerow(['x', 'y', 'score'])
    for m in mics:
        cw.writerow([str(v) for v in m])
Code example #60
-1
def performMIC(transposed_list):
    # Compute MIC for every pair of variables; report pairs scoring above 0.6.
    mic_scores = []
    for counter1 in range(0, len(transposed_list) - 1):
        for counter2 in range(counter1 + 1, len(transposed_list)):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(transposed_list[counter1], transposed_list[counter2])
            if mine.mic() > 0.6:
                mic_score = {}
                mic_score['x'] = counter1
                mic_score['y'] = counter2
                mic_score['mic'] = mine.mic()
                mic_scores.append(mic_score)
    return mic_scores
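performMIC expects its input transposed so that each inner list holds one variable's values; a small usage sketch with illustrative data (the names here are assumptions):

import numpy as np

rng = np.random.RandomState(0)
base = rng.rand(200)
rows = [[v, 2 * v, rng.rand()] for v in base]   # three variables per observation
transposed = [list(col) for col in zip(*rows)]  # one list per variable
print(performMIC(transposed))                   # reports the (0, 1) pair with MIC near 1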