def chooseIndependantInputVariables(inArr):
	#print inArr
	selected_input_indexes = []
	for i in range(inArr.shape[1]):
		doSelect = True
		for j in range(i):

			#Subrata for now choosing all inputs! commentout "break" later when you need it.
			#break  # comment out this to select only independant inputs

			if(i == j):
				return
			x = inArr[:,i]
			y = inArr[:,j]
			#inputFeatureName1 = getInputParameterNameFromColumnIndex(i)
			inputFeatureName1 = getInputParameterNameFromFeatureIndex(i)
			#inputFeatureName2 = getInputParameterNameFromColumnIndex(j)
			inputFeatureName2 = getInputParameterNameFromFeatureIndex(j)
                	#print "x: ", x
                	x_scaled = preprocessing.scale(x)
                	y_scaled = preprocessing.scale(y)
                	#print "x: ", x_scaled
                	#print "targetArr: ", targetArr 
                	mine = MINE(alpha=0.6, c=15)
                	mine.compute_score(x_scaled, y_scaled)
			print "Correlation between ",inputFeatureName1,inputFeatureName2, " is ", mine.mic()  
			if(float(mine.mic()) >= 0.99):
				doSelect = False
				print "\n ***** ==> will NOT select ", inputFeatureName1, " as it correlates with ", inputFeatureName2, "\n" 
		#end for
		if(doSelect):
			selected_input_indexes.append(i)

	return selected_input_indexes
Exemple #2
0
def McOne(data, label, r):
    """Filter the rows of ``data`` by relevance to ``label`` and mutual redundancy.

    Each row of ``data.values`` is scored against ``label`` with MIC.  Rows
    scoring at least ``r`` are kept and ordered by decreasing score.  Then,
    walking that order, a later row is dropped when its MIC with an earlier
    kept row is at least its own MIC with the label.

    :param data: DataFrame whose rows are the candidate features.
    :param label: class labels passed to ``MINE.compute_score``.
    :param r: minimum MIC with the label for a row to be considered.
    :return: ``data.iloc[...]`` restricted to the surviving row positions.
    """
    print("McOne start...")
    rows = data.values
    row_count = data.shape[0]

    # Pass 1: relevance of every row to the label.
    relevance = [0] * row_count
    kept = []
    for idx in range(row_count):
        scorer = MINE()
        scorer.compute_score(rows[idx], label)
        relevance[idx] = scorer.mic()
        if relevance[idx] >= r:
            kept.append(idx)

    kept.sort(key=lambda idx: relevance[idx], reverse=True)

    # Pass 2: drop rows that are at least as related to an earlier kept row
    # as they are to the label.
    anchor = 0
    while anchor < len(kept):
        probe = anchor + 1
        while probe < len(kept):
            scorer = MINE()
            scorer.compute_score(rows[kept[anchor]], rows[kept[probe]])
            if scorer.mic() >= relevance[kept[probe]]:
                # Removing shifts the next candidate into position `probe`.
                del kept[probe]
            else:
                probe += 1
        anchor += 1
    return data.iloc[kept]
Exemple #3
0
def compute_MIC(x, y, alpha=0.6, c=15, all_metrics=False):
    """Compute the maximal information coefficient between ``x`` and ``y``.

    :param alpha: first positional argument forwarded to ``MINE``.
    :param c: second positional argument forwarded to ``MINE``.
    :param all_metrics: when true, also return the ``MINE`` object so the
        caller can query its other statistics.
    :return: the MIC, or ``(MIC, MINE instance)`` when ``all_metrics`` is true.
    """
    from minepy import MINE
    estimator = MINE(alpha, c)
    estimator.compute_score(x, y)
    score = estimator.mic()
    return (score, estimator) if all_metrics else score
Exemple #4
0
    def fit(self,X,y):
        # initialize phi and feature set
        # if number of features is not set, half of the features will be selected
        n = self.n
        beta = self.beta
        verbose = self.verbose
        if n ==None:
            n = int(X.shape[0]/2)

        features = np.arange(X.shape[1]).tolist()
        best_mi = -np.inf
        X_hat = 0
        for xi in features:
            m = MINE()
            m.compute_score(X[:,xi],y)
            #compute I(xi,y) and get max xi
            mi_xi_y = m.mic()
            if best_mi<mi_xi_y:
                X_hat = xi
        phi = [X_hat]
        features.remove(X_hat)
        # get paris for elements in phi and features
        while len(phi)<n:
            mi_scores = np.zeros(len(features))
            for xi_idx,xi in enumerate(features):
                m = MINE()
                m.compute_score(X[:,xi],y)
                #compute I(xi,y)
                mi_xi_y = m.mic()
                sum_mi_xi_xj = 0
                for xj in phi:
                    # compute I(xi,xj) and save for further evaluation
                    m = MINE()
                    m.compute_score(X[:,xi],X[:,xj])
                    mi_xi_xj = m.mic()
                    sum_mi_xi_xj+=mi_xi_xj
                mi_scores[xi_idx] = mi_xi_y - beta*sum_mi_xi_xj
                if verbose>=2:
                    print "mi_scores for xi:{xi}, xj:{xj} is {mi_scores}".format(xi=xi,xj=xj,mi_scores=mi_scores[xi_idx])

            X_hat = np.argmax(mi_scores)
            if verbose==1:
                print "X_hat is {X_hat}".format(X_hat=X_hat)
            X_hat = features[X_hat]
            phi.append(X_hat)
            features.remove(X_hat)
        self.phi = phi
        self.features = features
Exemple #5
0
def calculate_mic(df, y):
    """Return a Series mapping each column of ``df`` to its MIC with ``y``.

    A single ``MINE`` instance is reused for all columns.
    """
    estimator = MINE()

    def _score(name):
        estimator.compute_score(df.loc[:, name], y.values)
        return estimator.mic()

    return pd.Series({name: _score(name) for name in df.columns})
Exemple #6
0
def get_correlation(dataset, target, features=None):
    """Score columns of ``dataset`` against the ``target`` column with MIC.

    Columns whose dtype is float64, int64 or bool are scored (bool columns
    are cast to int first); every other column is reported with ``nan``.
    The caller's ``dataset`` is not modified.

    :param dataset: a DataFrame, or anything ``pd.DataFrame`` accepts.
    :param target: label of the target column; must not be ``None``.
    :param features: column labels to score; when empty or ``None`` every
        column of ``dataset`` is used.
    :return: ``{'numerical': {col: mic}, 'object': {col: nan}}`` with the
        numerical entries ordered by decreasing MIC.
    :raises ValueError: if ``target`` is ``None``.
    """
    if target is None:
        raise ValueError('corr() need target value')
    if not isinstance(dataset, pd.DataFrame):
        dataset = pd.DataFrame(dataset)
    if not features:
        features = set(dataset.columns)
    numerical = {}
    text = {}
    num_types = (np.dtype('float64'), np.dtype('int64'), np.dtype('bool'))
    bool_type = np.dtype('bool')
    target = dataset[target]
    mine = MINE()
    for col in features:
        column = dataset[col]
        if column.dtype in num_types:
            if column.dtype == bool_type:
                # Cast a copy so the caller's frame keeps its bool column.
                column = column.astype(int)
            mine.compute_score(column, target)
            numerical[col] = mine.mic()
        else:
            text[col] = np.nan
    return {
        'numerical':
        dict(sorted(numerical.items(), key=lambda d: d[1], reverse=True)),
        # Every value is nan, so there is nothing meaningful to sort by.
        'object':
        dict(text)
    }
def _evaluate_single(data, target_feature):
    """Return the MIC of ``target_feature`` against every column of ``data``.

    Uses one ``MINE(alpha=0.4, c=15)`` instance for all columns; the result
    list follows the column order of ``data``.
    """
    estimator = MINE(alpha=0.4, c=15)

    def _column_mic(col):
        estimator.compute_score(target_feature, data[:, col])
        return estimator.mic()

    return [_column_mic(col) for col in range(data.shape[1])]
Exemple #8
0
def get_mic(x, y):
    """Return ``(mic, r)``: the MIC (``est='mic_e'``) and the Pearson r of x and y."""
    pearson = np.corrcoef(x, y)[0, 1]
    estimator = MINE(alpha=0.4, c=15, est='mic_e')
    estimator.compute_score(x, y)
    return estimator.mic(), pearson
Exemple #9
0
def _evaluate_single(data, target_feature):
    """Return the MIC of ``target_feature`` against every column of ``data``.

    Uses one ``MINE(alpha=0.3, c=15)`` instance for all columns; the result
    list follows the column order of ``data``.
    """
    estimator = MINE(alpha=0.3, c=15)

    def _column_mic(col):
        estimator.compute_score(target_feature, data[:, col])
        return estimator.mic()

    return [_column_mic(col) for col in range(data.shape[1])]
Exemple #10
0
	def MIC_plot(self, x, y, numRows, numCols, plotNum, x_name, y_name, filename):
		# build the MIC and correlation plot using the covariant matrix using a vectorized implementation. To be used when
		# categorical features are part of the model (otherwise, Pearson, Kendall and Spearman can be used)
		print "Pearson product-moment correlation coefficients np.corrcoef(x=",x_name,", y=",y_name,"): ",np.corrcoef(x, y)
		r = np.around(np.corrcoef(x, y)[0, 1], 1)  # Pearson product-moment correlation coefficients.
		# TODO: compute cov matrix for each one-hot encoding variable of the categorical feature with
		# MINE's Mutual Information coefficient

		fig = plt.figure(figsize=(33,5), frameon=True)#, ms=50)
		mine = MINE(alpha=0.6, c=15, est="mic_approx")
		mine.compute_score(x, y)
		mic = np.around(mine.mic(), 1)
		ax = plt.subplot(numRows, numCols, plotNum)
		ax.set_xlim(xmin=min(x)+1, xmax=max(x)+1)
		ax.set_ylim(ymin=min(y)+1, ymax=max(y)+1)
		ax.set_title('Pearson r=%.1f\nMIC=%.1f Features %s and %s in %s' % (r, mic, x_name, y_name, filename),fontsize=10)
		ax.set_frame_on(False)
		ax.axes.get_xaxis().set_visible(True)
		ax.axes.get_yaxis().set_visible(True)
		ax.plot(x, y, '*')
		plt.xlabel('X')
		plt.ylabel('Y')
		# ax.set_xticks([])
		# ax.set_yticks([])
	#     plt.scatter(x,y,s=s)
	#     plt.show()
		return ax
Exemple #11
0
def calMIC(data):
    """Print the MIC between columns 1 and 2 of ``data`` for ``veh`` values 2 to 6.

    For each vehicle id the rows with ``data.veh == id`` are selected and the
    second and third positional columns are compared.
    """
    for veh_id in range(2, 7):
        estimator = MINE(alpha=0.6, c=15)
        veh_rows = data[data.veh == veh_id]
        estimator.compute_score(veh_rows.iloc[:, 1], veh_rows.iloc[:, 2])
        print("Without noise:", "MIC", estimator.mic())
def mic(dataset: pd.DataFrame, labels: np.array) -> dict:
    """Return ``{column: MIC(column, labels)}`` for every column of ``dataset``.

    A fresh ``MINE`` instance is created per column.
    """
    score = dict.fromkeys(dataset)
    for feature, column in dataset.items():
        estimator = MINE()
        estimator.compute_score(column.values.ravel(), labels)
        score[feature] = estimator.mic()
    return score
def calculateCorrelationBetweenVectors(x,y):
    """Return the MIC between the sorted, standardised values of ``x`` and ``y``.

    Both vectors are sorted independently, truncated to the length of the
    shorter one, standardised with ``preprocessing.scale`` and compared with
    ``MINE(alpha=0.6, c=15)``.

    NOTE(review): sorting each vector on its own discards the original
    pairing of samples, so this compares the two value distributions rather
    than element-wise pairs - confirm that this is the intent.

    :return: the MIC as a ``float``.
    """
    commonSize = min(len(x), len(y))

    # Keep the first `commonSize` sorted values of each vector.  The previous
    # slice ended at commonSize - 1 and dropped one sample too many (slice
    # ends are already exclusive).
    x_sorted = np.sort(x)[:commonSize]
    y_sorted = np.sort(y)[:commonSize]

    x_scaled = preprocessing.scale(x_sorted)
    y_scaled = preprocessing.scale(y_sorted)

    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(x_scaled, y_scaled)
    return float(mine.mic())
Exemple #14
0
def performMIC(transposed_list, cutoff, p):
    """Return MIC records for every pair of entries whose MIC exceeds ``cutoff``.

    Each unordered pair ``(a, b)`` with ``a < b`` of ``transposed_list`` is
    scored once.  A record holds the 1-based positions prefixed with ``p``
    (``'x'``, ``'y'``), ``p`` itself (``'p1'``, ``'p2'``) and the MIC
    formatted to three decimals (``'weight'``).
    """
    records = []
    total = len(transposed_list)
    for first in range(total - 1):
        for second in range(first + 1, total):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(transposed_list[first],
                                    transposed_list[second])
            score = estimator.mic()
            if score > float(cutoff):
                records.append({
                    'x': p + '_' + str(first + 1),
                    'y': p + '_' + str(second + 1),
                    'p1': p,
                    'p2': p,
                    'weight': format(score, '.3f'),
                })
    return records
def perform_mic_1p(p_sequences, p, cutoff=0.5, out_folder=''):
    """Score every pair of positions within one set of sequences and save the hits.

    The sequences are turned into a list of per-position columns, every
    unordered pair of positions is scored with MIC, and pairs above
    ``cutoff`` are recorded.  The records are passed to ``write_mics_to_csv``
    and also returned.
    """
    columns = transpose(array([list(seq) for seq in p_sequences])).tolist()
    records = []
    total = len(columns)
    for first in range(total - 1):
        for second in range(first + 1, total):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(columns[first], columns[second])
            score = estimator.mic()
            if score > float(cutoff):
                records.append({
                    'x': p+'_'+str(first+1),
                    'y': p+'_'+str(second+1),
                    'p1': p,
                    'p2': p,
                    'weight': format(score, '.3f'),
                })
    write_mics_to_csv(mics=records, p1=p, p2=p, cutoff=cutoff, out_folder=out_folder)
    return records
Exemple #16
0
def feature_scoring(X, Y):
    """Score every feature of ``X`` against ``Y`` with several methods and print a table.

    Methods: linear regression, ridge, lasso and randomized-lasso
    coefficients/scores, RFE ranking, random-forest importances, univariate
    F-test and MIC.  Each method's raw scores are converted with
    ``rank_to_dict`` (presumably a normalised ``{name: score}`` mapping -
    confirm against its definition), a per-feature mean over all methods is
    added, and the table is printed tab-separated.

    :param X: DataFrame of features; only ``X.values`` is used.
    :param Y: regression target.
    :return: ``None``; the result is printed.
    """
    X = X.values[:, :]
    # One generated name per column, so the function is not tied to
    # 36-feature inputs.
    names = ["x%s" % i for i in range(1, X.shape[1] + 1)]
    ranks = {}

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

    # Stop the search when 5 features are left (they will get equal scores).
    rfe = RFE(lr, n_features_to_select=5)
    rfe.fit(X, Y)
    # Materialise the ranking: on Python 3 a bare map() is a one-shot
    # iterator, not a sequence.
    ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)

    rf = RandomForestRegressor()
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f, names)

    print('startMIC')
    mine = MINE()
    mic_scores = []

    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        m = mine.mic()
        mic_scores.append(m)
        print(i)
    ranks["MIC"] = rank_to_dict(mic_scores, names)

    print('finish MIc')

    # Mean score per feature over all methods collected so far.
    r = {}
    for name in names:
        r[name] = round(
            np.mean([ranks[method][name] for method in ranks.keys()]), 2)
    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    print("\t%s" % "\t".join(methods))
    for name in names:
        print("%s\t%s" % (name, "\t".join(
            map(str, [ranks[method][name] for method in methods]))))
def _read_first_column(path, echo=False):
    """Return the first field of every non-empty CSV row of *path* as floats.

    Only the first field is converted, so other columns may hold arbitrary
    text.  With ``echo`` set, every raw row is printed as it is read.
    """
    values = []
    # The context manager closes the file even when a value fails to parse.
    with open(path, encoding='utf-8-sig') as csv_handle:
        for row in csv.reader(csv_handle):
            if echo:
                print(row)
            if row:
                values.append(float(row[0]))
    return values


def MIC(name):
    """Compute the MIC between the first columns of two uploaded CSV files.

    Looks up the files registered for ``name`` via ``db.searchFile``, reads
    the first column of the first two files found under the directory
    ``name``, computes their MIC and writes it to ``MIC_result.csv`` in the
    same directory.

    :param name: key passed to ``db.searchFile``; also the directory that
        holds the files (requires minepy: ``pip install minepy``).
    :return: JSON string ``{"route": ...}`` holding ``"nofile"`` when no file
        is registered, a message when fewer than two files exist, or the
        result file name on success.
    """
    fileN = db.searchFile(name)
    # An empty result is treated like the "dont_have_file" marker instead of
    # raising IndexError.
    if not fileN or fileN[0]['filename'] == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)

    if len(fileN) < 2:
        ret = {"route": '需要上传两个文件以进行MIC计算'}
        return json.dumps(ret)

    x = _read_first_column(name + '/' + fileN[0]['filename'], echo=True)
    print('x=', x)
    y = _read_first_column(name + '/' + fileN[1]['filename'])
    print('y=', y)

    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(x, y)
    mic_value = mine.mic()
    print("MIC", mic_value)

    # Write the MIC value to the result file.
    with open(name + '/' + 'MIC_result.csv', 'w', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        csv_writer.writerow(["MIC result"])
        csv_writer.writerow([str(mic_value)])
    ret = {"route": 'MIC_result.csv'}
    return json.dumps(ret)
Exemple #18
0
def mic(x, y):
    """Return ``(MIC(x, y), 0.5)``.

    :param x: first sample vector passed to ``MINE.compute_score``.
    :param y: second sample vector passed to ``MINE.compute_score``.
    :return: tuple of the MIC and the constant ``0.5``.
    """
    estimator = MINE()
    estimator.compute_score(x, y)
    score = estimator.mic()
    return (score, 0.5)
def perform_mic_2p(p1_sequences, p2_sequences, p1, p2, cutoff=0.5):
    """Score every position of ``p1`` against every position of ``p2`` and save the hits.

    Both sequence sets are turned into lists of per-position columns.  Each
    cross pair with a MIC above ``cutoff`` is recorded with its 1-based
    positions and the MIC formatted to three decimals.  The records are
    passed to ``write_mics_to_csv`` and also returned.
    """
    records = []
    p1_columns = transpose(array([list(seq) for seq in p1_sequences])).tolist()
    p2_columns = transpose(array([list(seq) for seq in p2_sequences])).tolist()

    for pos1, column1 in enumerate(p1_columns, 1):
        for pos2, column2 in enumerate(p2_columns, 1):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(column1, column2)
            score = estimator.mic()
            if score > float(cutoff):
                records.append({
                    'x': p1+'_'+str(pos1),
                    'y': p2+'_'+str(pos2),
                    'p1': p1,
                    'p2': p2,
                    'weight': format(score, '.3f'),
                })

    write_mics_to_csv(mics=records, p1=p1, p2=p2, cutoff=cutoff)
    return records
    def execute(self, symbol):
        """Find correlated attribute pairs of ``symbol`` using MIC.

        Every unordered pair of attribute value lists produced by
        ``self._generateAttributeValuesForSymbol`` is scored.  Pairs whose MIC
        (rounded to two decimals) exceeds ``self.minMic`` are reported, except
        when both sides refer to the same single field.

        :param symbol: the symbol in which we are looking for correlations
        :type symbol: :class:`netzob.Common.Models.Vocabulary.AbstractField.AbstractField`
        :return: list of dicts with keys ``id``, ``relation_type``,
            ``x_fields``, ``x_attribute``, ``y_fields``, ``y_attribute``,
            ``mic`` and ``pearson``.
        """

        (attributeValues_headers, attributeValues) = self._generateAttributeValuesForSymbol(symbol)
        symbolResults = []

        # MINE computation of each field's combination
        for i, values_x in enumerate(attributeValues[:-1]):
            # Start j at i + 1 so it is an absolute index and selects the
            # header that belongs to values_y (enumerate over the slice
            # alone would restart at 0 and pick the wrong header).
            for j, values_y in enumerate(attributeValues[i + 1 :], start=i + 1):
                mine = MINE(alpha=0.6, c=15)
                mine.compute_score(numpy.array(values_x), numpy.array(values_y))
                mic = round(mine.mic(), 2)
                if mic > float(self.minMic):
                    # We add the relation to the results
                    (x_fields, x_attribute) = attributeValues_headers[i]
                    (y_fields, y_attribute) = attributeValues_headers[j]
                    # The relation should not apply on the same field
                    if len(x_fields) == 1 and len(y_fields) == 1 and x_fields[0].id == y_fields[0].id:
                        continue
                    pearson = numpy.corrcoef(values_x, values_y)[0, 1]
                    if not numpy.isnan(pearson):
                        pearson = round(pearson, 2)
                    relation_type = self._findRelationType(x_attribute, y_attribute)
                    self._debug_mine_stats(mine)
                    self._logger.debug(
                        "Correlation found between '"
                        + str(x_fields)
                        + ":"
                        + x_attribute
                        + "' and '"
                        + str(y_fields)
                        + ":"
                        + y_attribute
                        + "'"
                    )
                    self._logger.debug("  MIC score: " + str(mic))
                    self._logger.debug("  Pearson score: " + str(pearson))
                    id_relation = str(uuid.uuid4())
                    symbolResults.append(
                        {
                            "id": id_relation,
                            "relation_type": relation_type,
                            "x_fields": x_fields,
                            "x_attribute": x_attribute,
                            "y_fields": y_fields,
                            "y_attribute": y_attribute,
                            "mic": mic,
                            "pearson": pearson,
                        }
                    )
        return symbolResults
Exemple #21
0
def perform_mic_2p(p1_sequences, p2_sequences, p1, p2, cutoff=0.5):
    """Score every position of ``p1`` against every position of ``p2``.

    Both sequence sets are turned into lists of per-position columns.  Each
    cross pair with a MIC above ``cutoff`` is recorded with its 1-based
    positions and the unformatted MIC as ``'weight'``.

    :return: list of record dicts.
    """
    records = []
    p1_columns = transpose(array([list(seq) for seq in p1_sequences])).tolist()
    p2_columns = transpose(array([list(seq) for seq in p2_sequences])).tolist()

    for pos1, column1 in enumerate(p1_columns, 1):
        for pos2, column2 in enumerate(p2_columns, 1):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(column1, column2)
            score = estimator.mic()
            if score > float(cutoff):
                records.append({
                    'x': p1+'_'+str(pos1),
                    'y': p2+'_'+str(pos2),
                    'p1': p1,
                    'p2': p2,
                    'weight': score,
                })

    return records
Exemple #22
0
def mine_features(data,features):
    print '...'
    for X_hat_idx in features:
        features.remove(X_hat_idx)
        subset =  features
        for xi_idx in subset:
            m = MINE()
            X_hat = data[X_hat_idx].values
            xi = data[xi_idx].values
            m.compute_score(X_hat,xi)
            I_X_hat_xi = m.mic()
            if I_X_hat_xi>0.10:
                print 'I({X_hat_idx},{xi_idx}): {I_X_hat_xi}'.format(X_hat_idx=X_hat_idx,xi_idx=xi_idx,I_X_hat_xi=I_X_hat_xi)         
def calcMICReg(df,target,col):
    """Return ``{col: MIC}`` between column ``col`` of ``df`` and ``target``.

    Categorical columns are first replaced, value by value, with the mean of
    the ``'_target_variable_'`` column within that category; other columns
    are used as they are.
    """
    estimator = MINE()
    values = df[col].values
    if df[col].dtype.name=="category":
        # Mean of the target column per category level.
        level_means = df.groupby(by=[col])['_target_variable_'].mean().to_dict()
        values = [level_means[level] for level in values]
    estimator.compute_score(values, target)

    return {col: estimator.mic()}
Exemple #24
0
def mysubplot(x, y, numRows, numCols, plotNum,
              xlim=(-4, 4), ylim=(-4, 4)):
    """Draw ``y`` against ``x`` in one bare subplot titled with Pearson r and MIC.

    Both statistics are rounded to one decimal.  The subplot has no frame,
    no visible axes and no ticks; points are drawn with the ',' marker.

    :return: the matplotlib axes that was drawn on.
    """
    pearson = np.around(np.corrcoef(x, y)[0, 1], 1)
    estimator = MINE(alpha=0.6, c=15)
    estimator.compute_score(x, y)
    mic_value = np.around(estimator.mic(), 1)

    axes = plt.subplot(numRows, numCols, plotNum,
                       xlim=xlim, ylim=ylim)
    axes.set_title('Pearson r=%.1f\nMIC=%.1f' % (pearson, mic_value),fontsize=10)
    axes.set_frame_on(False)
    x_axis = axes.axes.get_xaxis()
    x_axis.set_visible(False)
    y_axis = axes.axes.get_yaxis()
    y_axis.set_visible(False)
    axes.plot(x, y, ',')
    axes.set_xticks([])
    axes.set_yticks([])
    return axes
Exemple #25
0
  def select_feature(self, data, label, threshold=0.7):
    """
    Perform feature selection by maximum information coefficient that can capture both linear and non-linear relationships.
    """
    selected = []

    from minepy import MINE
    mine = MINE()

    for i, col in enumerate(data):
      print 'feature selection: %d/%d %s' % (i, data.shape[1], col)
      mine.compute_score(data[col], label)
      if mine.mic() > threshold:
        selected.append(col)

    print '%d out of %d features were selected' % (len(selected), data.shape[1])

    return selected
def get_corrcoef(X):
    """Compute MIC between feature 0 and features 1..19 on a 5% sample and save the matrix.

    The rows of ``X`` are reduced to the test part of the first
    ``ShuffleSplit`` split (``test_size=0.05``, ``random_state=0``), the
    array is transposed so rows become features, and a matrix of ones is
    filled symmetrically with the computed MIC values.  The matrix is
    written to ``feat_sim_pcc_2.csv`` under ``CODE_PATH``.
    """
    splitter = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.05, random_state=0)
    for _, held_out in splitter:
        # Only the first split is used.
        X = X[np.array(held_out)]
        break

    X = X.transpose()
    feature_count = X.shape[0]
    pcc = np.ones((feature_count, feature_count))
    scorer = MINE()
    started = time()
    i = 0  # only the first feature is compared against the others
    for j in range(1, 20):
        scorer.compute_score(X[i], X[j])
        value = scorer.mic()
        pcc[i, j] = value
        pcc[j, i] = value
        print(i, j, pcc[i, j], time()-started)
    np.savetxt(os.path.join(CODE_PATH, 'feat_sim_pcc_2.csv'), pcc, fmt='%.3f', delimiter=',')
    print('Done with computing PCC,', 'using', time()-started, 's')
Exemple #27
0
    def mutual_information(self, X, Y, title=None, nbins_X=50, nbins_Y=50,
            noise_sigma='all'):
        """Print MIC, mutual information and linear-fit statistics for ``X`` vs ``Y``.

        Samples where either value is NaN are excluded from all three
        estimates (binned mutual information via ``pyentropy``, MIC via
        ``MINE`` and ``scipy.stats.linregress``).

        :param title: optional heading printed before the statistics.
        :param nbins_X: number of quantisation bins for ``X``.
        :param nbins_Y: number of quantisation bins for ``Y``.
        :param noise_sigma: label printed in the statistics line.
        """
        no_nans_idx = np.logical_not(np.logical_or(np.isnan(X), np.isnan(Y)))
        X_valid = X[no_nans_idx]
        Y_valid = Y[no_nans_idx]

        Xq, _, _ = pyentropy.quantise(X_valid, nbins_X)
        Yq, _, _ = pyentropy.quantise(Y_valid, nbins_Y)
        s = pyentropy.DiscreteSystem(Yq, (1, nbins_Y), Xq, (1, nbins_X))
        s.calculate_entropies()

        # MINE: use the same NaN-free samples as the other estimators;
        # the boolean mask already yields flat arrays.
        mine = MINE()
        mine.compute_score(X_valid, Y_valid)

        # Linear regression
        slope, intercept, r, p, stderr = \
                scipy.stats.linregress(X_valid, Y_valid)

        if title is not None:
            print(title)
        print(" MIC/MI/r^2/p/slope for %s:\t%.3f\t%.3f\t%s\t%s\t%s" %
                (noise_sigma, mine.mic(), s.I(), r**2, p, slope))
# Pearson correlation of x with a heavily noised copy of itself
# (x and size are defined earlier in the script, outside this excerpt).
print "Higher noise", pearsonr(x, x + np.random.normal(0, 10, size))

# Obvious drawback: as a feature-ranking mechanism Pearson correlation is only sensitive to linear relationships. Even when two variables have a one-to-one relationship, it may be close to 0.
a = np.random.uniform(-1, 1, 100000)   # uniform(low, high, size) random numbers
print pearsonr(a, a**2)[0]


# 1.2 Mutual information and maximal information coefficient (MIC), range [0,1]
# Mutual information is inconvenient to use directly for feature selection; MIC first searches for an optimal discretisation
# and then converts the mutual information value into a measure in the range [0,1]. minepy provides MIC.

from minepy import MINE  #
m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x**2)
print m.mic()


# 1.3 Distance correlation, range [0,1]
# Distance correlation was designed to overcome the weakness of the Pearson coefficient. In the x and x^2 example, even though the Pearson coefficient is 0,
# we cannot conclude that the two variables are independent (they may be non-linearly related); but if the distance correlation is 0, we can say the two variables are independent.
import numpy as np

def dist(x, y):
    """Return the matrix of absolute pairwise differences ``|x[i] - y[j]|``.

    Only meant for 1-D arrays: ``x`` is turned into a column so that the
    subtraction broadcasts against ``y``.
    """
    as_column = x[:, None]
    return np.abs(as_column - y)
    

def d_n(x):
    """Return the double-centred pairwise distance matrix of the 1-D array ``x``.

    The column means and row means are subtracted from the distance matrix
    ``dist(x, x)`` and the grand mean is added back.
    """
    d = dist(x, x)
    dn = d - d.mean(0) - d.mean(1)[:,None] + d.mean()
    # The centred matrix was computed but never handed back to the caller.
    return dn
Exemple #29
0
        mic_approx_null, mic_e_null, tic_e_null, r2_null = [], [], [], []
        mic_approx_alt, mic_e_alt, tic_e_alt, r2_alt = [], [], [], []

        # null hypothesis
        for k in range(1, n_null+1):
            x = np.random.rand(n)
            r = np.random.randn(n)
            y = f()

            # resimulate x for the null scenario
            x = np.random.rand(n)

            mine_approx.compute_score(x, y)
            mine_e.compute_score(x, y)

            mic_approx_null.append(mine_approx.mic())
            mic_e_null.append(mine_e.mic())
            tic_e_null.append(mine_e.tic())
            r2_null.append(np.corrcoef(x, y)[0][1]**2)

        # alternative hypothesis
        for k in range(1, n_alt+1):
            x = np.random.rand(n)
            r = np.random.randn(n)
            y = f()

            mine_approx.compute_score(x, y)
            mine_e.compute_score(x, y)

            mic_approx_alt.append(mine_approx.mic())
            mic_e_alt.append(mine_e.mic())
Exemple #30
0
def train_and_analyse(_X, _y, sno, ino):
    """Rank the calendar features of one store/item pair and pick the useful ones.

    The time-related columns of ``_X`` are scored against ``_y`` with three
    methods: the univariate F-test (``F_regr``), MIC (``MIC``) and the
    cross-validated r2 of a single-feature random forest (``RF``).  Raw
    scores are converted with ``rank_to_dict``.  A feature is selected when
    its RF rank is above 0.1 or its MIC or F_regr rank is at least 0.1.

    The rank table is pickled to
    ``Analyse/store_<sno>/item_<ino>_(pair_analyse)`` and a stacked bar chart
    is saved next to it with a ``.png`` suffix.

    :param _X: DataFrame of features (copied, not modified).
    :param _y: regression target.
    :param sno: store number, used in the output directory name.
    :param ino: item number, used in the output file name.
    :return: ``(rank table as DataFrame, list of selected feature names)``.
    """
    X = _X.copy()
    Y = _y
    features = X.columns.values
    cv_l = cross_validation.KFold(X.shape[0], n_folds=5,
                                  shuffle=True, random_state=1)

    # Calendar feature groups, concatenated in this order.
    date_cols = ['date2j']
    ymd_cols = ['day', 'month', 'year']
    year_season_cols = [
        'is_2012', 'is_2013', 'is_2014',
        'fall', 'winter', 'spring', 'summer',
    ]
    weekday_cols = [
        'weekday', 'is_weekend', 'is_holiday',
        'is_holiday_weekday', 'is_holiday_weekend',
    ]
    holiday_cols = [
        'MemorialDay', 'MothersDay', 'BlackFridayM3', 'BlackFriday1',
        'NewYearsDay', 'IndependenceDay', 'VeteransDay', 'BlackFriday2',
        'NewYearsEve', 'BlackFriday3', 'ChristmasDay', 'BlackFridayM2',
        'ThanksgivingDay', 'Halloween', 'EasterSunday', 'ChristmasEve',
        'ValentinesDay', 'PresidentsDay', 'ColumbusDay',
        'MartinLutherKingDay', 'LaborDay', 'FathersDay', 'BlackFriday',
    ]
    temp = date_cols + ymd_cols + year_season_cols + weekday_cols + holiday_cols
    X_f1 = X[temp].values

    rank_columns = {}

    # Univariate F-test; NaN scores are replaced by nan_to_num before ranking.
    f_scores, pval = f_regression(ut.get_processed_X_A(X_f1), Y, center=True)
    rank_columns["F_regr"] = pd.Series(rank_to_dict(np.nan_to_num(f_scores), temp))

    # MIC of every processed column against the target.
    scorer = MINE()
    mic_scores = []
    for col in range(ut.get_processed_X_A(X_f1).shape[1]):
        scorer.compute_score(ut.get_processed_X_A(X_f1)[:, col], Y)
        mic_scores.append(scorer.mic())
    rank_columns["MIC"] = pd.Series(rank_to_dict(mic_scores, temp))

    # Cross-validated r2 of a random forest trained on one feature at a time.
    rf = ut.get_regression_model('RandomForest', 0)
    rf_scores = []
    for col in range(X_f1.shape[1]):
        fold_scores = cross_val_score(rf, X_f1[:, col:col+1].astype(float), Y, scoring="r2", cv=ShuffleSplit(len(X_f1), 3, .3), n_jobs=2)
        rf_scores.append(round(np.mean(fold_scores), 3))
    rank_columns['RF'] = pd.Series(rank_to_dict(np.abs(rf_scores), temp))

    ranks = pd.DataFrame(rank_columns)
    print(ranks)

    selection_feature = []
    selection_feature.extend(ranks[ranks.RF > 0.1].index.values.tolist())
    selection_feature.extend(ranks[ranks.MIC >= 0.1].index.values.tolist())
    selection_feature.extend(ranks[ranks.F_regr >= 0.1].index.values.tolist())
    # Drop duplicates picked by more than one method.
    selection_feature = list(set(selection_feature))
    print(selection_feature)

    out_dir = 'Analyse/store_{}/'.format(sno)
    mkdir_p(out_dir)
    pickle_path = out_dir + 'item_{}_(pair_analyse)'.format(ino)
    ranks.to_pickle(pickle_path)

    figure_path = pickle_path + '.png'
    p.clf()
    p.cla()
    plt.figure(figsize=(16, 26))
    ranks.plot.barh(stacked=True)
    p.savefig(figure_path, bbox_inches='tight', dpi=300)
    plt.close()

    return ranks, selection_feature
def mic(x, y):
    """Return ``(MIC(x, y), 0.5)``; the second element is a constant."""
    estimator = MINE()
    estimator.compute_score(x, y)
    score = estimator.mic()
    return (score, 0.5)
 def interactionV(self, data):
     """Print the MIC between ``data`` and ``x**2``.

     NOTE(review): ``x`` is not a parameter or local of this method; it is
     presumably a module-level array - confirm where it is defined.
     """
     from minepy import MINE
     estimator = MINE()
     squared = x**2
     estimator.compute_score(data, squared)
     print(estimator.mic())
Exemple #33
0
#     mine.compute_score(X_Standard_T[i], X_Standard_T[10])
#     mics.append(mine.mic())
#     print i, mine.mic()
# # for i in range(0,38):
# #     mine.compute_score(Xi_Standard_T[i], Xi_Standard_T[38])
# #     mics.append(mine.mic())
# #     print i, mine.mic()
# for i in range(0,7):
#     mine.compute_score(Xi_Standard_T[i], Xi_Standard_T[7])
#     mics.append(mine.mic())
#     print i, mine.mic()
#

# Score each of the first 48 rows of X_ALL_Standard_T against row 48 with
# MIC (mine, mics and X_ALL_Standard_T are defined outside this excerpt).
for i in range(48):
    mine.compute_score(X_ALL_Standard_T[i], X_ALL_Standard_T[48])
    mics.append(mine.mic())

# Column names of allDF, in order.
names = []
for c in allDF.columns.values: names.append(c)

# Column name -> MIC score for the first 48 columns.
# NOTE(review): this name shadows the builtin `map` for the rest of the module.
map = {}
for i in range(48):
    map[names[i]] = mics[i]

import operator
# (name, score) pairs ordered by ascending score.
sorted_tuple = sorted(map.items(), key=operator.itemgetter(1))

# Split the sorted pairs into parallel lists of names (ks) and scores (vs).
vs = []
ks = []
for k,v in sorted_tuple:
    ks.append(k); vs.append(v)
Exemple #34
0
class TestFunctions(unittest.TestCase):
    """Check the MINE statistics on four noiseless functional relationships."""

    def setUp(self):
        self.mine = MINE(alpha=0.6, c=15)

    def build_const(self, n):
        """Return ``n`` points of the constant function y = 0 on [0, 1]."""
        x = np.linspace(0, 1, n)
        y = np.zeros(n)
        return x, y

    def build_linear(self, n):
        """Return ``n`` points of y = x on [0, 1]."""
        x = np.linspace(0, 1, n)
        return x, x

    def build_sine(self, n):
        """Return ``n`` points of y = sin(8*pi*x) on [0, 1]."""
        x = np.linspace(0, 1, n)
        return x, np.sin(8*np.pi*x)

    def build_exp(self, n):
        """Return ``n`` points of y = 2**x on [0, 10]."""
        x = np.linspace(0, 10, n)
        return x, 2**x

    def test_const(self):
        x, y = self.build_const(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 0., 4)
        assert_almost_equal(self.mine.mas(), 0., 4)
        assert_almost_equal(self.mine.mev(), 0., 4)
        assert_almost_equal(self.mine.mcn(), 2., 4)
        assert_almost_equal(self.mine.mcn_general(), 2., 4)

    # NOTE: test_linear used to be defined twice with identical bodies; the
    # second definition silently replaced the first, so one copy was removed.
    def test_linear(self):
        x, y = self.build_linear(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 1., 4)
        assert_almost_equal(self.mine.mas(), 0., 4)
        assert_almost_equal(self.mine.mev(), 1., 4)
        assert_almost_equal(self.mine.mcn(), 2., 4)
        assert_almost_equal(self.mine.mcn_general(), 2., 4)

    def test_sine(self):
        x, y = self.build_sine(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 1., 4)
        assert_almost_equal(self.mine.mas(), 0.875, 3)
        assert_almost_equal(self.mine.mev(), 1., 4)
        assert_almost_equal(self.mine.mcn(), 4., 4)
        assert_almost_equal(self.mine.mcn_general(), 4., 4)

    def test_exp(self):
        x, y = self.build_exp(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 1., 4)
        assert_almost_equal(self.mine.mas(), 0., 4)
        assert_almost_equal(self.mine.mev(), 1., 4)
        assert_almost_equal(self.mine.mcn(), 2., 4)
        assert_almost_equal(self.mine.mcn_general(), 2., 4)
Exemple #35
0
Mdim=len(f1)
Mat=np.zeros((Mdim, Mdim))

# =============================================================================
#
#                    Compute MIC, PCC, KTau, NMIS Algorithm
#                      & Generate Correlation Matrix
#
# =============================================================================
print 'Computing mutual information indices and generating correlation matrix...'
for i in range(Mdim):
    for j in range(Mdim):
        if mla == 'MIC':
            mine.compute_score(f1[i],f2[j])
            Mat[i][j] = mine.mic()
        elif mla == 'PCC':
            Mat[i][j] = pearsonr(f1[i],f2[j])[0]
        elif mla == 'KTau':
            Mat[i][j] = kendalltau(f1[i], f2[j])[0]
        elif mla == 'NMIS':
            Mat[i][j] = normalized_mutual_info_score(f1[i],f2[j])
        sys.stdout.write(".")
        g=open(output_dir+'/'+'CorrMatrix_'+mla+'_'+str(Mdim)+'_'+str(GPS)+'_'+nfilename+'.txt','a')
        if j==Mdim-1:
            g.write(str(Mat[i][j]))
            g.write('\n')
        else:
            g.write(str(Mat[i][j]))
            g.write(' ')
        g.close()
Exemple #36
0
def doMICAnalysisOfInputVariables(inArr, targetArr, targetName, mic_score_threshold,
                                  input_indexes_uncorrelated_features, targetQualityMap=None):
    """Keep the input columns whose MIC against the target reaches a threshold.

    Only feature indexes listed in ``input_indexes_uncorrelated_features`` are
    examined. Each examined column is passed through ``preprocessing.scale``
    and scored against ``targetArr`` with ``MINE(alpha=0.6, c=15)``.

    Parameters:
        inArr: 2-D array indexed as ``inArr[:, featureIndex]``; ``inArr.shape[1]``
            gives the number of features.
        targetArr: target values scored against each input column.
        targetName: name of the target; passed to ``print_stats`` and used as
            the key set in the global ``goodTargetMap``.
        mic_score_threshold: minimum MIC (inclusive) for a column to be kept.
        input_indexes_uncorrelated_features: container of feature indexes that
            may be examined; all other indexes are skipped.
        targetQualityMap: optional list; when given, the MIC of every examined
            feature is appended to it.

    Returns:
        A 3-tuple ``(selected_inArr, selected_inArr_indexes,
        selected_originalColumn_indexes)``:
        the kept (unscaled) columns stacked and transposed into an array, the
        running positions ``0..n-1`` of the kept columns, and the values
        returned by ``getColumnIndexFromFeatureIndex`` for them.

    Side effects:
        Sets ``goodTargetMap[targetName] = True`` as soon as one column is
        kept, calls ``print_stats`` for every examined feature and prints a
        line per kept column. If ``inArr`` has no usable ``shape[1]`` the
        input is printed and the process exits.
    """
    goodTargetMap = getGlobalObject("goodTargetMap")

    selected_inArr = []
    selected_inArr_indexes = []
    selected_originalColumn_indexes = []

    try:
        numOfFeatures = inArr.shape[1]
    except (AttributeError, IndexError, TypeError):
        # No ``shape`` attribute, or fewer than two dimensions.
        print("ERROR: \n%s" % (inArr,))
        # NOTE(review): exit status 0 on an error path is kept from the
        # original; a non-zero status would be more conventional.
        exit(0)

    k = 0  # position the next kept column will have in the returned array
    for featureIndex in range(numOfFeatures):
        # Only features previously judged uncorrelated are candidates.
        if featureIndex not in input_indexes_uncorrelated_features:
            continue

        x = inArr[:, featureIndex]
        x_scaled = preprocessing.scale(x)
        mine = MINE(alpha=0.6, c=15)
        mine.compute_score(x_scaled, targetArr)

        inputFeatureName = getInputParameterNameFromFeatureIndex(featureIndex)
        print_stats(mine, inputFeatureName, targetName, mic_score_threshold)

        mic_score = float(mine.mic())
        if targetQualityMap is not None:
            targetQualityMap.append(mic_score)

        if mic_score >= mic_score_threshold:
            selected_inArr.append(x)  # keep the unscaled input column
            selected_inArr_indexes.append(k)
            colIdx = getColumnIndexFromFeatureIndex(featureIndex)
            selected_originalColumn_indexes.append(colIdx)
            # Mark the target as usable: at least one input explains it.
            goodTargetMap[targetName] = True
            # Single formatted argument: same output on Python 2 and Python 3.
            print("----------------- selected:  %s %s %s" % (inputFeatureName, colIdx, k))
            k = k + 1

    # The columns were collected as rows; transpose so each kept feature is a
    # column again (assuming every ``inArr[:, idx]`` slice is 1-D).
    selected_inArr = np.array(selected_inArr).transpose()
    return selected_inArr, selected_inArr_indexes, selected_originalColumn_indexes
# Random forest: rank features by the fitted model's feature importances.
rf = RandomForestRegressor()
rf.fit(X, Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

# Univariate linear regression test: rank features by their F statistic.
f, pval = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)

# MINE: rank features by the MIC between each column of X and Y.
mine = MINE()
mic_scores = []
for i in range(X.shape[1]):
    mine.compute_score(X[:, i], Y)
    mic_scores.append(mine.mic())

ranks["MIC"] = rank_to_dict(mic_scores, names)

# Per-feature mean over every ranking collected so far, rounded to 2 decimals.
# It is computed before "Mean" is added so it does not average over itself.
r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name] for method in ranks]), 2)

methods = sorted(ranks)
ranks["Mean"] = r
methods.append("Mean")

# Header row of method names; the single parenthesised argument prints the
# same text on Python 2 and Python 3.
print("\t%s" % "\t".join(methods))
def mic(x, y):
    """Return ``(MIC of x and y, 0.5)`` computed with a default ``MINE()``.

    Both inputs are printed before scoring. The second tuple element is the
    constant 0.5 -- presumably a placeholder p-value so the function fits a
    ``(score, pvalue)`` scoring interface; TODO confirm against the caller.
    """
    m = MINE()
    # print(obj) with a single argument behaves the same on Python 2 and 3.
    print(x)
    print(y)
    m.compute_score(x, y)
    return (m.mic(), 0.5)
def train_and_analyse(_X, _y, features, mean_threshold=0.12):
    """Rank features with several methods and select those with a high mean rank.

    Each method's scores are converted by ``rank_to_dict`` (defined elsewhere)
    into a per-feature dict; the per-feature mean over all methods is added as
    a "Mean" entry.

    Parameters:
        _X: 2-D array of inputs; ``_X.shape[0]`` is the number of samples used
            for the K-fold split and columns are read as ``_X[:, i]``.
        _y: target values.
        features: feature names, passed to ``rank_to_dict`` and used as the
            keys of every ranking.
        mean_threshold: a feature is selected when its mean rank is strictly
            greater than this value (default 0.12).

    Returns:
        ``(ranks, selection_feature)``: a ``pandas.DataFrame`` built from the
        per-method rankings including "Mean", and the index values of the rows
        whose "Mean" exceeds ``mean_threshold``.
    """
    X = _X
    Y = _y
    cv_l = cross_validation.KFold(X.shape[0], n_folds=10,
                                  shuffle=True, random_state=1)
    ranks = {}

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features)

    ridge = RidgeCV(cv=cv_l)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features)

    # Cross-validated lasso; the alpha it settles on is reused by the
    # RandomizedLasso stability selection below.
    lasso = LassoCV(cv=cv_l, n_jobs=2, normalize=True, tol=0.0001, max_iter=170000)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), features)

    rlasso = RandomizedLasso(alpha=lasso.alpha_, random_state=42)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), features)

    # RFE yields a ranking rather than scores, hence order=-1.
    rfe = RFE(lr, n_features_to_select=1)
    rfe.fit(X, Y)
    ranks["RFE"] = rank_to_dict(np.array(rfe.ranking_).astype(float), features, order=-1)

    rf = RandomForestRegressor(n_estimators=500)
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, features)

    # Only the F statistics are used; NaNs are replaced before ranking.
    f = f_regression(X, Y, center=True)[0]
    ranks["Corr."] = rank_to_dict(np.nan_to_num(f), features)

    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        mic_scores.append(mine.mic())

    ranks["MIC"] = rank_to_dict(mic_scores, features)

    # Mean over every method, computed before "Mean" itself is added so the
    # average does not include itself.
    r = {}
    for name in features:
        r[name] = round(np.mean([ranks[method][name] for method in ranks]), 2)
    ranks["Mean"] = r

    ranks = pd.DataFrame(ranks)

    selection_feature = ranks[ranks.Mean > mean_threshold].index.values

    return ranks, selection_feature
# Append the ``score`` column of eight more result files to ``res``, in this order.
for csv_path in (
    "./avg_xgbs_discret_feature_5.csv",
    "./R_7199.csv",
    "./rank_feature_xgb_ensemble.csv",
    "./avg_xgbs_discret_feature_10.csv",
    "./based_on_select_rank_feature.csv",
    "./xgb717.csv",
    "./725.csv",
    "./svm6938.csv",
):
    res.append(pd.read_csv(csv_path).score.values)

# 8x8 matrix of pairwise MIC values: cm[i][j] compares res[i] with res[j].
cm = []
for i in range(8):
    row = []
    for j in range(8):
        m = MINE()  # a fresh estimator for every pair
        m.compute_score(res[i], res[j])
        row.append(m.mic())
    cm.append(row)


import numpy as np
import matplotlib.pyplot as plt


def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    """Draw ``cm`` as a heat map on the current pyplot figure.

    ``title`` is used as the plot title and ``cmap`` as the colour map. Eight
    tick positions are labelled on both axes with the entries of the
    module-level ``fs``; the x labels are rotated by 45 degrees.
    """
    positions = np.arange(8)
    plt.imshow(cm, cmap=cmap, interpolation="nearest")
    plt.title(title)
    plt.colorbar()
    plt.xticks(positions, fs, rotation=45)
    plt.yticks(positions, fs)
    plt.tight_layout()
Exemple #41
0
    # For every entry f of ff, collect MIC, GMIC (p=-1) and squared Pearson
    # correlation over n_null "null" draws and n_alt "alternative" draws.
    # NOTE(review): this fragment sits inside an outer loop that is not visible
    # here (it supplies ``i``); ``mine``, ``n``, ``n_null`` and ``n_alt`` are
    # also defined outside this view.
    for j, f in enumerate(ff):
        mic_null, gmic_null, r2_null = [], [], []
        mic_alt, gmic_alt, r2_alt = [], [], []

        # null hypothesis
        for k in range(1, n_null+1):
            print i, j, k
            x = np.random.rand(n)
            r = np.random.randn(n)
            # f takes no arguments; presumably it reads the x and r drawn just
            # above from the enclosing scope -- TODO confirm.
            y = f()
            
            # resimulate x for the null scenario
            # (y keeps the value computed before x was redrawn)
            x = np.random.rand(n)

            mine.compute_score(x, y)
            mic_null.append(mine.mic())
            gmic_null.append(mine.gmic(p=-1))
            # squared off-diagonal entry of the 2x2 correlation matrix
            r2_null.append(np.corrcoef(x, y)[0][1]**2)

        # alternative hypothesis
        for k in range(1, n_alt+1):
            x = np.random.rand(n)
            r = np.random.randn(n)
            y = f()

            mine.compute_score(x, y)
            mic_alt.append(mine.mic())
            gmic_alt.append(mine.gmic(p=-1))
            r2_alt.append(np.corrcoef(x, y)[0][1]**2)

        # 95th percentile of the null MIC values; how it is used is not
        # visible in this fragment.
        cut_mic = np.percentile(mic_null, 95)
 def get_mic(self):
     """Return the MIC between ``self.x`` and ``self.y`` using a default ``MINE()``."""
     estimator = MINE()
     estimator.compute_score(self.x, self.y)
     return estimator.mic()
Exemple #43
-1
def performMIC(transposed_list):
    """Return the pairs of entries in ``transposed_list`` whose MIC exceeds 0.6.

    Every unordered pair of positions ``(x, y)`` with ``x < y`` is scored with
    a fresh ``MINE(alpha=0.6, c=15)``. The result is a list of dicts with the
    keys ``'x'`` and ``'y'`` (the two positions) and ``'mic'`` (the score), in
    the order the pairs are visited.
    """
    total = len(transposed_list)
    strong_pairs = []
    for first in range(total - 1):
        for second in range(first + 1, total):
            estimator = MINE(alpha=0.6, c=15)
            estimator.compute_score(transposed_list[first], transposed_list[second])
            score = estimator.mic()
            if not (score > 0.6):
                continue
            strong_pairs.append({'x': first, 'y': second, 'mic': score})
    return strong_pairs