Example #1
def select_using_MI(features, labels, threshold=.1, ham_label=0):
    '''
    Returns indices of the most salient features for the ham class, using a
    mutual information score between feature values and class label, and,
    among the highest scoring, filtering out the ones that are relatively
    more present in spam. This makes sense since these indices are then
    used to choose which features to turn on in emails.
    '''
    X, Y = features, np.ravel(labels)
    N, D = X.shape
    d = int(D * threshold) ## percentile of features to keep

    ## calculate frequency of feature presence relative to each class
    ham_freq  = np.mean(X[np.ravel(Y == ham_label)], axis=0)
    spam_freq = np.mean(X[np.ravel(Y != ham_label)], axis=0)

    ## calculate mutual information between features and labels
    MI_per_feature = (mutual_info_score(X[:, f], Y) for f in range(D))
    MI_per_feature = np.fromiter(MI_per_feature, dtype=np.float16)

    ## keep only salient features for ham: zero out the scores of features that
    ## are relatively more present in spam, so argpartition below returns
    ## indices into the original feature array rather than a filtered copy
    MI_per_feature[ham_freq < spam_freq] = 0
    salient_indices = np.argpartition(MI_per_feature, -d)[-d:]
    ## ^ https://stackoverflow.com/questions/10337533/a-fast-way-to-find-the-largest-n-elements-in-an-numpy-array/20177786#20177786

    return salient_indices
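
A minimal usage sketch for the selector above (not from the original source; the random binary features, the 0/1 labels and the imports are illustrative assumptions):

import numpy as np
from sklearn.metrics import mutual_info_score

rng = np.random.default_rng(0)
X = rng.integers(0, 2, size=(200, 50))       # 200 emails, 50 binary features
y = rng.integers(0, 2, size=200)             # 0 = ham, 1 = spam
print(select_using_MI(X, y, threshold=.1))   # indices of the top 5 ham-salient features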
Example #2
def mutualInformation_1to1(x, y, bins):
    """Computation of the mutual information between two time-series.

    Parameters
    ----------
    x: array_like, shape (N,)
        first time series to compare.
    y: array_like, shape (N,)
        second time series to compare.
    bins: int or tuple of ints or tuple of arrays
        the binning information.

    Returns
    -------
    mi: float
        the measure of mutual information between the two time series.

    """

    ## 1. Discretization
    if bins is None:
        # treat the series as already-discrete labels and build the contingency
        # table directly (assumes sklearn.metrics.cluster.contingency_matrix is imported)
        c_xy = contingency_matrix(x, y)
    else:
        c_xy = np.histogram2d(x, y, bins)[0]

    ## 2. Compute mutual information from contingency matrix
    mi = mutual_info_score(None, None, contingency=c_xy)
    return mi
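
A quick way to exercise the helper above (a sketch; the Gaussian test series are made up, and numpy/scikit-learn are assumed to be installed):

import numpy as np
from sklearn.metrics import mutual_info_score
from sklearn.metrics.cluster import contingency_matrix

x = np.random.randn(1000)
y = x + 0.5 * np.random.randn(1000)
print(mutualInformation_1to1(x, y, bins=16))   # binned MI estimate, in nats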
Example #3
def ami(x, y=None, n_bins=10):
    """Calculate the average mutual information between $x(t)$ and $y(t)$.

    Parameters
    ----------
    x : array-like
    y : array-like, optional
        $x(t)$ and $y(t)$.
        If only `x` is passed, it must have two columns;
        the first column defines $x(t)$ and the second $y(t)$.
    n_bins : int
        The number of bins to use when computing the joint histogram.

    Returns
    -------
    scalar
        Average mutual information between $x(t)$ and $y(t)$, in nats (natural log equivalent of bits).

    See Also
    --------
    lagged_ami

    References
    ----------
    Abarbanel, H. D. I. (1996). *Analysis of Observed Chaotic Data* (p. 28). New York: Springer.

    """
    x, y = _vector_pair(x, y)
    if x.shape[0] != y.shape[0]:
        raise ValueError('timeseries must have the same length')

    return metrics.mutual_info_score(None, None, contingency=np.histogram2d(x, y, bins=n_bins)[0])
Example #4
def calc_MI(x, y, bins):
    "Calculates the mutual information between two variables."
    # [0] gets the histogram, [1] & [2] are binning info.
    c_xy = np.histogram2d(x, y, bins)[0]
    # use scikit-learn to compute MI from the contingency (joint histogram) matrix
    mi = mutual_info_score(None, None, contingency=c_xy)
    return mi
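
A short sanity check (values invented for illustration); the score is in nats and grows with the strength of the dependence:

import numpy as np
from sklearn.metrics import mutual_info_score

x = np.random.randn(1000)
print(calc_MI(x, x + 0.1 * np.random.randn(1000), bins=10))  # strongly dependent pair
print(calc_MI(x, np.random.randn(1000), bins=10))            # independent pair, near zero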
Example #5
def __treemimic__(fittest, n_objects):
	fittest_count, last_drawn, medoid, counters, remaining, funcs = __mimic_setup__(fittest, n_objects)
	# materialize as a list so we can append to it below
	included = [x for x in range(n_objects) if x not in remaining]

	dependency_tree = {x: [] for x in range(n_objects)}

	compiled_mutual_information = np.ones((n_objects, n_objects), dtype=float)
	for i in range(n_objects):
		for j in range(0, i + 1):
			compiled_mutual_information[i, j] = mutual_info_score(medoid[i], medoid[j])  # entropy(medoid[i], counters[i], medoid[j], counters[j])
			compiled_mutual_information[j, i] = compiled_mutual_information[i, j]

	# ----------------------------------------------------------------------------------
	# ----------------------------------------------------------------------------------
	# ----------------------------------------------------------------------------------
	while len(remaining) > 0:
		smallest_entropy = np.inf
		for r in remaining:
			for i in included:
				some_entropy = compiled_mutual_information[r, i]
				if some_entropy < smallest_entropy:
					smallest_entropy = some_entropy
					drawn = r

		dependency_tree[last_drawn].append(drawn)

		included.append(drawn)

		last_drawn, remaining, funcs = __mimic_link__(fittest_count, last_drawn, drawn, counters, remaining, funcs)

	return dict(funcs), included, dependency_tree
Example #6
def mi(x, y, bins=10):
    """Mutual information between x and y"""
    H_x = u.compute_entropy(np.histogram(x, bins)[0])
    H_y = u.compute_entropy(np.histogram(y, bins)[0])
    c_xy = np.histogram2d(x, y, bins)[0]
    mi = skm.mutual_info_score(None, None, contingency=c_xy)
    return mi / np.sqrt(H_x * H_y)
Example #7
def mutual_information(x, y, bins=1000):
    """ Return the mutual information of the data in x, y
    """
    hist, xedges, yedges = np.histogram2d(x, y, bins=bins)
    #hist /= len(x)

    return mutual_info_score(None, None, contingency=hist)
Example #8
def MI_lag(bins, x, y, max_lag):

    # first ensure the vectors are the same length
    # always ensure that y is the wind vector and x is the sand
    len_diff = len(y) - len(x)
    print(len_diff)

    if len_diff > 0:
        y = y[:-len_diff]
    elif len_diff < 0:
        x = x[:len_diff]


    mi_vec = []
    for lag in range(max_lag):

        x_info = x[lag:]
        y_info = y[0: len(y) - lag]
        
        c_xy = np.histogram2d(x_info, y_info, bins)[0]
        mi_val = mutual_info_score(None, None, contingency=c_xy)

        mi_vec.append(mi_val)
        print(lag, len(mi_vec))

    return np.array(mi_vec)
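
A sketch of how this might be called (the synthetic wind/sand series and the 5-sample shift are made up for illustration):

import numpy as np
from sklearn.metrics import mutual_info_score

wind = np.random.randn(2000)
sand = np.roll(wind, 5) + 0.1 * np.random.randn(2000)
profile = MI_lag(16, sand, wind, 20)
print(profile.argmax())   # the strongest dependence should show up at lag 5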
Example #9
def __mimic__(fittest, n_objects):
	fittest_count, last_drawn, medoid, counters, remaining, funcs = __mimic_setup__(fittest, n_objects)

	dependency_tree = {x: [] for x in range(n_objects)}

	# ----------------------------------------------------------------------------------
	# ----------------------------------------------------------------------------------
	# ----------------------------------------------------------------------------------
	sampling_order = [last_drawn]
	while len(remaining) > 0:
		entropies = []
		for x in remaining:
			entropies += [mutual_info_score(medoid[x], medoid[last_drawn])]

		# entropies = map(
		# 	lambda x: entropy(medoid[x], counters[x], medoid[last_drawn], counters[last_drawn]),
		# 	remaining
		# )

		drawn = remaining[np.argmin(entropies)]
		sampling_order.append(drawn)

		dependency_tree[last_drawn].append(drawn)

		last_drawn, remaining, funcs = __mimic_link__(fittest_count, last_drawn, drawn, counters, remaining, funcs)

	return dict(funcs), sampling_order, dependency_tree
Example #10
def NMI_by_max(labels, cluster_labels, label_H=None):
    if not label_H:
        label_H = label_entropy(labels)
    
    cluster_H = label_entropy(cluster_labels)
    
    MI = metrics.mutual_info_score(labels,cluster_labels)
    
    return MI/max(label_H,cluster_H)
Example #11
def NMI_by_mean(labels, cluster_labels, label_H=None):
    if not label_H:
        label_H = label_entropy(labels)
    
    cluster_H = label_entropy(cluster_labels)
    
    MI = metrics.mutual_info_score(labels,cluster_labels)
    
    return 2.0*MI/(label_H + cluster_H)
Example #12
def bench_cluster(X, y, pca_n_comp):
    n = len(np.unique(y))
    pca = PCA(pca_n_comp)
    X_ = pca.fit_transform(X)
    sc = SpectralClustering(n)
    km = KMeans(n)
    sc_pred = sc.fit_predict(X_)
    km_pred = km.fit_predict(X_)
    distances = PairwiseDistances(X_.tolist())
    distances = ExplicitDistances(distances)
    singlel_pred = fcluster(linkage(ssd.squareform(distances.distances)), n, criterion='maxclust')
    print "single-linkage clustering prediction:", singlel_pred
    print "single-linkage clustering score:", adjusted_rand_score(y, singlel_pred), mutual_info_score(y, singlel_pred)
    print "spectral clustering prediction:", sc_pred
    print "spectral clustering score:", adjusted_rand_score(y, sc_pred), mutual_info_score(y, sc_pred)
    print "kmeans clustering prediction", km_pred
    print "kmeans clustering score:", adjusted_rand_score(y, km_pred), mutual_info_score(y, km_pred)
    print "ground truth labels", y
Example #13
def get_feature_mi(data, threshold):
    correlation_dict = defaultdict(list)
    for feature in data.columns:
        for other in data.columns:
            if other == feature:
                continue
            mi = metrics.mutual_info_score(data[feature].values, data[other].values)
            if mi >= threshold:
                correlation_dict[feature].append(other)
    return correlation_dict
Example #14
def chow_liu_tree(y_):
    n_labels = y_.shape[1]
    mi = np.zeros((n_labels, n_labels))
    for i in range(n_labels):
        for j in range(n_labels):
            mi[i, j] = mutual_info_score(y_[:, i], y_[:, j])
    mst = minimum_spanning_tree(csr_matrix(-mi))
    edges = np.vstack(mst.nonzero()).T
    edges.sort(axis=1)
    return edges
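
A toy invocation (the random label matrix is made up; numpy, scipy's csr_matrix/minimum_spanning_tree and sklearn's mutual_info_score are assumed to be imported as in the snippet):

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from sklearn.metrics import mutual_info_score

Y = np.random.randint(0, 2, size=(300, 5))   # 300 samples, 5 binary labels
print(chow_liu_tree(Y))                      # (n_labels - 1) edges of the dependency tree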
Example #15
	def mutual_information(self,max_lag):
		"""
		Calculates the mutual information between an unshifted time series
		and a shifted time series. Utilizes scikit-learn's implementation of
		mutual information found in sklearn.metrics.

		Parameters
		----------

		X : 1-D array
			time series that is to be shifted over

		max_lag : integer
			maximum amount to shift the time series

		Returns
		-------
		m_score : 1-D array
			mutual information at between the unshifted time series and the
			shifted time series
		"""

		#number of bins - say ~ 20 pts / bin for joint distribution
		#and that at least 4 bins are required
		N = max(self.X.shape)
		num_bins = max(4.,np.floor(np.sqrt(N/20)))
		num_bins = int(num_bins)

		m_score = np.zeros((max_lag))

		for jj in range(max_lag):
			lag = jj+1

			ts = self.X[0:-lag]
			ts_shift = self.X[lag:]

			min_ts = np.min(self.X)
			max_ts = np.max(self.X)+.0001 #needed to bin them up

			bins = np.linspace(min_ts,max_ts,num_bins+1)

			bin_tracker = np.zeros_like(ts)
			bin_tracker_shift = np.zeros_like(ts_shift)

			for ii in range(num_bins):

				locs = np.logical_and( ts>=bins[ii], ts<bins[ii+1] )
				bin_tracker[locs] = ii

				locs_shift = np.logical_and( ts_shift>=bins[ii], ts_shift<bins[ii+1] )
				bin_tracker_shift[locs_shift]=ii


			m_score[jj] = skmetrics.mutual_info_score(bin_tracker,bin_tracker_shift)
		return m_score
Example #16
def main():
    ## Synthetic Example
    clustering1 = {
        '1': {'1', '2', '3', '4'},
        '2': {'1', '2', '3', '4'},
        '3': {'1', '2', '3', '4'},
        '4': {'1', '2', '3', '4'},
        '5': {'5', '6', '7'},
        '6': {'5', '6', '7'},
        '7': {'5', '6', '7'},
    }
    clustering2 = {
        '1': {'1', '2', '3'},
        '2': {'1', '2', '3'},
        '3': {'1', '2', '3'},
        '4': {'4', '5', '6', '7'},
        '5': {'4', '5', '6', '7'},
        '6': {'4', '5', '6', '7'},
        '7': {'4', '5', '6', '7'},
    }
    labels1_dict = links_to_labels(clustering1)
    labels2_dict = links_to_labels(clustering2)
    clustering_combined = consensus_clustering([clustering1, clustering2])
    labels_combined_dict = clusters_to_labels(clustering_combined)
    labels1 = []
    labels2 = []
    labelsCombined = []
    for doc_id in range(1, len(labels1_dict) + 1):
        labels1.append(labels1_dict[str(doc_id)])
        labels2.append(labels2_dict[str(doc_id)])
        labelsCombined.append(labels_combined_dict[str(doc_id)])
    print('Mutual information between labels1 and labels2 is ' + str(mutual_info_score(labels1, labels2)))


    ## Real data
    print('Loading HT data')
    phone_clusters = pickle.load(open('Consensus/WadePresentation/phone_cluster.pkl', 'rb'))
    text_clusters = pickle.load(open('Consensus/WadePresentation/text_cluster.pkl', 'rb'))
    print('Converting to labels')
    phone_labels_dict = links_to_labels(phone_clusters)
    text_labels_dict = links_to_labels(text_clusters)
    phone_labels = []
    text_labels = []
    for doc_id in range(0, len(phone_labels_dict)):
        phone_labels.append(phone_labels_dict[str(doc_id)])
        text_labels.append(text_labels_dict[str(doc_id)])

    print('Running consensus clustering...')
    clustering_combined = consensus_clustering([phone_clusters, text_clusters])
    print('Number of consensus clusters: ' + str(len(clustering_combined)))
    nrecords = 0
    for cluster in clustering_combined:
        nrecords += len(cluster)
    print('Checking number of records: ' + str(nrecords))
Example #17
def clusterize(models):
    labels_true = [2, 0, 3, 2, 2, 0, 1, 2, 3]
    clusters = KMeans(n_clusters=4).fit_predict(models)
    rand = metrics.adjusted_rand_score(labels_true, clusters)
    mi = metrics.mutual_info_score(labels_true, clusters)
    homo = metrics.homogeneity_score(labels_true, clusters)
    completeness = metrics.completeness_score(labels_true, clusters)

    scores_list = [rand, mi, homo, completeness]
    scores_names = ['Adjusted Rand index', 'Mutual information', 'Homogeneity', 'Completeness']
    scores =  pd.DataFrame(data=scores_list, index=scores_names, dtype=object)
    return clusters, scores
Example #18
    def _generate_mutual_information_graph(self):
        samples = np.asarray(self.samples)
        complete_graph = nx.complete_graph(samples.shape[1])

        for edge in complete_graph.edges():
            mutual_info = mutual_info_score(
                samples[:, edge[0]],
                samples[:, edge[1]]
            )

            # networkx >= 2.0: use the `edges` accessor (1.x used `edge`)
            complete_graph.edges[edge[0], edge[1]]['weight'] = -mutual_info

        return complete_graph
Example #19
def chow_liu(X):
    """
    Chow-Liu structure learning algorithm.
    :param X: dataset
    :return: the learned graph (tree)
    """
    n_objects = X.shape[0]
    n_vars = X.shape[1]
    g = nx.complete_graph(n_vars)
    for i, j in g.edges():
        # networkx >= 2.0: use the `edges` accessor (1.x used `edge`)
        g.edges[i, j]['mutual information'] = mutual_info_score(X[:, i], X[:, j])
    g = maximum_spanning_tree(g, weight='mutual information')
    return g
Example #20
def get_score(node_set, data_dict, label):
	'''
	Calculate Mutual information score
	'''
	mat = []
	for each in node_set:
		mat.append(data_dict[each])
	mat = np.array(mat)
	mean_col = mat.mean(axis=0)
	count = int(np.log2(len(mean_col))+1)
	bins = np.histogram(mean_col,count)[1]
	index = np.digitize(mean_col,bins)
	values = [bins[i-1] for i in index]
	score = metrics.mutual_info_score(label,values)
	return score
Example #22
def calculateMI_Plane(x, t, y, binCount=24):
    # Push T into bins
    # T_bins = np.zeros((len(t), binCount))
    T_Nums = np.zeros(len(t))
    X_Nums = np.zeros(len(x))
    Y_Nums = np.zeros(len(y))

    for i in range(len(t)):
        # T_bins[i] = np.histogram(t[i], bins=binCount, range=(-1.0, 1.0))[0] # throw out bin edges
        T_Nums[i] = array_to_number(np.histogram(t[i], bins=binCount, range=(-1.0, 1.0))[0])
        X_Nums[i] = array_to_number(x[i])
        Y_Nums[i] = array_to_number(y[i])
        # cprint("- X: %d" % X_Nums[i], 'green')
        # cprint("- T: %d" % T_Nums[i], 'yellow')
        # cprint("- Y: %d" % Y_Nums[i], 'red')

    # probXT = calculate_jointProb(X_Nums, T_Nums)
    # probTY = calculate_jointProb(T_Nums, Y_Nums)

    # cprint(probXT, 'yellow')
    # cprint(probTY, 'cyan')

    # return (mutual_info_score(None, None, contingency=probXT), mutual_info_score(None, None, contingency=probTY))
    return (mutual_info_score(X_Nums, T_Nums), mutual_info_score(T_Nums, Y_Nums))
Example #23
def get_mutual_information(data, attributes, labels):
    totalMutInfo = 0.0
    bins = 9
    length = len(data[0])
    for j in range(length):
        if is_numeric(attributes[j]):
            try:
                col = pd.cut(data[:, j], bins)
            except Exception:
                col = data[:, j]
        else:
            col = data[:, j]
        mutInfo = mutual_info_score(col, labels)
        totalMutInfo = totalMutInfo + mutInfo
    return totalMutInfo / len(data[0])
Example #24
    def CalculateNormalizedMI(self, pathFile):
        listMI = []

        mi_df = pd.read_csv(pathFile)
        columns_name = mi_df.columns

        for att1 in columns_name:
            listAnMI = [att1]
            column1 = mi_df[att1].to_numpy()
            mi1 = metrics.mutual_info_score(column1, column1)
            if mi1 == 0.0:
                mi1 = 1
            for att2 in columns_name:
                if att1 == att2:
                    nmi2 = 1.0
                else:
                    column2 = mi_df[att2].to_numpy()
                    mi2 = metrics.mutual_info_score(column1, column2)
                    nmi2 = mi2 / mi1
                listAnotherAtt = [att2, nmi2]
                listAnMI.append(listAnotherAtt)
            listMI.append(listAnMI)

        return listMI
Example #25
def maxMI(labels, clust):
    max_MI = 0.0
    
    for i in range(clust.num_leaves):
        clusters = clust[i]
        cluster_labels = np.zeros(len(labels))
        
        for index,cluster_IDs in enumerate(clusters):
            cluster_labels[cluster_IDs] = index
            
        MI = metrics.mutual_info_score(labels,cluster_labels)
        if MI > max_MI:
            max_MI = MI
            
    return max_MI
Example #26
def test_discrete_mutual_info(cases: str) -> None:
    seed = 123456
    cases = int(cases)
    i = 1
    N_decimal = 4
    while i < cases:
        random_labels = np.random.RandomState(seed).randint
        n_samples = np.random.randint(1, 100)
        n_classes = np.random.randint(1, 10)
        labels_a = random_labels(low=0, high=n_classes, size=n_samples)
        labels_b = random_labels(low=0, high=n_classes, size=n_samples)
        mine_MI = discrete_mutual_info(labels_a, labels_b)
        gold_MI = mutual_info_score(labels_a, labels_b)
        assert_almost_equal(mine_MI, gold_MI, decimal=N_decimal)
        i += 1
Example #27
def mutual_information(x, y, nbins=32, normalized=False):
    """
    Compute mutual information

    :param x: 1D numpy.array : flatten data from an image
    :param y: 1D numpy.array : flatten data from an image
    :param nbins: number of bins to compute the contingency matrix (only used if normalized=False)
    :return: float non negative value : mutual information
    """
    if normalized:
        mi = normalized_mutual_info_score(x, y)
    else:
        c_xy = np.histogram2d(x, y, nbins)[0]
        mi = mutual_info_score(None, None, contingency=c_xy)
    return mi
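
For instance (a sketch, with random arrays standing in for flattened image data):

import numpy as np
from sklearn.metrics import mutual_info_score, normalized_mutual_info_score

img1 = np.random.rand(4096)
img2 = img1 + 0.05 * np.random.randn(4096)
print(mutual_information(img1, img2, nbins=32))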
Example #28
def compute_seq_mi_across_layers(layers_output, layer, bins=128):
    l = np.digitize(
        layers_output[layer],
        np.linspace(np.min(layers_output[layer]), np.max(layers_output[layer]),
                    bins))
    n = np.digitize(
        layers_output[layer + 1],
        np.linspace(np.min(layers_output[layer + 1]),
                    np.max(layers_output[layer + 1]), bins))
    layer_quantizied, layer_idx = np.unique(l, axis=0, return_inverse=True)
    next_quantizied, next_idx = np.unique(n, axis=0, return_inverse=True)
    return mutual_info_score(None,
                             None,
                             contingency=np.histogram2d(layer_idx,
                                                        next_idx)[0])
Example #29
def calculate_extrinsic_metrics(dataset, real_classes, predicted_classes):
    confusion_matrix = matriz_confusion(real_classes, predicted_classes)

    return {
        'Error': medida_error(confusion_matrix),
        'Purity': medida_pureza(confusion_matrix),
        'F1': medida_f1(confusion_matrix),
        'Entropy': medida_entropia(confusion_matrix),
        'Mutual information': metrics.mutual_info_score(real_classes, predicted_classes),
        'ARI': metrics.adjusted_rand_score(real_classes, predicted_classes),
        'Homogeneity': metrics.homogeneity_score(real_classes, predicted_classes),
        'Completeness': metrics.completeness_score(real_classes, predicted_classes),
        'V-measure': metrics.v_measure_score(real_classes, predicted_classes),
        'Fowlkes-Mallows': metrics.fowlkes_mallows_score(real_classes, predicted_classes),
        'Silhouette': metrics.silhouette_score(dataset, predicted_classes, metric='euclidean'),
        'Calinski-Harabasz': metrics.calinski_harabasz_score(dataset, predicted_classes),
        'Davies-Bouldin': davies_bouldin_score(dataset, predicted_classes),
        'mean': (medida_pureza(confusion_matrix) + medida_f1(confusion_matrix)
                 + metrics.mutual_info_score(real_classes, predicted_classes)
                 + metrics.adjusted_rand_score(real_classes, predicted_classes)
                 + metrics.homogeneity_score(real_classes, predicted_classes)
                 + metrics.completeness_score(real_classes, predicted_classes)
                 + metrics.v_measure_score(real_classes, predicted_classes)
                 + metrics.fowlkes_mallows_score(real_classes, predicted_classes)) / 8
    }
Example #30
    def fit(self, X, y=None):
        self.num_feat_ = X.shape[1]
        g = nx.Graph()
        g.add_nodes_from(range(self.num_feat_))
        for i in range(self.num_feat_):
            for j in range(i + 1, self.num_feat_):
                c_xy = np.histogram2d(X[:, i], X[:, j], self.bins)[0]
                mi = mutual_info_score(None, None, contingency=c_xy)
                g.add_edge(i, j, weight=mi)
        t = nx.minimum_spanning_tree(g)
        self.edges_ = t.edges()  # list of (i, j) tuples
        self.root_generator.fit(X[:, [0]])
        self.generators_ = [self.new_generator(X[:, [i]], X[:, [j]])
                            for i, j in self.edges_]
        return self
Example #31
    def check(self, frame_indexs: list):
        frame1, (x1, y1, w1, h1) = self.get_subtitle(frame_indexs[0])
        frame1 = frame1[y1:h1, x1:w1]
        frame2, (x2, y2, w2, h2) = self.get_subtitle(frame_indexs[1])
        frame2 = frame2[y1:h1, x1:w1]
        mutual_infor = mr.normalized_mutual_info_score(frame1.reshape(-1),
                                                       frame2.reshape(-1))
        print(mutual_infor)
        mutual_infor = mr.adjusted_mutual_info_score(np.reshape(frame1, -1),
                                                     np.reshape(frame2, -1))
        print(mutual_infor)
        mutual_infor = mr.mutual_info_score(np.reshape(frame1, -1),
                                            np.reshape(frame2, -1))
        print(mutual_infor)
        cv2.imshow(f"frames_{frame_indexs}", np.vstack((frame1, frame2)))
    def __init__(self, X, Y, method="ICAP"):
        """
        This class provides easy access to mutual information based filter feature selection.
        The default mutual information estimation algorithm used is the histogram binning method. If a more
        sophisticated approach is required, use the change_MI_estimator function to apply your own method.

        :param X: (n_samples, n_features) numpy array containing the training data
        :param Y: (n_samples) numpy array containing target labels
        :param method: filter criterion that will be applied to select the features. Available criteria are: (as string)
                       "CIFE" [Lin1996], "ICAP" [Jakulin2005], "CMIM" [Fleuret2004], "JMI"[Yang1999]
        """
        if X.shape[0] != len(Y):
            raise ValueError(
                "X must have as many samples as there are labels in Y")

        self._n_features = X.shape[1]

        def normalize_data_for_MI(X):
            for i in range(X.shape[1]):
                std = X[:, i].std()
                if std != 0.:
                    X[:, i] /= std
                    X[:, i] -= X[:, i].min()
            return np.floor(X).astype("int")

        self._X = normalize_data_for_MI(np.asarray(X))
        self._Y = np.asarray(Y)

        self._method_str = method
        self._methods = {
            "CIFE": self.__J_CIFE,
            "ICAP": self.__J_ICAP,
            "CMIM": self.__J_CMIM,
            "JMI": self.__J_JMI,
            "mRMR": self.__J_mRMR,
            "MIFS": self.__J_MIFS
        }
        self._filter_criterion_kwargs = {}
        self.change_method(method)
        self._method = self._methods[method]
        self._mutual_information_estimator = lambda X1, X2: mutual_info_score(
            X1, X2) / np.log(2.0)

        self._redundancy = np.zeros((self._n_features, self._n_features)) - 1.
        self._relevancy = np.zeros((self._n_features)) - 1
        self._class_cond_red = np.zeros(
            (self._n_features, self._n_features)) - 1
        self._class_cond_mi_method = self._calculate_class_conditional_MI
Example #33
    def eval_node_probs(self):
        """Update probability density estimates.
        """
        # Create mutual info matrix
        mutual_info = np.zeros([self.length, self.length])
        for i in range(self.length - 1):
            for j in range(i + 1, self.length):
                # DEBUGGING CODE
                try:
                    mutual_info[i, j] = -1 * mutual_info_score(
                        self.keep_sample[:, i], self.keep_sample[:, j])
                except ValueError:
                    print(f'self.keep_sample[:, i] = {self.keep_sample[:, i]}')
                    print(f'self.keep_sample[:, j] = {self.keep_sample[:, j]}')
                    raise Exception("Caught value error")

        # Find minimum spanning tree of mutual info matrix
        mst = minimum_spanning_tree(csr_matrix(mutual_info))

        # Convert minimum spanning tree to depth first tree with node 0 as root
        dft = depth_first_tree(csr_matrix(mst.toarray()), 0, directed=False)
        dft = np.round(dft.toarray(), 10)

        # Determine parent of each node
        parent = np.argmin(dft[:, 1:], axis=0)

        # Get probs
        probs = np.zeros([self.length, self.max_val, self.max_val])

        probs[0, :] = np.histogram(self.keep_sample[:, 0],
                                   np.arange(self.max_val + 1),
                                   density=True)[0]

        for i in range(1, self.length):
            for j in range(self.max_val):
                subset = self.keep_sample[np.where(
                    self.keep_sample[:, parent[i - 1]] == j)[0]]

                if not len(subset):
                    probs[i, j] = 1 / self.max_val
                else:
                    probs[i, j] = np.histogram(subset[:, i],
                                               np.arange(self.max_val + 1),
                                               density=True)[0]

        # Update probs and parent
        self.node_probs = probs
        self.parent_nodes = parent
Example #34
    def fit(self, X, y=None):
        '''
        Compute k-means clustering.
        X is n_samples x n_features.
        y is n_samples.
        '''
        n_samples, n_features = X.shape
        # Randomly select K samples as initial centers
        perm = permutation(n_samples)[:self.n_clusters]
        C = X[perm]

        # Nearest neighbor function
        def nn(x):
            return argmin(norm(C - x, axis=1))

        # Iterate 50 times
        for i in range(50):
            # Assign points to nearest cluster centers
            NN = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(C)
            dist, ind = NN.kneighbors(X)
            ind = ind.flatten()
            # ind = apply_along_axis(nn, 1, X)
            # Update the cluster centers
            for k in range(self.n_clusters):
                C[k] = mean(X[ind == k], axis=0)
        if self.debug:
            if isinstance(y, ndarray):
                print("Class membership counts:")
                print(unique(y, return_counts=True)[1])
            print("Cluster membership counts:")
            print(unique(ind, return_counts=True)[1])
        # Compute the WC_SSD

        self.WC_SSD = wc_ssd(X, C, ind)

        # Compute the SC
        self.SC = sc(X, C, ind)
        # Compute the NMI
        if isinstance(y, ndarray):
            self.NMI = nmi(y, ind)
        print("WC-SSD %.3f" % self.WC_SSD)
        print("SC %.3f" % self.SC)
        print("NMI %.3f" % self.NMI)
        if self.debug:
            print("sklearn SC %.3f" % silhouette_score(X, ind))
            if isinstance(y, ndarray):
                print("sklearn NMI %.3f" % mutual_info_score(y, ind))
        return ind
Example #35
def build_bayesian_network(df, k, epsilon):
    bn = []
    V = tuple()
    attributes = list(df.columns)
    x = np.random.choice(attributes)
    bn.append((x, list()))
    V += (x, )
    i = 1
    attributes.remove(x)
    n, d = df.shape
    S = (2 / n) * np.log2((n + 1) / 2) + (n - 1) / n * np.log2(
        (n + 1) / (n - 2))
    delta = (d - 1) * S / epsilon
    while attributes:
        if i <= k:
            parent_combos = [V]
            i += 1
        else:
            parent_combos = combinations(V, k)
        omega = dict()
        for x in attributes:
            for combo in parent_combos:
                parent_values = [
                    '\t'.join(map(str, tuple(row)))
                    for index, row in df[[x for x in combo]].iterrows()
                ]
                mi_score = mutual_info_score(parent_values, df[x])
                omega[(x, combo)] = mi_score
        omega_private = {
            key: np.exp(omega[key] / (2 * delta))
            for key in omega
        }
        total_omega_private = sum(omega_private.values())
        omega_private = {
            key: value / total_omega_private
            for key, value in omega_private.items()
        }
        keys = list(omega_private.keys())
        max_pair_index = np.random.choice(list(range(len(keys))),
                                          p=list(omega_private.values()))
        max_pair = keys[max_pair_index]
        #max_pair = max(omega, key=omega.get) #NOTE: NOT PRIVATE RIGHT NOW
        #print(max_pair)
        bn.append(max_pair)
        V += (max_pair[0], )
        #Need to remove whatever we added as we go
        attributes.remove(max_pair[0])
    return bn
Example #36
def get_metrics(test_image, true_image, top_k, total_size):
    """Computes multiple different metrics between two images.

  We compute a variety of metrics on the input image: we output L1 and L2
  distances, Wasserstein (earth movers) distance, hotspot count and f1 score for
  the provided TOP-K parameter, and an MSE error. For the correct comparison the
  images are scaled to the same size first, and then compared per coordinate.


  Args:
    test_image: obtained image to obtain the metrics
    true_image: original image to compare against the test_image.
    top_k: parameter to compute top-k hot spots.
    total_size: the size to scale the images to.

  Returns:
    l2 dist, hot spot counts, movers distance, f1-score, l1 dist, mutual info,
    MSE.
  """

    # normalize the input images
    test_image = normalize(rescale_image(test_image, total_size))
    true_image = normalize(rescale_image(true_image, total_size))

    top_k_test, top_k_test_arr = largest_indices(test_image, top_k)
    top_k_true, top_k_true_arr = largest_indices(true_image, top_k)

    l1_distance = np.linalg.norm(true_image - test_image, ord=1)
    l2_distance = np.linalg.norm(true_image - test_image, ord=2)

    mse = mt.mean_squared_error(test_image, true_image)
    top_k_diff = len(top_k_true.intersection(top_k_test))
    wasserstein = stats.wasserstein_distance(test_image.reshape(-1),
                                             true_image.reshape(-1))
    f1 = mt.f1_score(top_k_true_arr.reshape(-1), top_k_test_arr.reshape(-1))

    mutual = mt.mutual_info_score(true_image.reshape(-1),
                                  test_image.reshape(-1))

    metrics = Metrics(l1_distance=l1_distance,
                      l2_distance=l2_distance,
                      mse=mse,
                      f1=f1,
                      wasserstein=wasserstein,
                      hotspots_count=top_k_diff,
                      mutual_info=mutual)

    return metrics
Example #37
def variation_of_information(lab1, lab2):

    import sklearn.metrics as mt
    import numpy as np

    def entropy(lab):

        res = 0
        from collections import Counter
        repetitions = Counter(lab)
        for cluster in repetitions.keys():
            p = repetitions[cluster] / len(lab)
            res += -p * np.log(p)
        return res

    return entropy(lab1) + entropy(lab2) - 2 * mt.mutual_info_score(lab1, lab2)
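
Two quick checks on hand-picked toy labelings (not from the original source): the variation of information is zero for identical clusterings and positive otherwise.

print(variation_of_information([0, 0, 1, 1], [0, 0, 1, 1]))  # 0.0 (up to float error)
print(variation_of_information([0, 0, 1, 1], [0, 0, 1, 2]))  # ~0.347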
Example #38
def mutual_information(x, y, nbins=32, normalized=False):
    """
    Compute mutual information
    :param x: 1D numpy.array : flatten data from an image
    :param y: 1D numpy.array : flatten data from an image
    :param nbins: number of bins to compute the contingency matrix (only used if normalized=False)
    :return: float non negative value : mutual information
    """
    from sklearn.metrics import normalized_mutual_info_score, mutual_info_score
    if normalized:
        mi = normalized_mutual_info_score(x, y)
    else:
        c_xy = np.histogram2d(x, y, nbins)[0]
        mi = mutual_info_score(None, None, contingency=c_xy)
    # mi = adjusted_mutual_info_score(None, None, contingency=c_xy)
    return mi
Example #39
def calc_mutual_information(x, y, bins):
    try:
        if bins == -1:
            bins = doane_bin(x)
        if bins == np.inf:
            bins = sturges_bin(x)
    except ValueError:
        bins = 10
    # print("bins", bins)
    try:
        c_xy = np.histogram2d(x, y, bins)[0]
        mi = metrics.mutual_info_score(None, None, contingency=c_xy)
        # print("success")
    except Exception as e:
        print("error with mi calc", str(e))
        mi = 0
    return mi
Example #40
def build_chow_liu_tree(df, abs_weight=True):
    """
    Build a Chow-Liu tree from the data, X. n is the number of features. The weight on each edge is
    the negative of the mutual information between those features. The tree is returned as a networkx
    object.
    """
    G = nx.Graph()
    for u in df.columns:
        G.add_node(u)
        for v in df.columns:
            G.add_edge(u, v, weight=-mutual_info_score(df[u], df[v]))
    T = nx.minimum_spanning_tree(G)
    if abs_weight:
        for u, v, d in T.edges(data=True):
            T[u][v]['weight'] = abs(d['weight'])
    return T
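
A small usage sketch (the random binary DataFrame is made up; the pandas/networkx/sklearn imports are assumptions):

import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics import mutual_info_score

df = pd.DataFrame(np.random.randint(0, 2, size=(200, 4)), columns=list('abcd'))
T = build_chow_liu_tree(df)
print(sorted(T.edges(data='weight')))   # 3 tree edges with |MI| weights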
Example #41
def calc_MI(x, y, bins):
  """
  Calculate the mutual information between two distributions.

  Args:
    x (float array): true distribution
    y (float array): predicted distribution
    bins (int): number of bins to use in building a histogram of x and y

  Returns:
    float: mutual information between x and y
  """

  c_xy = np.histogram2d(x, y, bins)[0]
  mi = mutual_info_score(None, None, contingency=c_xy)
  return mi
Example #42
def test_mutualInformation():
    # This test verifies that the getMutualInfo() function returns correct values, comparing them with the ones
    # by the sklearn library

    cardinalities = [2, 2, 2, 3]
    population = [[1, 0, 1, 0], [1, 0, 1, 0], [1, 0, 1, 0], [0, 1, 1, 1],
                  [0, 1, 0, 1], [1, 1, 1, 1], [0, 0, 1, 0], [1, 1, 1, 1]]
    ad.init(0.0, len(population[0]), population, cardinalities)
    ad.addFrequenciestoMatrix(population)
    print("Mutual information with our algorithm:" +
          str(ad.getMutualInfo(1, 2)))
    print("Mutual information with SKLearn method:" + str(
        mutual_info_score(
            np.transpose(population)[1],
            np.transpose(population)[2])))
    print("\n")
Example #43
def mutual_information(labels_true, labels_pred):
    """Mutual information of distributions in format of pd.Series or pd.DataFrame.

    Args:
        labels_true: Series or DataFrame
        labels_pred: Series or DataFrame
    """
    if isinstance(labels_true, pd.DataFrame):
        labels_true = labels_true.astype(str).apply(lambda x: ' '.join(x.tolist()), axis=1)
    if isinstance(labels_pred, pd.DataFrame):
        labels_pred = labels_pred.astype(str).apply(lambda x: ' '.join(x.tolist()), axis=1)

    assert isinstance(labels_true, pd.Series)
    assert isinstance(labels_pred, pd.Series)

    return mutual_info_score(labels_true.astype(str), labels_pred.astype(str))
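
A small demonstration (toy labels, not from the original source): DataFrame inputs are collapsed row-wise into composite string labels before scoring.

import pandas as pd
from sklearn.metrics import mutual_info_score

a = pd.Series(['x', 'x', 'y', 'y'])
b = pd.DataFrame({'c1': [0, 0, 1, 1], 'c2': [1, 1, 0, 0]})
print(mutual_information(a, b))   # ln(2) ~ 0.693, since the two partitions coincide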
Example #44
def image_compare_mutualInfo(img1, img2):
    '''
    mutual info similarity
    :param img1: image path
    :param img2: image path
    :return: mutual info similarity value
    '''
    img1 = imread(img1)
    img2 = imread(img2)

    img2 = np.resize(img2, (img1.shape[0], img1.shape[1], img1.shape[2]))

    img1 = np.reshape(img1, -1)
    img2 = np.reshape(img2, -1)
    mutual_infor = mr.mutual_info_score(img1, img2)
    return mutual_infor
Example #45
def parse_seq(filename):
    seqs=utils.read_file(filename)
    X=[seq_i.split("#")[0] for seq_i in seqs]
    X=[count(x_i) for x_i in X]
    #X=[to_ngram(x_i) for x_i in X]
    y=[seq_i.split("#")[1] for seq_i in seqs]
    y=[int(y_i) for y_i in y]
    n_cats,cat_names=find_n_cats(X)
    np_hist=create_histogram(X,n_cats,cat_names)
    indicators=[get_indicator(i,np_hist) for i in range(0,n_cats)] 
    cats=[get_category(i,y) for i in range(0,n_cats)] 
    entropy_matrix=np.zeros((len(cats),len(indicators)))
    for i,cat_i in enumerate(cats):
        for j,indic_j in enumerate(indicators):
            entropy_matrix[i][j]=metrics.mutual_info_score(cat_i,indic_j)
    print(entropy_matrix)
Example #46
def sort_pivots(dict_words, data, labels):
    print("Sorting potential pivots...")
    sorted_pivots = []
    num_features = data.shape[1]
    info_scores = []
    for i in range(num_features):
        info = mutual_info_score(data[:, i], labels)
        info_scores.append(info)

    info_scores_sorted = sorted(range(len(info_scores)),
                                key=lambda i: info_scores[i],
                                reverse=True)
    for i in range(num_features):
        sorted_pivots.append(
            # sklearn >= 1.0 exposes the vocabulary as get_feature_names_out()
            dict_words.get_feature_names_out()[info_scores_sorted[i]])

    return sorted_pivots
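
A runnable sketch (the toy corpus and labels are invented; a fitted CountVectorizer is assumed to play the role of dict_words):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mutual_info_score

docs = ["good great fine", "bad awful poor", "good nice fine", "bad poor sad"]
labels = np.array([1, 0, 1, 0])
vec = CountVectorizer()
data = vec.fit_transform(docs).toarray()
print(sort_pivots(vec, data, labels)[:2])   # perfectly class-aligned words rank first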
Example #47
def main():
	args = parse_args ()

	my_data = np.genfromtxt(args.file, delimiter=',', skip_header=1, usecols=(2,3,4,5,6,7), dtype=int)
	data_true = np.genfromtxt(args.file, delimiter=',', skip_header=1, usecols=(1), dtype=int)
	
	#mean and standard deviation of lengths
	mean_l, std_div_l = min_max_lenght (my_data)

	lables = DBSCAN (my_data, args.minPts, args.eps)
	eps = args.eps
	minPts = args.minPts
	max_noise_level = 0.5
	
	lables_without_noise = lables [lables != -1]
	my_data_without_noise = my_data [lables != -1]
	silhouette = silhouette_score (my_data_without_noise, lables_without_noise)
	
	#trying to find best fitting eps and minPts, using silhouette score
	for eps_cur in  np.linspace (mean_l - 2*std_div_l, mean_l + 2*std_div_l, args.num_stad):
		for minPts_cur in np.arange (args.min, args.max, 1):
			cur_lables = DBSCAN (my_data, minPts_cur, eps_cur)
			
			my_data_without_noise = my_data [cur_lables != -1]
			lables_without_noise = cur_lables [cur_lables != -1]
			cur_silhouette = silhouette_score (my_data_without_noise, lables_without_noise)
			
			if cur_silhouette > silhouette and 1.0*(cur_lables.size - lables_without_noise.size)/cur_lables.size < max_noise_level :
				lables = cur_lables
				silhouette = cur_silhouette
				eps = eps_cur
				minPts = minPts_cur
				
	#our data without noise
	lables_without_noise = lables [lables != -1]
	my_data_without_noise = my_data [lables != -1]
	data_true_without_noise = data_true [lables != -1]
	
	print "eps = ", eps, ", minPts = ", minPts
	print "silhouette = ", silhouette
	print "number of noise = ", 1.0 *(lables.size - lables_without_noise.size)/lables.size
	print "number of clasters = ", np.unique (lables_without_noise).size
	print "purity = ", purity (lables_without_noise, data_true_without_noise)
	print "rand index = ", rand_index (lables_without_noise, data_true_without_noise)
	print "mutual info = ", mutual_info_score (lables_without_noise, data_true_without_noise)
	pass
Example #48
    def normalized_mutual_information_skl(self, fa, fb, bins=10):
        # NMI(A,B) := I(A;B) / H(A,B)

        if self.project(fa).entropy() == 0.0: return 0.0
        if self.project(fb).entropy() == 0.0: return 0.0
        # floating point issues

        distAB = self.project(lambda x: (fa(x), fb(x)))
        dbA = [val[0] for (val, prob) in distAB]
        dbB = [val[1] for (val, prob) in distAB]
        prob = [prob for (val, prob) in distAB]

        from sklearn.metrics import mutual_info_score

        c_xy = np.histogram2d(dbA, dbB, bins, weights=prob)[0]
        mi = mutual_info_score(None, None, contingency=c_xy)
        return mi
Example #49
def compute_mutual_information(prediction, target):
    """Calculates mutual information between target and prediction
    
    Parameters
    ----------
    prediction : torch.Tensor
      Predicted image
    target : torch.Tensor
      Target image
    """
    from sklearn.metrics import mutual_info_score
    p_xy, _, _ = np.histogram2d(prediction.data.cpu().numpy().flatten(),
                                target.data.cpu().numpy().flatten(),
                                bins=256,
                                range=((0, 1), (0, 1)),
                                density=True)  # numpy removed `normed`; `density` is the replacement
    return mutual_info_score(None, None, contingency=p_xy)
Example #50
    def mutual_information(self,
                           position1,
                           position2,
                           family=False,
                           normalized=False,
                           adjusted=False):
        labels_position1 = self.data_row(position1, family)
        labels_position2 = self.data_row(position2, family)
        if normalized:
            MI = metrics.normalized_mutual_info_score(labels_position1,
                                                      labels_position2)
        elif adjusted:
            MI = metrics.adjusted_mutual_info_score(labels_position1,
                                                    labels_position2)
        else:
            MI = metrics.mutual_info_score(labels_position1, labels_position2)
        return MI
Example #51
def compute_MI(x, y, bins):
    """ Compute mutual information between two vectors given custom bins.

    Parameters
    ----------
    x, y : array, 1D
        Signals to compute mutual information between.
    bins : integer or array, 1D
        Number of bins (if integer) or bin edges (if array) for 2D histogram.

    Returns
    -------
    MI : float
        Mutual information estimate.
    """
    c_xy = np.histogram2d(x, y, bins)[0]
    return mutual_info_score(None, None, contingency=c_xy)
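
For example (made-up signals; the estimate is returned in nats):

import numpy as np
from sklearn.metrics import mutual_info_score

x = np.random.randn(5000)
print(compute_MI(x, 2 * x + np.random.randn(5000), bins=32))   # large
print(compute_MI(x, np.random.randn(5000), bins=32))           # small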
Example #52
def evaluatePerSample(self, sampleNo):
    AMI = metrics.adjusted_mutual_info_score(self.labelsTrue, self.labelsPred)
    NMI = metrics.normalized_mutual_info_score(self.labelsTrue,
                                               self.labelsPred)
    MI = metrics.mutual_info_score(self.labelsTrue, self.labelsPred)
    ARI = metrics.adjusted_rand_score(self.labelsTrue, self.labelsPred)
    homogeneity = metrics.homogeneity_score(self.labelsTrue, self.labelsPred)
    completeness = metrics.completeness_score(self.labelsTrue, self.labelsPred)
    V = metrics.v_measure_score(self.labelsTrue, self.labelsPred)
    #        SC = metrics.silhouette_score(self.X, self.labelsPred, metric='sqeuclidean') #Silhouette Coefficient
    self.AMIList.append(AMI)
    self.NMIList.append(NMI)
    self.MIList.append(MI)
    self.ARIList.append(ARI)
    self.homogeneityList.append(homogeneity)
    self.completenessList.append(completeness)
    self.VList.append(V)
Example #53
def get_clustering_scoring(y, y_pred):
    scoring = {}
    if y is None:
        return scoring
    try:
        scoring['adjusted_mutual_info_score'] = \
            metrics.adjusted_mutual_info_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['adjusted_rand_score'] = \
            metrics.adjusted_rand_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['completeness_score'] = \
            metrics.completeness_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['fowlkes_mallows_score'] = \
            metrics.fowlkes_mallows_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['homogeneity_score'] = \
            metrics.homogeneity_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['mutual_info_score'] = \
            metrics.mutual_info_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['normalized_mutual_info_score'] = \
            metrics.normalized_mutual_info_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['v_measure_score'] = \
            metrics.v_measure_score(y, y_pred)
    except Exception:
        pass
    return scoring
    def __init__(self, X, Y, method="ICAP"):
        """
        This class provides easy access to mutual information based filter feature selection.
        The default mutual information estimation algorithm used is the histogram binning method. If a more
        sophisticated approach is required, use the change_MI_estimator function to apply your own method.

        :param X: (n_samples, n_features) numpy array containing the training data
        :param Y: (n_samples) numpy array containing target labels
        :param method: filter criterion that will be applied to select the features. Available criteria are: (as string)
                       "CIFE" [Lin1996], "ICAP" [Jakulin2005], "CMIM" [Fleuret2004], "JMI"[Yang1999]
        """
        if X.shape[0] != len(Y):
            raise ValueError("X must have as many samples as there are labels in Y")

        self._n_features = X.shape[1]

        def normalize_data_for_MI(X):
            for i in range(X.shape[1]):
                std = X[:, i].std()
                if std != 0.:
                    X[:, i] /= std
                    X[:, i] -= X[:, i].min()
            return np.floor(X).astype("int")
        
        self._X = normalize_data_for_MI(np.asarray(X))
        self._Y = np.asarray(Y)
        
        self._method_str = method
        self._methods = {
            "CIFE": self.__J_CIFE,
            "ICAP": self.__J_ICAP,
            "CMIM": self.__J_CMIM,
            "JMI": self.__J_JMI,
            "mRMR": self.__J_mRMR,
            "MIFS": self.__J_MIFS
        }
        self._filter_criterion_kwargs = {}
        self.change_method(method)
        self._method = self._methods[method]
        self._mutual_information_estimator = lambda X1, X2: mutual_info_score(X1,X2)/np.log(2.0)

        self._redundancy = np.zeros((self._n_features, self._n_features)) - 1.
        self._relevancy = np.zeros((self._n_features)) - 1
        self._class_cond_red = np.zeros((self._n_features, self._n_features)) - 1
        self._class_cond_mi_method = self._calculate_class_conditional_MI
Example #55
    def mutual_information(self, max_lag):
        """
        Uses numpy's mutual information
        """

        digi = utilities.mi_digitize(self.X)

        mi = np.empty(max_lag)

        for i in range(max_lag):

            ind = i + 1
            unshift = digi[ind:]
            shift = digi[0:-ind]

            mi[i] = skmetrics.mutual_info_score(unshift, shift)

        return mi
Example #56
def algorithm(X_train, X_test, y_train, y_test):
    clf_dt = tree.DecisionTreeClassifier()
    clf_dt.fit(X_train, y_train)

    predicted_y = clf_dt.predict(X_test)
    clf_dt.score(X_test, y_test)

    mutual_score = mutual_info_score(y_test, predicted_y)
    roc_score2 = roc_auc_score(y_test, predicted_y)

    X_train['Unnamed: 5'] = X_train['Unnamed: 5'].astype(float)
    X_train['Unnamed: 6'] = X_train['Unnamed: 6'].astype(float)
    X_test['Unnamed: 5'] = X_test['Unnamed: 5'].astype(float)
    X_test['Unnamed: 6'] = X_test['Unnamed: 6'].astype(float)

    print('mutual score = ', mutual_score)
    print('roc_auc_score = ', roc_score2)
    return mutual_score, roc_score2
Example #57
def _mutual_info_compute(when, then, df):
    """
    Computes the metric results.

    :param when:
    :type when: str/int
    :param then:
    :type then: str/int
    :param df:
    :type df: DataFrame
    :return: Result of the metric.
    """
    index = (df[when].isna()) | (df[then].isna())
    index = ~index
    if sum(index) > 0:
        return mutual_info_score(df[when][index], df[then][index])
    else:
        return 0
Example #58
def find_tau(data):
    data = data.reshape((len(data), 1))
    kbins = KBinsDiscretizer(n_bins=1000, encode='ordinal', strategy='uniform')
    data_trans = kbins.fit_transform(data)

    # find usable time delay via mutual information
    tau_max = 50
    mis = []
    data = data_trans.reshape(len(data_trans))

    for tau in range(1, tau_max):
        unlagged = data[:-tau]
        lagged = np.roll(data, -tau)[:-tau]
        mis.append(metrics.mutual_info_score(lagged, unlagged))

        if len(mis) > 1 and mis[-2] < mis[-1]:  # return first local minima
            tau -= 1
            return tau
Example #59
def select_using_MI(features, labels, threshold=0.01, ham_label=-1):
    '''
    Returns indices of the most salient features for the ham class, using a
    mutual information score between feature values and class label, and
    from the highest scoring, filtering the ones that are most present
    in spam relatively. This makes sense since we then use these indices
    to choose which features to turn on in emails

    TODO or I could keep all the highest MI score features, and if more
         present in ham, I set it to 1, else I set it to zero
         requires extra array somewhere though
         Thinking about it, this is already what is happening since the ham
         malicious instances already have all their features set to zero

         (but this proportion of features controlled by the attacker could
         vary? depending on the attacker's dataset knowledge? i.e. malicious
         instances' feature values could be initialised randomly, or drawn from a
         spammy distribution to mimic an email that still has malicious
         potential (although this isn't necessary since we are doing a poison,
         not evasion attack); then compare this to initialised with 0s or with 1s
    '''

    X, Y = features, np.ravel(labels)
    N, D = X.shape
    d = int(D * threshold) ## keep top d features with highest score
    tls.logger.info('Keep top %s features' % d)

    ## calculate frequency of feature presence relative to each class
    ham_freq  = np.mean(X[np.ravel(Y == ham_label)], axis=0)
    spam_freq = np.mean(X[np.ravel(Y != ham_label)], axis=0)
    tls.logger.debug('- feature frequency in ham class: %s' % ham_freq)
    tls.logger.debug('- feature frequency in spam class: %s' % spam_freq)

    ## calculate mutual information between features and labels
    MI_per_feature = (mutual_info_score(X[:, f], Y) for f in range(D))
    MI_per_feature = np.fromiter(MI_per_feature, dtype=np.float16)
    tls.logger.debug('- mutual information scores: %s' % MI_per_feature)

    ## keep only salient features for ham (according to relative presence in that class)
    MI_per_feature[ham_freq < spam_freq] = 0
    salient_indices = np.argpartition(MI_per_feature, -d)[-d:]
    ## ^ https://stackoverflow.com/questions/10337533/a-fast-way-to-find-the-largest-n-elements-in-an-numpy-array/20177786#20177786

    return salient_indices
Example #60
def calc_MI(x, y, bins=None, maxvalues=41):
    "Calculates the mutual information between two variables."

    # avoid a mutable default argument: the bin counts may be overwritten below,
    # and a shared default list would leak those changes across calls
    if bins is None:
        bins = [25, 25]

    # maxvalues determines the cutoff between real and discrete
    # for discrete information it makes sense to use the number of unique values as the number of bins
    # if there are only two classes we want to use two bins, not say 25 bins
    num_unique_values_x = pd.value_counts(x).size
    num_unique_values_y = pd.value_counts(y).size
    if num_unique_values_x < maxvalues:
        bins[0] = num_unique_values_x
    if num_unique_values_y < maxvalues:
        bins[1] = num_unique_values_y

    # create the 2d histogram needed to calculate the mutual information via sklearn
    # [0] gets the histogram, [1] & [2] are binning info.
    c_xy = np.histogram2d(x, y, bins=bins)[0]
    # use scikit learn to calculate the mi value
    mi = mutual_info_score(None, None, contingency=c_xy)
    return mi