def select_using_MI(features, labels, threshold=.1, ham_label=0):
    '''
    Returns indices of the most salient features for the ham class, using
    a mutual information score between feature values and class label, and
    from the highest scoring, filtering the ones that are most present
    in spam relatively. This makes sense since we then use these indices
    to choose which features to turn on in emails.
    '''
    X, Y = features, np.ravel(labels)
    N, D = X.shape
    d = int(D * threshold)  ## percentile of features to keep

    ## calculate frequency of feature presence relative to each class
    ham_freq  = np.mean(X[np.ravel(Y == ham_label)], axis=0)
    spam_freq = np.mean(X[np.ravel(Y != ham_label)], axis=0)

    ## calculate mutual information between features and labels
    MI_per_feature = (mutual_info_score(X[:, f], Y) for f in range(D))
    MI_per_feature = np.fromiter(MI_per_feature, dtype=np.float16)

    ## keep only salient features for ham (according to relative presence in that
    ## class): zero out the rest so the returned indices stay aligned with the
    ## original feature columns
    MI_per_feature[ham_freq <= spam_freq] = 0
    salient_indices = np.argpartition(MI_per_feature, -d)[-d:]
    ## ^ https://stackoverflow.com/questions/10337533/a-fast-way-to-find-the-largest-n-elements-in-an-numpy-array/20177786#20177786
    return salient_indices
def mutualInformation_1to1(x, y, bins):
    """Computation of the mutual information between two time-series.

    Parameters
    ----------
    x: array_like, shape (N,)
        time series to compute difference.
    y: array_like, shape (N,)
        time series to compute difference.
    bins: int or tuple of ints or tuple of arrays
        the binning information.

    Returns
    -------
    mi: float
        the measure of mutual information between the two time series.

    """
    ## 1. Discretization
    if bins is None:
        # No binning information given: fall back to numpy's default binning
        c_xy = np.histogram2d(x, y)[0]
    else:
        c_xy = np.histogram2d(x, y, bins)[0]
    ## 2. Compute mutual information from contingency matrix
    mi = mutual_info_score(None, None, contingency=c_xy)
    return mi
def ami(x, y=None, n_bins=10):
    """Calculate the average mutual information between $x(t)$ and $y(t)$.

    Parameters
    ----------
    x : array-like
    y : array-like, optional
        $x(t)$ and $y(t)$.
        If only `x` is passed, it must have two columns;
        the first column defines $x(t)$ and the second $y(t)$.
    n_bins : int
        The number of bins to use when computing the joint histogram.

    Returns
    -------
    scalar
        Average mutual information between $x(t)$ and $y(t)$,
        in nats (natural log equivalent of bits).

    See Also
    --------
    lagged_ami

    References
    ----------
    Abarbanel, H. D. I. (1996). *Analysis of Observed Chaotic Data*
    (p. 28). New York: Springer.
    """
    x, y = _vector_pair(x, y)
    if x.shape[0] != y.shape[0]:
        raise ValueError('timeseries must have the same length')

    return metrics.mutual_info_score(
        None, None, contingency=np.histogram2d(x, y, bins=n_bins)[0])
def calc_MI(x, y, bins):
    "Calculates the mutual information between two variables."
    # [0] gets the histogram, [1] & [2] are binning info.
    c_xy = np.histogram2d(x, y, bins)[0]
    # use scikit-learn to compute the mutual information from the contingency table
    mi = mutual_info_score(None, None, contingency=c_xy)
    return mi
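# Usage sketch (an added illustration, not from the original source): exercise calc_MI on
# correlated vs. independent signals. Assumes numpy and sklearn.metrics.mutual_info_score
# are imported in this module, as the function above already requires.
import numpy as np
_rng = np.random.default_rng(0)
_a = _rng.normal(size=1000)
_b = _a + 0.5 * _rng.normal(size=1000)   # correlated with _a
_c = _rng.normal(size=1000)              # independent of _a
print(calc_MI(_a, _b, bins=20))          # clearly larger than the independent case
print(calc_MI(_a, _c, bins=20))          # close to zero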
def __treemimic__(fittest, n_objects):
    fittest_count, last_drawn, medoid, counters, remaining, funcs = __mimic_setup__(fittest, n_objects)

    included = list(filter(lambda x: x not in remaining, range(n_objects)))
    dependency_tree = {x: [] for x in range(n_objects)}

    compiled_mutual_information = np.ones((n_objects, n_objects), dtype=float)
    for i in range(n_objects):
        for j in range(0, i + 1):
            compiled_mutual_information[i, j] = mutual_info_score(medoid[i], medoid[j])
            # entropy(medoid[i], counters[i], medoid[j], counters[j])
            compiled_mutual_information[j, i] = compiled_mutual_information[i, j]

    # ----------------------------------------------------------------------------------
    while len(remaining) > 0:
        smallest_entropy = np.inf
        for r in remaining:
            for i in included:
                some_entropy = compiled_mutual_information[r, i]
                if some_entropy < smallest_entropy:
                    smallest_entropy = some_entropy
                    drawn = r

        dependency_tree[last_drawn].append(drawn)
        included.append(drawn)

        last_drawn, remaining, funcs = __mimic_link__(fittest_count, last_drawn, drawn, counters, remaining, funcs)

    return dict(funcs), included, dependency_tree
def mi(x, y, bins=10):
    """Mutual information between x and y"""
    H_x = u.compute_entropy(np.histogram(x, bins)[0])
    H_y = u.compute_entropy(np.histogram(y, bins)[0])
    c_xy = np.histogram2d(x, y, bins)[0]
    mi = skm.mutual_info_score(None, None, contingency=c_xy)
    return mi / np.sqrt(H_x * H_y)
def mutual_information(x, y, bins=1000):
    """
    Return the mutual information of the data in x, y
    """
    hist, xedges, yedges = np.histogram2d(x, y, bins=bins)
    # hist /= len(x)
    return mutual_info_score(None, None, contingency=hist)
def MI_lag(bins, x, y, max_lag):
    # first need to ensure vectors are made the same length
    # always ensure that y is the wind vector and x is the sand
    len_diff = len(y) - len(x)
    print(len_diff)
    if len_diff > 0:
        y = y[:-len_diff]
    elif len_diff < 0:
        x = x[:len_diff]

    mi_vec = []
    for lag in range(max_lag):
        x_info = x[lag:]
        y_info = y[0: len(y) - lag]
        c_xy = np.histogram2d(x_info, y_info, bins)[0]
        mi_val = mutual_info_score(None, None, contingency=c_xy)
        mi_vec.append(mi_val)
        print(lag, len(mi_vec))

    return np.array(mi_vec)
def __mimic__(fittest, n_objects):
    fittest_count, last_drawn, medoid, counters, remaining, funcs = __mimic_setup__(fittest, n_objects)

    dependency_tree = {x: [] for x in range(n_objects)}

    # ----------------------------------------------------------------------------------
    sampling_order = [last_drawn]
    while len(remaining) > 0:
        entropies = []
        for x in remaining:
            entropies += [mutual_info_score(medoid[x], medoid[last_drawn])]
        # entropies = map(
        #     lambda x: entropy(medoid[x], counters[x], medoid[last_drawn], counters[last_drawn]),
        #     remaining
        # )
        drawn = remaining[np.argmin(entropies)]
        sampling_order.append(drawn)
        dependency_tree[last_drawn].append(drawn)

        last_drawn, remaining, funcs = __mimic_link__(fittest_count, last_drawn, drawn, counters, remaining, funcs)

    return dict(funcs), sampling_order, dependency_tree
def NMI_by_max(labels, cluster_labels, label_H=None):
    if not label_H:
        label_H = label_entropy(labels)
    cluster_H = label_entropy(cluster_labels)
    MI = metrics.mutual_info_score(labels, cluster_labels)
    return MI / max(label_H, cluster_H)
def NMI_by_mean(labels, cluster_labels, label_H=None):
    if not label_H:
        label_H = label_entropy(labels)
    cluster_H = label_entropy(cluster_labels)
    MI = metrics.mutual_info_score(labels, cluster_labels)
    return 2.0 * MI / (label_H + cluster_H)
def bench_cluster(X, y, pca_n_comp):
    n = len(np.unique(y))
    pca = PCA(pca_n_comp)
    X_ = pca.fit_transform(X)
    sc = SpectralClustering(n)
    km = KMeans(n)
    sc_pred = sc.fit_predict(X_)
    km_pred = km.fit_predict(X_)
    distances = PairwiseDistances(X_.tolist())
    distances = ExplicitDistances(distances)
    singlel_pred = fcluster(linkage(ssd.squareform(distances.distances)), n, criterion='maxclust')
    print("single-linkage clustering prediction:", singlel_pred)
    print("single-linkage clustering score:", adjusted_rand_score(y, singlel_pred), mutual_info_score(y, singlel_pred))
    print("spectral clustering prediction:", sc_pred)
    print("spectral clustering score:", adjusted_rand_score(y, sc_pred), mutual_info_score(y, sc_pred))
    print("kmeans clustering prediction:", km_pred)
    print("kmeans clustering score:", adjusted_rand_score(y, km_pred), mutual_info_score(y, km_pred))
    print("ground truth labels:", y)
def get_feature_mi(data, threshold):
    correlation_dict = defaultdict(list)
    for feature in data.columns:
        for other in data.columns:
            if other == feature:
                continue
            mi = metrics.mutual_info_score(data[feature].values, data[other].values)
            if mi >= threshold:
                correlation_dict[feature].append(other)
    return correlation_dict
def chow_liu_tree(y_):
    n_labels = y_.shape[1]
    mi = np.zeros((n_labels, n_labels))
    for i in range(n_labels):
        for j in range(n_labels):
            mi[i, j] = mutual_info_score(y_[:, i], y_[:, j])
    mst = minimum_spanning_tree(csr_matrix(-mi))
    edges = np.vstack(mst.nonzero()).T
    edges.sort(axis=1)
    return edges
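# Usage sketch (an added illustration, not from the original source): build the Chow-Liu
# edge list for a small synthetic binary label matrix. Assumes the same imports the
# function above relies on (numpy, scipy.sparse.csr_matrix,
# scipy.sparse.csgraph.minimum_spanning_tree, sklearn.metrics.mutual_info_score).
import numpy as np
_rng = np.random.default_rng(1)
_base = _rng.integers(0, 2, size=(200, 1))
_labels = np.hstack([_base, _base ^ _rng.integers(0, 2, size=(200, 3))])  # 4 correlated binary labels
print(chow_liu_tree(_labels))  # one (i, j) row per edge of the learned tree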
def mutual_information(self, max_lag):
    """
    Calculates the mutual information between an unshifted time series
    and a shifted time series. Utilizes scikit-learn's implementation of
    the mutual information found in sklearn.metrics.

    Parameters
    ----------
    X : 1-D array
        time series that is to be shifted over
    max_lag : integer
        maximum amount to shift the time series

    Returns
    -------
    m_score : 1-D array
        mutual information between the unshifted time series and the
        shifted time series
    """

    # number of bins - say ~ 20 pts / bin for joint distribution
    # and that at least 4 bins are required
    N = max(self.X.shape)
    num_bins = max(4., np.floor(np.sqrt(N / 20)))
    num_bins = int(num_bins)

    m_score = np.zeros((max_lag))

    for jj in range(max_lag):
        lag = jj + 1

        ts = self.X[0:-lag]
        ts_shift = self.X[lag:]

        min_ts = np.min(self.X)
        max_ts = np.max(self.X) + .0001  # needed to bin them up

        bins = np.linspace(min_ts, max_ts, num_bins + 1)

        bin_tracker = np.zeros_like(ts)
        bin_tracker_shift = np.zeros_like(ts_shift)

        for ii in range(num_bins):
            locs = np.logical_and(ts >= bins[ii], ts < bins[ii + 1])
            bin_tracker[locs] = ii

            locs_shift = np.logical_and(ts_shift >= bins[ii], ts_shift < bins[ii + 1])
            bin_tracker_shift[locs_shift] = ii

        m_score[jj] = skmetrics.mutual_info_score(bin_tracker, bin_tracker_shift)
    return m_score
def main():
    ## Synthetic Example
    clustering1 = {
        '1': {'1', '2', '3', '4'},
        '2': {'1', '2', '3', '4'},
        '3': {'1', '2', '3', '4'},
        '4': {'1', '2', '3', '4'},
        '5': {'5', '6', '7'},
        '6': {'5', '6', '7'},
        '7': {'5', '6', '7'},
    }
    clustering2 = {
        '1': {'1', '2', '3'},
        '2': {'1', '2', '3'},
        '3': {'1', '2', '3'},
        '4': {'4', '5', '6', '7'},
        '5': {'4', '5', '6', '7'},
        '6': {'4', '5', '6', '7'},
        '7': {'4', '5', '6', '7'},
    }
    labels1_dict = links_to_labels(clustering1)
    labels2_dict = links_to_labels(clustering2)
    clustering_combined = consensus_clustering([clustering1, clustering2])
    labels_combined_dict = clusters_to_labels(clustering_combined)

    labels1 = []
    labels2 = []
    labelsCombined = []
    for doc_id in range(1, len(labels1_dict) + 1):
        labels1.append(labels1_dict[str(doc_id)])
        labels2.append(labels2_dict[str(doc_id)])
        labelsCombined.append(labels_combined_dict[str(doc_id)])

    print('Mutual information between labels1 and labels2 is ' + str(mutual_info_score(labels1, labels2)))

    ## Real data
    print('Loading HT data')
    phone_clusters = pickle.load(open('Consensus/WadePresentation/phone_cluster.pkl', 'rb'))
    text_clusters = pickle.load(open('Consensus/WadePresentation/text_cluster.pkl', 'rb'))

    print('Converting to labels')
    phone_labels_dict = links_to_labels(phone_clusters)
    text_labels_dict = links_to_labels(text_clusters)
    phone_labels = []
    text_labels = []
    for doc_id in range(0, len(phone_labels_dict)):
        phone_labels.append(phone_labels_dict[str(doc_id)])
        text_labels.append(text_labels_dict[str(doc_id)])

    print('Running consensus clustering...')
    clustering_combined = consensus_clustering([phone_clusters, text_clusters])
    print('Number of consensus clusters: ' + str(len(clustering_combined)))
    nrecords = 0
    for cluster in clustering_combined:
        nrecords += len(cluster)
    print('Checking number of records: ' + str(nrecords))
def clusterize(models):
    labels_true = [2, 0, 3, 2, 2, 0, 1, 2, 3]
    clusters = KMeans(n_clusters=4).fit_predict(models)
    rand = metrics.adjusted_rand_score(labels_true, clusters)
    mi = metrics.mutual_info_score(labels_true, clusters)
    homogeneity = metrics.homogeneity_score(labels_true, clusters)
    completeness = metrics.completeness_score(labels_true, clusters)
    scores_list = [rand, mi, homogeneity, completeness]
    scores_names = ['Adjusted Rand index', 'Mutual information score', 'Homogeneity', 'Cluster completeness']
    scores = pd.DataFrame(data=scores_list, index=scores_names, dtype=object)
    return clusters, scores
def _generate_mutual_information_graph(self):
    samples = np.asarray(self.samples)
    complete_graph = nx.complete_graph(samples.shape[1])

    for edge in complete_graph.edges():
        mutual_info = mutual_info_score(
            samples[:, edge[0]],
            samples[:, edge[1]]
        )
        # networkx >= 2.0 exposes edge attributes via G.edges[u, v]
        complete_graph.edges[edge[0], edge[1]]['weight'] = -mutual_info

    return complete_graph
def chow_liu(X):
    """
    Chow-Liu structure learning algorithm.
    :param X: dataset
    :return: the learned graph (tree)
    """
    n_objects = X.shape[0]
    n_vars = X.shape[1]

    g = nx.complete_graph(n_vars)
    for i, j in g.edges():
        # networkx >= 2.0 exposes edge attributes via g.edges[i, j]
        g.edges[i, j]['mutual information'] = mutual_info_score(X[:, i], X[:, j])
    g = maximum_spanning_tree(g, weight='mutual information')

    return g
def get_score(node_set, data_dict, label):
    '''
    Calculate Mutual information score
    '''
    mat = []
    for each in node_set:
        mat.append(data_dict[each])
    mat = np.array(mat)
    mean_col = mat.mean(axis=0)
    count = int(np.log2(len(mean_col)) + 1)
    bins = np.histogram(mean_col, count)[1]
    index = np.digitize(mean_col, bins)
    values = [bins[i - 1] for i in index]
    score = metrics.mutual_info_score(label, values)
    return score
def calculateMI_Plane(x, t, y, binCount=24):
    # Push T into bins
    # T_bins = np.zeros((len(t), binCount))
    T_Nums = np.zeros(len(t))
    X_Nums = np.zeros(len(x))
    Y_Nums = np.zeros(len(y))

    for i in range(len(t)):
        # T_bins[i] = np.histogram(t[i], bins=binCount, range=(-1.0, 1.0))[0]  # throw out bin edges
        T_Nums[i] = array_to_number(np.histogram(t[i], bins=binCount, range=(-1.0, 1.0))[0])
        X_Nums[i] = array_to_number(x[i])
        Y_Nums[i] = array_to_number(y[i])
        # cprint("- X: %d" % X_Nums[i], 'green')
        # cprint("- T: %d" % T_Nums[i], 'yellow')
        # cprint("- Y: %d" % Y_Nums[i], 'red')

    # probXT = calculate_jointProb(X_Nums, T_Nums)
    # probTY = calculate_jointProb(T_Nums, Y_Nums)
    # cprint(probXT, 'yellow')
    # cprint(probTY, 'cyan')
    # return (mutual_info_score(None, None, contingency=probXT), mutual_info_score(None, None, contingency=probTY))

    return (mutual_info_score(X_Nums, T_Nums), mutual_info_score(T_Nums, Y_Nums))
def get_mutual_information(data, attributes, labels):
    totalMutInfo = 0.0
    bins = 9
    length = len(data[0])
    for j in range(length):
        if is_numeric(attributes[j]):
            try:
                col = pd.cut(data[:, j], bins)
            except:
                col = data[:, j]
        else:
            col = data[:, j]
        mutInfo = mutual_info_score(col, labels)
        totalMutInfo = totalMutInfo + mutInfo

    return totalMutInfo / len(data[0])
def CalculateNormalizedMI(self, pathFile):
    listMI = []
    mi_df = pd.read_csv(pathFile)
    columns_name = mi_df.columns
    for att1 in columns_name:
        listAnMI = [att1]
        column1 = mi_df[att1].to_numpy()
        # MI of a column with itself equals its entropy; used as the normaliser
        mi1 = metrics.mutual_info_score(column1, column1)
        if mi1 == 0.0:
            mi1 = 1
        for att2 in columns_name:
            if att1 == att2:
                nmi2 = 1.0
            else:
                column2 = mi_df[att2].to_numpy()
                mi2 = metrics.mutual_info_score(column1, column2)
                nmi2 = mi2 / mi1
            listAnotherAtt = [att2, nmi2]
            listAnMI.append(listAnotherAtt)
        listMI.append(listAnMI)
    return listMI
def maxMI(labels, clust):
    max_MI = 0.0
    for i in range(clust.num_leaves):
        clusters = clust[i]
        cluster_labels = np.zeros(len(labels))
        for index, cluster_IDs in enumerate(clusters):
            cluster_labels[cluster_IDs] = index
        MI = metrics.mutual_info_score(labels, cluster_labels)
        if MI > max_MI:
            max_MI = MI
    return max_MI
def test_discrete_mutual_info(cases: str) -> None:
    seed = 123456
    cases = int(cases)
    i = 1
    N_decimal = 4
    while i < cases:
        random_labels = np.random.RandomState(seed).randint
        n_samples = np.random.randint(1, 100)
        n_classes = np.random.randint(1, 10)

        labels_a = random_labels(low=0, high=n_classes, size=n_samples)
        labels_b = random_labels(low=0, high=n_classes, size=n_samples)

        mine_MI = discrete_mutual_info(labels_a, labels_b)
        gold_MI = mutual_info_score(labels_a, labels_b)

        assert_almost_equal(mine_MI, gold_MI, decimal=N_decimal)
        i += 1
def mutual_information(x, y, nbins=32, normalized=False):
    """
    Compute mutual information

    :param x: 1D numpy.array : flatten data from an image
    :param y: 1D numpy.array : flatten data from an image
    :param nbins: number of bins to compute the contingency matrix (only used if normalized=False)
    :param normalized: if True, use sklearn's normalized_mutual_info_score on the raw values instead
    :return: float non negative value : mutual information
    """
    if normalized:
        mi = normalized_mutual_info_score(x, y)
    else:
        c_xy = np.histogram2d(x, y, nbins)[0]
        mi = mutual_info_score(None, None, contingency=c_xy)
    return mi
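# Usage sketch (an added illustration, not from the original source): compare an image
# vector against a noisy copy of itself. Assumes numpy and the sklearn scores are imported
# as the function above expects.
import numpy as np
_rng = np.random.default_rng(2)
_img = _rng.random(64 * 64)
_noisy = _img + 0.05 * _rng.standard_normal(_img.size)
print(mutual_information(_img, _noisy, nbins=32))   # histogram-based MI, in nats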
def compute_seq_mi_across_layers(layers_output, layer, bins=128):
    l = np.digitize(
        layers_output[layer],
        np.linspace(np.min(layers_output[layer]), np.max(layers_output[layer]), bins))
    n = np.digitize(
        layers_output[layer + 1],
        np.linspace(np.min(layers_output[layer + 1]), np.max(layers_output[layer + 1]), bins))
    layer_quantizied, layer_idx = np.unique(l, axis=0, return_inverse=True)
    next_quantizied, next_idx = np.unique(n, axis=0, return_inverse=True)
    return mutual_info_score(None, None, contingency=np.histogram2d(layer_idx, next_idx)[0])
def calculate_extrinsic_metrics(dataset, real_classes, predicted_classes):
    confusion_matrix = matriz_confusion(real_classes, predicted_classes)
    return {
        'Error': medida_error(confusion_matrix),
        'Purity': medida_pureza(confusion_matrix),
        'F1': medida_f1(confusion_matrix),
        'Entropy': medida_entropia(confusion_matrix),
        'Mutual information': metrics.mutual_info_score(real_classes, predicted_classes),
        'ARI': metrics.adjusted_rand_score(real_classes, predicted_classes),
        'Homogeneity': metrics.homogeneity_score(real_classes, predicted_classes),
        'Completeness': metrics.completeness_score(real_classes, predicted_classes),
        'V-measure': metrics.v_measure_score(real_classes, predicted_classes),
        'Fowlkes-Mallows': metrics.fowlkes_mallows_score(real_classes, predicted_classes),
        'Silhouette': metrics.silhouette_score(dataset, predicted_classes, metric='euclidean'),
        'Calinski-Harabasz': metrics.calinski_harabasz_score(dataset, predicted_classes),
        'Davies-Bouldin': davies_bouldin_score(dataset, predicted_classes),
        'mean': (medida_pureza(confusion_matrix) + medida_f1(confusion_matrix)
                 + metrics.mutual_info_score(real_classes, predicted_classes)
                 + metrics.adjusted_rand_score(real_classes, predicted_classes)
                 + metrics.homogeneity_score(real_classes, predicted_classes)
                 + metrics.completeness_score(real_classes, predicted_classes)
                 + metrics.v_measure_score(real_classes, predicted_classes)
                 + metrics.fowlkes_mallows_score(real_classes, predicted_classes)) / 8
    }
def fit(self, X, y=None):
    self.num_feat_ = X.shape[1]
    g = nx.Graph()
    g.add_nodes_from(range(self.num_feat_))
    for i in range(self.num_feat_):
        for j in range(i + 1, self.num_feat_):
            c_xy = np.histogram2d(X[:, i], X[:, j], self.bins)[0]
            mi = mutual_info_score(None, None, contingency=c_xy)
            g.add_edge(i, j, weight=mi)
    t = nx.minimum_spanning_tree(g)
    self.edges_ = t.edges()  # list of (i, j) tuples
    self.root_generator.fit(X[:, [0]])
    self.generators_ = [self.new_generator(X[:, [i]], X[:, [j]])
                        for i, j in self.edges_]
    return self
def check(self, frame_indexs: list):
    frame1, (x1, y1, w1, h1) = self.get_subtitle(frame_indexs[0])
    frame1 = frame1[y1:h1, x1:w1]
    frame2, (x2, y2, w2, h2) = self.get_subtitle(frame_indexs[1])
    # the first frame's box is reused here so both crops cover the same region
    frame2 = frame2[y1:h1, x1:w1]
    mutual_infor = mr.normalized_mutual_info_score(frame1.reshape(-1), frame2.reshape(-1))
    print(mutual_infor)
    mutual_infor = mr.adjusted_mutual_info_score(np.reshape(frame1, -1), np.reshape(frame2, -1))
    print(mutual_infor)
    mutual_infor = mr.mutual_info_score(np.reshape(frame1, -1), np.reshape(frame2, -1))
    print(mutual_infor)
    cv2.imshow(f"frames_{frame_indexs}", np.vstack((frame1, frame2)))
def __init__(self, X, Y, method="ICAP"): """ This class provides easy access to mutual information based filter feature selection. The default mutual information estimation algorithm used is the histogram binning method. If a more sophisticated approach is required, use the change_MI_estimator function to apply your own method. :param X: (n_samples, n_features) numpy array containing the training data :param Y: (n_samples) numpy array containing target labels :param method: filter criterion that will be applied to select the features. Available criteria are: (as string) "CIFE" [Lin1996], "ICAP" [Jakulin2005], "CMIM" [Fleuret2004], "JMI"[Yang1999] """ if X.shape[0] != len(Y): raise ValueError( "X must have as many samples as there are labels in Y") self._n_features = X.shape[1] def normalize_data_for_MI(X): for i in xrange(X.shape[1]): std = X[:, i].std() if std != 0.: X[:, i] /= std X[:, i] -= X[:, i].min() return np.floor(X).astype("int") self._X = normalize_data_for_MI(np.asarray(X)) self._Y = np.asarray(Y) self._method_str = method self._methods = { "CIFE": self.__J_CIFE, "ICAP": self.__J_ICAP, "CMIM": self.__J_CMIM, "JMI": self.__J_JMI, "mRMR": self.__J_mRMR, "MIFS": self.__J_MIFS } self._filter_criterion_kwargs = {} self.change_method(method) self._method = self._methods[method] self._mutual_information_estimator = lambda X1, X2: mutual_info_score( X1, X2) / np.log(2.0) self._redundancy = np.zeros((self._n_features, self._n_features)) - 1. self._relevancy = np.zeros((self._n_features)) - 1 self._class_cond_red = np.zeros( (self._n_features, self._n_features)) - 1 self._class_cond_mi_method = self._calculate_class_conditional_MI
def eval_node_probs(self):
    """Update probability density estimates.
    """
    # Create mutual info matrix
    mutual_info = np.zeros([self.length, self.length])
    for i in range(self.length - 1):
        for j in range(i + 1, self.length):
            # DEBUGGING CODE
            try:
                mutual_info[i, j] = -1 * mutual_info_score(
                    self.keep_sample[:, i],
                    self.keep_sample[:, j])
            except ValueError:
                print(f'self.keep_sample[:, i] = {self.keep_sample[:, i]}')
                print(f'self.keep_sample[:, j] = {self.keep_sample[:, j]}')
                raise Exception("Caught value error")

    # Find minimum spanning tree of mutual info matrix
    mst = minimum_spanning_tree(csr_matrix(mutual_info))

    # Convert minimum spanning tree to depth first tree with node 0 as root
    dft = depth_first_tree(csr_matrix(mst.toarray()), 0, directed=False)
    dft = np.round(dft.toarray(), 10)

    # Determine parent of each node
    parent = np.argmin(dft[:, 1:], axis=0)

    # Get probs
    probs = np.zeros([self.length, self.max_val, self.max_val])
    probs[0, :] = np.histogram(self.keep_sample[:, 0],
                               np.arange(self.max_val + 1),
                               density=True)[0]

    for i in range(1, self.length):
        for j in range(self.max_val):
            subset = self.keep_sample[np.where(
                self.keep_sample[:, parent[i - 1]] == j)[0]]

            if not len(subset):
                probs[i, j] = 1 / self.max_val
            else:
                probs[i, j] = np.histogram(subset[:, i],
                                           np.arange(self.max_val + 1),
                                           density=True)[0]

    # Update probs and parent
    self.node_probs = probs
    self.parent_nodes = parent
def fit(self, X, y=None):
    ''' Compute k-means clustering.
        X is n_samples x n_features. y is n_samples. '''
    n_samples, n_features = X.shape

    # Randomly select K samples as initial centers
    perm = permutation(n_samples)[:self.n_clusters]
    C = X[perm]

    # Nearest neighbor function
    def nn(x):
        return argmin(norm(C - x, axis=1))

    # Iterate 50 times
    for i in range(50):
        # Assign points to nearest cluster centers
        NN = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(C)
        dist, ind = NN.kneighbors(X)
        ind = ind.flatten()
        # ind = apply_along_axis(nn, 1, X)

        # Update the cluster centers
        for k in range(self.n_clusters):
            C[k] = mean(X[ind == k], axis=0)

    if self.debug:
        if isinstance(y, ndarray):
            print("Class membership counts:")
            print(unique(y, return_counts=True)[1])
        print("Cluster membership counts:")
        print(unique(ind, return_counts=True)[1])

    # Compute the WC_SSD
    self.WC_SSD = wc_ssd(X, C, ind)
    # Compute the SC
    self.SC = sc(X, C, ind)
    # Compute the NMI
    if isinstance(y, ndarray):
        self.NMI = nmi(y, ind)

    print("WC-SSD %.3f" % self.WC_SSD)
    print("SC %.3f" % self.SC)
    if isinstance(y, ndarray):
        print("NMI %.3f" % self.NMI)
    if self.debug:
        print("sklearn SC %.3f" % silhouette_score(X, ind))
        if isinstance(y, ndarray):
            print("sklearn NMI %.3f" % mutual_info_score(y, ind))
    return ind
def build_bayesian_network(df, k, epsilon):
    bn = []
    V = tuple()
    attributes = list(df.columns)
    x = np.random.choice(attributes)
    bn.append((x, list()))
    V += (x, )
    i = 1
    attributes.remove(x)
    n, d = df.shape
    S = (2 / n) * np.log2((n + 1) / 2) + (n - 1) / n * np.log2((n + 1) / (n - 2))
    delta = (d - 1) * S / epsilon
    while attributes:
        if i <= k:
            parent_combos = [V]
            i += 1
        else:
            parent_combos = combinations(V, k)
        omega = dict()
        for x in attributes:
            for combo in parent_combos:
                parent_values = [
                    '\t'.join(map(str, tuple(row)))
                    for index, row in df[[x for x in combo]].iterrows()
                ]
                mi_score = mutual_info_score(parent_values, df[x])
                omega[(x, combo)] = mi_score
        omega_private = {
            key: np.exp(omega[key] / (2 * delta))
            for key in omega
        }
        total_omega_private = sum(omega_private.values())
        omega_private = {
            key: value / total_omega_private
            for key, value in omega_private.items()
        }
        keys = list(omega_private.keys())
        max_pair_index = np.random.choice(list(range(len(keys))), p=list(omega_private.values()))
        max_pair = keys[max_pair_index]
        # max_pair = max(omega, key=omega.get)  # NOTE: NOT PRIVATE RIGHT NOW
        # print(max_pair)
        bn.append(max_pair)
        V += (max_pair[0], )
        # Need to remove whatever we added as we go
        attributes.remove(max_pair[0])
    return bn
def get_metrics(test_image, true_image, top_k, total_size):
    """Computes multiple different metrics between two images.

    We compute a variety of metrics on the input image: we output L1 and L2
    distances, Wasserstein (earth movers) distance, hotspot count and f1 score
    for the provided TOP-K parameter, and an MSE error. For a correct
    comparison the images are scaled to the same size first, and then compared
    per coordinate.

    Args:
      test_image: obtained image on which to compute the metrics.
      true_image: original image to compare against the test_image.
      top_k: parameter to compute top-k hot spots.
      total_size: the size to scale the images to.

    Returns:
      l2 dist, hot spot counts, movers distance, f1-score, l1 dist, mutual info, MSE.
    """
    # normalize the input images
    test_image = normalize(rescale_image(test_image, total_size))
    true_image = normalize(rescale_image(true_image, total_size))

    top_k_test, top_k_test_arr = largest_indices(test_image, top_k)
    top_k_true, top_k_true_arr = largest_indices(true_image, top_k)

    l1_distance = np.linalg.norm(true_image - test_image, ord=1)
    l2_distance = np.linalg.norm(true_image - test_image, ord=2)
    mse = mt.mean_squared_error(test_image, true_image)
    top_k_diff = len(top_k_true.intersection(top_k_test))
    wasserstein = stats.wasserstein_distance(test_image.reshape(-1), true_image.reshape(-1))
    f1 = mt.f1_score(top_k_true_arr.reshape(-1), top_k_test_arr.reshape(-1))
    mutual = mt.mutual_info_score(true_image.reshape(-1), test_image.reshape(-1))
    metrics = Metrics(l1_distance=l1_distance,
                      l2_distance=l2_distance,
                      mse=mse,
                      f1=f1,
                      wasserstein=wasserstein,
                      hotspots_count=top_k_diff,
                      mutual_info=mutual)
    return metrics
def variation_of_information(lab1, lab2):
    import sklearn.metrics as mt
    import numpy as np

    def entropy(lab):
        res = 0
        from collections import Counter
        repetitions = Counter(lab)
        for cluster in repetitions.keys():
            p = repetitions[cluster] / len(lab)
            res += -p * np.log(p)
        return res

    return entropy(lab1) + entropy(lab2) - 2 * mt.mutual_info_score(lab1, lab2)
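# Usage sketch (an added illustration, not from the original source): the variation of
# information is zero for identical partitions (up to relabelling) and positive otherwise.
_ref = [0, 0, 1, 1, 2, 2]
_relabel = [1, 1, 2, 2, 0, 0]   # same partition, different label names
_split = [0, 1, 1, 2, 2, 3]
print(variation_of_information(_ref, _relabel))  # ~0.0
print(variation_of_information(_ref, _split))    # > 0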
def mutual_information(x, y, nbins=32, normalized=False):
    """
    Compute mutual information

    :param x: 1D numpy.array : flatten data from an image
    :param y: 1D numpy.array : flatten data from an image
    :param nbins: number of bins to compute the contingency matrix (only used if normalized=False)
    :return: float non negative value : mutual information
    """
    from sklearn.metrics import normalized_mutual_info_score, mutual_info_score
    if normalized:
        mi = normalized_mutual_info_score(x, y)
    else:
        c_xy = np.histogram2d(x, y, nbins)[0]
        mi = mutual_info_score(None, None, contingency=c_xy)
        # mi = adjusted_mutual_info_score(None, None, contingency=c_xy)
    return mi
def calc_mutual_information(x, y, bins):
    try:
        if bins == -1:
            bins = doane_bin(x)
        if bins == np.inf:
            bins = sturges_bin(x)
    except ValueError:
        bins = 10.0
    # print("bins", bins)
    try:
        c_xy = np.histogram2d(x, y, bins)[0]
        mi = metrics.mutual_info_score(None, None, contingency=c_xy)
        # print("success")
    except Exception as e:
        print("error with mi calc", str(e))
        mi = 0
    return mi
def build_chow_liu_tree(df, abs_weight=True):
    """
    Build a Chow-Liu tree from the data, X. n is the number of features.
    The weight on each edge is the negative of the mutual information
    between those features. The tree is returned as a networkx object.
    """
    G = nx.Graph()
    for u in df.columns:
        G.add_node(u)
        for v in df.columns:
            G.add_edge(u, v, weight=-mutual_info_score(df[u], df[v]))
    T = nx.minimum_spanning_tree(G)
    if abs_weight:
        for u, v, d in T.edges(data=True):
            T[u][v]['weight'] = abs(d['weight'])
    return T
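# Usage sketch (an added illustration, not from the original source): build the tree for a
# tiny categorical DataFrame. Assumes pandas, networkx and sklearn are imported as the
# function above requires; _df_demo is a made-up example frame.
import pandas as pd
_df_demo = pd.DataFrame({
    'a': [0, 0, 1, 1, 0, 1, 0, 1],
    'b': [0, 0, 1, 1, 0, 1, 1, 0],   # mostly follows 'a'
    'c': [1, 0, 1, 0, 0, 1, 0, 1],   # roughly independent
})
_T = build_chow_liu_tree(_df_demo)
print(sorted(_T.edges(data=True)))   # two edges connecting the three columns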
def calc_MI(x, y, bins):
    """
    Calculate the mutual information between two distributions.

    Args:
        x (float array): true distribution
        y (float array): predicted distribution
        bins (int): number of bins to use in building a histogram of x and y

    Returns:
        float: mutual information between x and y
    """
    c_xy = np.histogram2d(x, y, bins)[0]
    mi = mutual_info_score(None, None, contingency=c_xy)
    return mi
def test_mutualInformation():
    # This test verifies that the getMutualInfo() function returns correct values,
    # comparing them with the ones computed by the sklearn library
    cardinalities = [2, 2, 2, 3]
    population = [[1, 0, 1, 0],
                  [1, 0, 1, 0],
                  [1, 0, 1, 0],
                  [0, 1, 1, 1],
                  [0, 1, 0, 1],
                  [1, 1, 1, 1],
                  [0, 0, 1, 0],
                  [1, 1, 1, 1]]
    ad.init(0.0, len(population[0]), population, cardinalities)
    ad.addFrequenciestoMatrix(population)
    print("Mutual information with our algorithm: " + str(ad.getMutualInfo(1, 2)))
    print("Mutual information with SKLearn method: " + str(
        mutual_info_score(np.transpose(population)[1], np.transpose(population)[2])))
    print("\n")
def mutual_information(labels_true, labels_pred):
    """Mutual information of distributions in format of pd.Series or pd.DataFrame.

    Args:
        labels_true: Series or DataFrame
        labels_pred: Series or DataFrame
    """
    if isinstance(labels_true, pd.DataFrame):
        labels_true = labels_true.astype(str).apply(lambda x: ' '.join(x.tolist()), axis=1)
    if isinstance(labels_pred, pd.DataFrame):
        labels_pred = labels_pred.astype(str).apply(lambda x: ' '.join(x.tolist()), axis=1)

    assert isinstance(labels_true, pd.Series)
    assert isinstance(labels_pred, pd.Series)

    return mutual_info_score(labels_true.astype(str), labels_pred.astype(str))
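# Usage sketch (an added illustration, not from the original source): works for both
# Series and DataFrame inputs, since DataFrames are collapsed to one joint string label
# per row. Assumes pandas and mutual_info_score are imported as above.
import pandas as pd
_true = pd.DataFrame({'a': [0, 0, 1, 1], 'b': ['x', 'x', 'y', 'y']})
_pred = pd.Series([1, 1, 0, 0])
print(mutual_information(_true, _pred))   # MI between the joint row label and the prediction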
def image_compare_mutualInfo(img1, img2):
    '''
    mutual info similarity
    :param img1: image path
    :param img2: image path
    :return: mutual info similarity value
    '''
    img1 = imread(img1)
    img2 = imread(img2)
    img2 = np.resize(img2, (img1.shape[0], img1.shape[1], img1.shape[2]))
    img1 = np.reshape(img1, -1)
    img2 = np.reshape(img2, -1)
    mutual_infor = mr.mutual_info_score(img1, img2)
    return mutual_infor
def parse_seq(filename):
    seqs = utils.read_file(filename)
    X = [seq_i.split("#")[0] for seq_i in seqs]
    X = [count(x_i) for x_i in X]
    # X = [to_ngram(x_i) for x_i in X]
    y = [seq_i.split("#")[1] for seq_i in seqs]
    y = [int(y_i) for y_i in y]
    n_cats, cat_names = find_n_cats(X)
    np_hist = create_histogram(X, n_cats, cat_names)
    indicators = [get_indicator(i, np_hist) for i in range(0, n_cats)]
    cats = [get_category(i, y) for i in range(0, n_cats)]
    entropy_matrix = np.zeros((len(cats), len(indicators)))
    for i, cat_i in enumerate(cats):
        for j, indic_j in enumerate(indicators):
            entropy_matrix[i][j] = metrics.mutual_info_score(cat_i, indic_j)
    print(entropy_matrix)
def sort_pivots(dict_words, data, labels):
    print("Sorting potential pivots...")
    sorted_pivots = []
    num_features = data.shape[1]
    info_scores = []
    for i in range(num_features):
        info = mutual_info_score(data[:, i], labels)
        info_scores.append(info)
    info_scores_sorted = sorted(range(len(info_scores)),
                                key=lambda i: info_scores[i],
                                reverse=True)
    for i in range(num_features):
        sorted_pivots.append(
            dict_words.get_feature_names()[info_scores_sorted[i]])
    return sorted_pivots
def main():
    args = parse_args()
    my_data = np.genfromtxt(args.file, delimiter=',', skip_header=1, usecols=(2, 3, 4, 5, 6, 7), dtype=int)
    data_true = np.genfromtxt(args.file, delimiter=',', skip_header=1, usecols=(1), dtype=int)

    # mean and std deviation for lengths
    mean_l, std_div_l = min_max_lenght(my_data)

    lables = DBSCAN(my_data, args.minPts, args.eps)
    eps = args.eps
    minPts = args.minPts
    max_noise_level = 0.5

    lables_without_noise = lables[lables != -1]
    my_data_without_noise = my_data[lables != -1]
    silhouette = silhouette_score(my_data_without_noise, lables_without_noise)

    # trying to find best fitting eps and minPts, using silhouette score
    for eps_cur in np.linspace(mean_l - 2 * std_div_l, mean_l + 2 * std_div_l, args.num_stad):
        for minPts_cur in np.arange(args.min, args.max, 1):
            cur_lables = DBSCAN(my_data, minPts_cur, eps_cur)
            my_data_without_noise = my_data[cur_lables != -1]
            lables_without_noise = cur_lables[cur_lables != -1]
            cur_silhouette = silhouette_score(my_data_without_noise, lables_without_noise)
            if cur_silhouette > silhouette and 1.0 * (cur_lables.size - lables_without_noise.size) / cur_lables.size < max_noise_level:
                lables = cur_lables
                silhouette = cur_silhouette
                eps = eps_cur
                minPts = minPts_cur

    # our data without noise
    lables_without_noise = lables[lables != -1]
    my_data_without_noise = my_data[lables != -1]
    data_true_without_noise = data_true[lables != -1]

    print("eps = ", eps, ", minPts = ", minPts)
    print("silhouette = ", silhouette)
    print("number of noise = ", 1.0 * (lables.size - lables_without_noise.size) / lables.size)
    print("number of clasters = ", np.unique(lables_without_noise).size)
    print("purity = ", purity(lables_without_noise, data_true_without_noise))
    print("rand index = ", rand_index(lables_without_noise, data_true_without_noise))
    print("mutual info = ", mutual_info_score(lables_without_noise, data_true_without_noise))
def normalized_mutual_information_skl(self, fa, fb, bins=10):
    # NMI(A,B) := I(A;B) / H(A,B)
    if self.project(fa).entropy() == 0.0:
        return 0.0
    if self.project(fb).entropy() == 0.0:
        return 0.0
    # floating point issues
    distAB = self.project(lambda x: (fa(x), fb(x)))
    dbA = [val[0] for (val, prob) in distAB]
    dbB = [val[1] for (val, prob) in distAB]
    prob = [prob for (val, prob) in distAB]

    from sklearn.metrics import mutual_info_score
    c_xy = np.histogram2d(dbA, dbB, bins, weights=prob)[0]
    mi = mutual_info_score(None, None, contingency=c_xy)
    return mi
def compute_mutual_information(prediction, target):
    """Calculates mutual information between target and prediction

    Parameters
    ----------
    prediction : torch.Tensor
        Predicted image
    target : torch.Tensor
        Target image
    """
    from sklearn.metrics import mutual_info_score
    # `normed` was removed from np.histogram2d; `density` is the current keyword
    p_xy, _, _ = np.histogram2d(prediction.data.cpu().numpy().flatten(),
                                target.data.cpu().numpy().flatten(),
                                bins=256,
                                range=((0, 1), (0, 1)),
                                density=True)
    return mutual_info_score(None, None, contingency=p_xy)
def mutual_information(self, position1, position2, family=False, normalized=False, adjusted=False):
    labels_position1 = self.data_row(position1, family)
    labels_position2 = self.data_row(position2, family)
    if normalized:
        MI = metrics.normalized_mutual_info_score(labels_position1, labels_position2)
    elif adjusted:
        MI = metrics.adjusted_mutual_info_score(labels_position1, labels_position2)
    else:
        MI = metrics.mutual_info_score(labels_position1, labels_position2)
    return MI
def compute_MI(x, y, bins):
    """
    Compute mutual information between two vectors given custom bins.

    Parameters
    ----------
    x, y : array, 1D
        Signals to compute mutual information between.
    bins : integer or array, 1D
        Number of bins (if integer) or bin edges (if array) for 2D histogram.

    Returns
    -------
    MI : float
        Mutual information estimate.
    """
    c_xy = np.histogram2d(x, y, bins)[0]
    return mutual_info_score(None, None, contingency=c_xy)
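# Usage sketch (an added illustration, not from the original source): bins may be an
# integer count or explicit bin edges, mirroring np.histogram2d. Assumes numpy and
# mutual_info_score are imported as above.
import numpy as np
_rng = np.random.default_rng(3)
_sig_a = _rng.normal(size=500)
_sig_b = -_sig_a + 0.3 * _rng.normal(size=500)
print(compute_MI(_sig_a, _sig_b, bins=16))                       # integer bin count
print(compute_MI(_sig_a, _sig_b, bins=np.linspace(-5, 5, 21)))   # explicit shared bin edges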
def evaluatePerSample(self, sampleNo):
    AMI = metrics.adjusted_mutual_info_score(self.labelsTrue, self.labelsPred)
    NMI = metrics.normalized_mutual_info_score(self.labelsTrue, self.labelsPred)
    MI = metrics.mutual_info_score(self.labelsTrue, self.labelsPred)
    ARI = metrics.adjusted_rand_score(self.labelsTrue, self.labelsPred)
    homogeneity = metrics.homogeneity_score(self.labelsTrue, self.labelsPred)
    completeness = metrics.completeness_score(self.labelsTrue, self.labelsPred)
    V = metrics.v_measure_score(self.labelsTrue, self.labelsPred)
    # SC = metrics.silhouette_score(self.X, self.labelsPred, metric='sqeuclidean')  # Silhouette Coefficient

    self.AMIList.append(AMI)
    self.NMIList.append(NMI)
    self.MIList.append(MI)
    self.ARIList.append(ARI)
    self.homogeneityList.append(homogeneity)
    self.completenessList.append(completeness)
    self.VList.append(V)
def get_clustering_scoring(y, y_pred):
    scoring = {}

    if y is None:
        return scoring

    try:
        scoring['adjusted_mutual_info_score'] = \
            metrics.adjusted_mutual_info_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['adjusted_rand_score'] = \
            metrics.adjusted_rand_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['completeness_score'] = \
            metrics.completeness_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['fowlkes_mallows_score'] = \
            metrics.fowlkes_mallows_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['homogeneity_score'] = \
            metrics.homogeneity_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['mutual_info_score'] = \
            metrics.mutual_info_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['normalized_mutual_info_score'] = \
            metrics.normalized_mutual_info_score(y, y_pred)
    except Exception:
        pass
    try:
        scoring['v_measure_score'] = \
            metrics.v_measure_score(y, y_pred)
    except Exception:
        pass

    return scoring
def __init__(self, X, Y, method="ICAP"): """ This class provides easy access to mutual information based filter feature selection. The default mutual information estimation algorithm used is the histogram binning method. If a more sophisticated approach is required, use the change_MI_estimator function to apply your own method. :param X: (n_samples, n_features) numpy array containing the training data :param Y: (n_samples) numpy array containing target labels :param method: filter criterion that will be applied to select the features. Available criteria are: (as string) "CIFE" [Lin1996], "ICAP" [Jakulin2005], "CMIM" [Fleuret2004], "JMI"[Yang1999] """ if X.shape[0] != len(Y): raise ValueError("X must have as many samples as there are labels in Y") self._n_features = X.shape[1] def normalize_data_for_MI(X): for i in range(X.shape[1]): std = X[:, i].std() if std != 0.: X[:, i] /= std X[:, i] -= X[:, i].min() return np.floor(X).astype("int") self._X = normalize_data_for_MI(np.asarray(X)) self._Y = np.asarray(Y) self._method_str = method self._methods = { "CIFE": self.__J_CIFE, "ICAP": self.__J_ICAP, "CMIM": self.__J_CMIM, "JMI": self.__J_JMI, "mRMR": self.__J_mRMR, "MIFS": self.__J_MIFS } self._filter_criterion_kwargs = {} self.change_method(method) self._method = self._methods[method] self._mutual_information_estimator = lambda X1, X2: mutual_info_score(X1,X2)/np.log(2.0) self._redundancy = np.zeros((self._n_features, self._n_features)) - 1. self._relevancy = np.zeros((self._n_features)) - 1 self._class_cond_red = np.zeros((self._n_features, self._n_features)) - 1 self._class_cond_mi_method = self._calculate_class_conditional_MI
def mutual_information(self, max_lag):
    """
    Uses scikit-learn's mutual information on a digitized copy of the series
    """
    digi = utilities.mi_digitize(self.X)
    mi = np.empty(max_lag)

    for i in range(max_lag):
        ind = i + 1
        unshift = digi[ind:]
        shift = digi[0:-ind]
        mi[i] = skmetrics.mutual_info_score(unshift, shift)

    return mi
def algorithm(X_train, X_test, y_train, y_test):
    clf_dt = tree.DecisionTreeClassifier()
    clf_dt.fit(X_train, y_train)
    predicted_y = clf_dt.predict(X_test)
    clf_dt.score(X_test, y_test)
    mutual_score = mutual_info_score(y_test, predicted_y)
    roc_score2 = roc_auc_score(y_test, predicted_y)
    X_train['Unnamed: 5'] = X_train['Unnamed: 5'].astype(float)
    X_train['Unnamed: 6'] = X_train['Unnamed: 6'].astype(float)
    X_test['Unnamed: 5'] = X_test['Unnamed: 5'].astype(float)
    X_test['Unnamed: 6'] = X_test['Unnamed: 6'].astype(float)
    print('mutual score = ', mutual_score)
    print('roc_auc_score = ', roc_score2)
    return mutual_score, roc_score2
def _mutual_info_compute(when, then, df):
    """
    Computes the metric results.

    :param when:
    :type when: str/int
    :param then:
    :type then: str/int
    :param df:
    :type df: DataFrame
    :return: Result of the metric.
    """
    index = (df[when].isna()) | (df[then].isna())
    index = ~index
    if sum(index) > 0:
        return mutual_info_score(df[when][index], df[then][index])
    else:
        return 0
def find_tau(data):
    data = data.reshape((len(data), 1))
    kbins = KBinsDiscretizer(n_bins=1000, encode='ordinal', strategy='uniform')
    data_trans = kbins.fit_transform(data)

    # find usable time delay via mutual information
    tau_max = 50
    mis = []
    data = data_trans.reshape(len(data_trans))

    for tau in range(1, tau_max):
        unlagged = data[:-tau]
        lagged = np.roll(data, -tau)[:-tau]
        mis.append(metrics.mutual_info_score(lagged, unlagged))
        if len(mis) > 1 and mis[-2] < mis[-1]:  # return first local minimum
            tau -= 1
            return tau
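# Usage sketch (an added illustration, not from the original source): estimate an embedding
# delay for a noisy sine wave. Assumes numpy, sklearn.preprocessing.KBinsDiscretizer and
# sklearn.metrics are imported as the function above requires.
import numpy as np
_t = np.linspace(0, 100 * np.pi, 2000)
_series = np.sin(_t) + 0.05 * np.random.default_rng(4).standard_normal(_t.size)
print(find_tau(_series))   # lag of the first local minimum of the lagged mutual information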
def select_using_MI(features, labels, threshold=0.01, ham_label=-1):
    '''
    Returns indices of the most salient features for the ham class, using
    a mutual information score between feature values and class label, and
    from the highest scoring, filtering the ones that are most present
    in spam relatively. This makes sense since we then use these indices
    to choose which features to turn on in emails

    TODO or I could keep all the highest MI score features, and
         if more present in ham, I set it to 1, else I set it to zero
         requires extra array somewhere though
         Thinking about it, this is already what is happening since the ham
         malicious instances already have all their features set to zero
         (but this proportion of features controlled by the attacker could
         vary ? depending on attacker's dataset knowledge ?
         ie. malicious instances' feature values could be initialised
         randomly, or drawn from a spammy distribution to mimick an email
         that still has a malicious potential (although this isn't necessary
         since we are doing a poison, not evasion attack); then compare this
         to initialised with 0s or with 1s
    '''
    X, Y = features, np.ravel(labels)
    N, D = X.shape
    d = int(D * threshold)  ## keep top d features with highest score
    tls.logger.info('Keep top %s features' % d)

    ## calculate frequency of feature presence relative to each class
    ham_freq  = np.mean(X[np.ravel(Y == ham_label)], axis=0)
    spam_freq = np.mean(X[np.ravel(Y != ham_label)], axis=0)
    tls.logger.debug('- feature frequency in ham class: %s' % ham_freq)
    tls.logger.debug('- feature frequency in spam class: %s' % spam_freq)

    ## calculate mutual information between features and labels
    MI_per_feature = (mutual_info_score(X[:, f], Y) for f in range(D))
    MI_per_feature = np.fromiter(MI_per_feature, dtype=np.float16)
    tls.logger.debug('- mutual information scores: %s' % MI_per_feature)

    ## keep only salient features for ham (according to relative presence in that class)
    MI_per_feature[ham_freq < spam_freq] = 0
    salient_indices = np.argpartition(MI_per_feature, -d)[-d:]
    ## ^ https://stackoverflow.com/questions/10337533/a-fast-way-to-find-the-largest-n-elements-in-an-numpy-array/20177786#20177786

    return salient_indices
def calc_MI(x, y, bins=[25, 25], maxvalues=41):
    "Calculates the mutual information between two variables."
    # copy the bin list so the (mutable) default argument is never modified in place
    bins = list(bins)
    # maxvalues determines the cutoff between real and discrete
    # for discrete information it makes sense to use the number of unique values as the number of bins
    # if there are only two classes we want to use two bins not say 25 bins
    num_unique_values_x = pd.value_counts(x).size
    num_unique_values_y = pd.value_counts(y).size
    if num_unique_values_x < maxvalues:
        bins[0] = num_unique_values_x
    if num_unique_values_y < maxvalues:
        bins[1] = num_unique_values_y
    # create the 2d histogram needed to calculate the mutual information via sklearn
    # [0] gets the histogram, [1] & [2] are binning info.
    c_xy = np.histogram2d(x, y, bins=bins)[0]
    # use scikit learn to calculate the mi value
    mi = mutual_info_score(None, None, contingency=c_xy)
    return mi
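# Usage sketch (an added illustration, not from the original source): with a binary column
# the bin count collapses to the number of distinct values instead of the default 25.
# Assumes numpy, pandas and mutual_info_score are imported as above.
import numpy as np
import pandas as pd
_rng = np.random.default_rng(5)
_cls = _rng.integers(0, 2, size=1000)                  # binary, so 2 bins are used for x
_score = _cls + 0.3 * _rng.standard_normal(1000)       # continuous, keeps 25 bins for y
print(calc_MI(_cls, _score))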