import numpy as np
# In scikit-learn < 0.22 this module is public; later releases renamed
# it to sklearn.metrics.cluster._supervised.
from sklearn.metrics.cluster import supervised


def adjustedMutualInformation(cluster1, cluster2):
    '''
    Using the scikit-learn algorithms, calculate the adjusted mutual
    information for two clusterings. Assume cluster1 is the
    reference/ground-truth clustering.

    The adjusted MI corrects for agreement that occurs by chance,
    particularly the inflation of MI when the number of clusters grows:

        AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [max(H(U), H(V)) - E(MI(U, V))]

    where E(MI(U, V)) is the expected mutual information given the
    number of clusters, and H(U), H(V) are the entropies of
    clusterings U and V.
    '''
    cont = contingency(cluster1, cluster2)
    mi = mutualInformation(cluster1, cluster2)
    sample_size = float(sum(len(members) for members in cluster1.values()))

    # Given the number of samples, what level of mutual information
    # would be expected by chance?
    emi = supervised.expected_mutual_information(cont, sample_size)

    # Calculate the entropy of each clustering. entropy() can return
    # -0.0 in some instances; normalize it to a true 0.0.
    h_clust1, h_clust2 = entropy(cluster1), entropy(cluster2)
    if h_clust1 == 0.0:
        h_clust1 = 0.0
    if h_clust2 == 0.0:
        h_clust2 = 0.0

    ami = (mi - emi) / (max(h_clust1, h_clust2) - emi)

    # A zero denominator yields NaN rather than 0.0; coerce it so
    # callers always receive a number.
    if np.isnan(ami):
        ami = np.nan_to_num(ami)
    return ami
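# A minimal usage sketch, not part of the original source: it assumes
# the module-local helpers contingency(), mutualInformation(), and
# entropy() are defined elsewhere in this module, and that a clustering
# is a dict mapping a cluster label to the list of member ids (the
# format implied by the sample_size computation above).
def _demo_adjusted_mutual_information():
    """Tiny smoke test for adjustedMutualInformation (hypothetical data)."""
    ground_truth = {'A': [1, 2, 3], 'B': [4, 5, 6]}
    predicted = {'x': [1, 2, 3], 'y': [4, 5, 6]}
    # Identical partitions should score 1.0 (assuming the module
    # helpers behave as their names suggest); independent partitions
    # should score near 0.0.
    print(adjustedMutualInformation(ground_truth, predicted))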
import numpy as np
from sklearn.metrics import mutual_info_score
from sklearn.metrics.cluster import expected_mutual_information
# Private sklearn helper (located here as of sklearn >= 0.22).
from sklearn.metrics.cluster._supervised import _generalized_average
# Assumption: the module's _entropy computes the Shannon entropy (in
# nats) of a frequency vector, which scipy's entropy provides.
from scipy.stats import entropy as _entropy


def _ami(ab_cts, average_method='arithmetic'):
    """Adjusted mutual information between two discrete categorical
    random variables based on counts observed and provided in ab_cts.

    Code adapted directly from the scikit-learn AMI implementation to
    accommodate having counts/a contingency table instead of
    rows/instances:
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_mutual_info_score.html

    Parameters
    ----------
    ab_cts : np.ndarray [len(a_classes) x len(b_classes)]
        Counts for each combination of classes in random variables
        a and b, organized in a rectangular array.
    average_method : str
        See the sklearn documentation for details.

    Returns
    -------
    ami : float
        Adjusted mutual information score for variables a and b."""
    # Marginal class frequencies for each variable.
    a_freq = np.sum(ab_cts, axis=1)
    a_freq = a_freq / np.sum(a_freq)
    b_freq = np.sum(ab_cts, axis=0)
    b_freq = b_freq / np.sum(b_freq)
    n_samples = np.sum(ab_cts)

    # Calculate the MI for the two clusterings; the contingency table
    # is a joint count distribution [a_classes x b_classes].
    mi = mutual_info_score(None, None, contingency=ab_cts)

    # Calculate the expected value of the mutual information.
    emi = expected_mutual_information(ab_cts, n_samples)

    # Calculate the entropy of each marginal distribution.
    h_true, h_pred = _entropy(a_freq), _entropy(b_freq)

    normalizer = _generalized_average(h_true, h_pred, average_method)
    denominator = normalizer - emi
    # Clamp the denominator away from zero (preserving its sign) to
    # avoid division-by-zero singularities.
    if denominator < 0:
        denominator = min(denominator, -np.finfo('float64').eps)
    else:
        denominator = max(denominator, np.finfo('float64').eps)
    ami = (mi - emi) / denominator
    return ami
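# A minimal cross-check, not part of the original source: it builds the
# contingency table with sklearn's public contingency_matrix() and
# verifies that _ami on counts matches adjusted_mutual_info_score on
# the raw labels (both default to average_method='arithmetic').
def _demo_ami_counts():
    """Compare _ami against sklearn on the same hypothetical labels."""
    from sklearn.metrics import adjusted_mutual_info_score
    from sklearn.metrics.cluster import contingency_matrix

    a = np.array([0, 0, 1, 1, 2, 2])  # hypothetical labels
    b = np.array([0, 0, 1, 1, 1, 2])
    ab_cts = contingency_matrix(a, b)
    # The two values should agree to floating-point precision.
    print(_ami(ab_cts))
    print(adjusted_mutual_info_score(a, b))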