def check_sample_int_distribution(sample_without_replacement): # This test is heavily inspired from test_random.py of python-core. # # For the entire allowable range of 0 <= k <= N, validate that # sample generates all possible permutations n_population = 10 # a large number of trials prevents false negatives without slowing normal # case n_trials = 10000 for n_samples in range(n_population): # Counting the number of combinations is not as good as counting the # the number of permutations. However, it works with sampling algorithm # that does not provide a random permutation of the subset of integer. n_expected = comb(n_population, n_samples, exact=True) output = {} for i in range(n_trials): output[frozenset( sample_without_replacement(n_population, n_samples))] = None if len(output) == n_expected: break else: raise AssertionError( "number of combinations != number of expected (%s != %s)" % (len(output), n_expected))
def check_sample_int_distribution(sample_without_replacement): # This test is heavily inspired from test_random.py of python-core. # # For the entire allowable range of 0 <= k <= N, validate that # sample generates all possible permutations n_population = 10 # a large number of trials prevents false negatives without slowing normal # case n_trials = 10000 for n_samples in range(n_population): # Counting the number of combinations is not as good as counting the # the number of permutations. However, it works with sampling algorithm # that does not provide a random permutation of the subset of integer. n_expected = comb(n_population, n_samples, exact=True) output = {} for i in range(n_trials): output[frozenset(sample_without_replacement(n_population, n_samples))] = None if len(output) == n_expected: break else: raise AssertionError( "number of combinations != number of expected (%s != %s)" % (len(output), n_expected))
def rand_index_score(labels_true, labels_pred): labels_true, labels_pred = check_clusterings(labels_true, labels_pred) n_samples = labels_true.shape[0] n_classes = np.unique(labels_true).shape[0] n_clusters = np.unique(labels_pred).shape[0] # Special limit cases: no clustering since the data is not split; # or trivial clustering where each document is assigned a unique cluster. # These are perfect matches hence return 1.0. if (n_classes == n_clusters == 1 or n_classes == n_clusters == 0 or n_classes == n_clusters == n_samples): return 1.0 # Compute the RI using the contingency data contingency = contingency_matrix(labels_true, labels_pred) n = np.sum(np.sum(contingency)) t1 = comb(n, 2) t2 = np.sum(np.sum(np.power(contingency, 2))) nis = np.sum(np.power(np.sum(contingency, 0), 2)) njs = np.sum(np.power(np.sum(contingency, 1), 2)) t3 = 0.5 * (nis + njs) A = t1 + t2 - t3 nc = (n * (n**2 + 1) - (n + 1) * nis - (n + 1) * njs + 2 * (nis * njs) / n) / (2 * (n - 1)) AR = (A - nc) / (t1 - nc) return A / t1
def binomial(k, n): return 0 if k < 0 or k > n else comb(int(n), int(k), exact=True)
def _comb2(n): # the exact version is faster for k == 2: use it by default globally in # this module instead of the float approximate variant return comb(n, 2, exact=1)