def test_cluster_threshold(self):
    """Mean gap between Jaccard similarity and split threshold is bounded.

    For each of ``n_tests`` random set pairs, thresholds 0.01, 0.06, ...,
    0.96 are tried in increasing order, each with a freshly built
    ``Cluster``.  At the first threshold for which ``get_clusters()``
    reports two clusters, ``abs(jsim - threshold)`` is added to the
    running error.  The average over ``n_tests`` must not exceed
    ``expected_error``.
    """
    n_tests = 50
    dim = 15
    expected_error = 0.20
    tot_err = 0

    for _ in range(n_tests):
        # Draw a random pair and record its true similarity
        first, second = randset(), randset()
        jsim = jaccard_sim(first, second)

        # Sweep thresholds upward until the pair lands in two clusters
        for percent in range(1, 100, 5):
            threshold = float(percent) / 100
            # presumably the band width matching this threshold for `dim`
            # hashes -- confirm against get_bandwidth
            bandwidth = get_bandwidth(dim, threshold)
            num_bands = int(dim / bandwidth)
            # width is a whole multiple of bandwidth by construction
            cluster = Cluster(width=num_bands * bandwidth,
                              bandwidth=bandwidth)
            cluster.add_set(first)
            cluster.add_set(second)

            if len(cluster.get_clusters()) != 2:
                continue
            tot_err += abs(jsim - threshold)
            break
        # NOTE: a pair that never yields two clusters adds nothing to
        # tot_err, yet is still counted in the n_tests divisor below.

    avg_err = float(tot_err) / n_tests
    self.assertLessEqual(avg_err, expected_error, "Error too large")
def test_cluster_threshold(self):
    """Average |similarity - split threshold| should stay small.

    Runs ``n_tests`` trials.  Each trial draws two random sets, then
    walks thresholds 0.01, 0.06, ..., 0.96, building a new ``Cluster``
    for every threshold and adding both sets to it.  The first threshold
    at which ``get_clusters()`` has length two contributes
    ``abs(jsim - threshold)`` to the total; the mean over ``n_tests`` is
    asserted to be at most ``expected_error``.
    """
    n_tests = 50
    dim = 15
    expected_error = 0.20

    tot_err = 0
    for _ in range(n_tests):
        # Random pair plus its exact Jaccard similarity
        pair = (randset(), randset())
        jsim = jaccard_sim(*pair)

        # Try increasing thresholds; stop at the first one that yields
        # two clusters for the pair
        for step in range(1, 100, 5):
            threshold = float(step) / 100
            # presumably derives the band size for this threshold --
            # confirm against get_bandwidth
            bandwidth = get_bandwidth(dim, threshold)
            num_bands = int(dim / bandwidth)
            width = num_bands * bandwidth  # whole number of bands

            cluster = Cluster(width=width, bandwidth=bandwidth)
            for member in pair:
                cluster.add_item(member)

            split = len(cluster.get_clusters()) == 2
            if split:
                tot_err += abs(jsim - threshold)
                break
        # NOTE: if no threshold yields two clusters, this trial adds zero
        # error but still counts toward the average.

    avg_err = float(tot_err) / n_tests
    self.assertLessEqual(avg_err, expected_error, "Error too large")
def test_signature_similarity(self):
    """Signature similarity should approximate Jaccard similarity.

    Over ``n_tests`` random set pairs, the absolute difference between
    ``jaccard_sim`` of the sets and ``sigsim`` of their 100-entry
    signatures is averaged; the average must not exceed
    ``expected_error``.
    """
    n_tests = 100
    dim = 100
    expected_error = 1.0 / 10  # Expected error is O(1/sqrt(dim))
    mh = MinHashSignature(dim)

    err = 0.0
    for _ in range(n_tests):
        # Two random sets and the signatures derived from them
        set_a, set_b = randset(), randset()
        sigs = map(mh.get_signature, (set_a, set_b))

        # Exact similarity vs. similarity measured on the signatures
        jsim = jaccard_sim(set_a, set_b)
        ssim = sigsim(*sigs, dim=dim)

        err += abs(jsim - ssim)

    # Averaged over many trials the error should sit under the bound
    avg_err = err / n_tests
    self.assertGreaterEqual(
        expected_error,
        avg_err,
        msg="Accuracy test failed. (avg error: %f)" % avg_err,
    )
def test_signature_similarity(self):
    """Signature agreement should approximate Jaccard similarity.

    The probability that two sets' signatures match at a given index is
    expected to equal the Jaccard similarity of the two sets.  This test
    draws ``n_tests`` random set pairs, compares ``jaccard_sim`` of each
    pair with ``sigsim`` of their signatures, and asserts that the mean
    absolute difference does not exceed ``expected_error``.
    """
    n_tests = 100
    expected_error = 1.0 / 10  # Expected error is O(1/sqrt(dim))
    mh = MinHashSignature(10 * 10)
    err = 0.0

    # `range` rather than `xrange`: `xrange` does not exist on Python 3,
    # and `range` behaves the same here on Python 2 for 100 iterations.
    for _ in range(n_tests):
        # Create random sets and their signatures
        sets = (randset(), randset())
        sigs = map(mh.get_signature, sets)

        # Calculate true Jaccard similarity, and similarity of signatures
        jsim = jaccard_sim(*sets)
        ssim = sigsim(*sigs, dim=100)

        # Accumulate error
        err += abs(jsim - ssim)

    # Over a large n_tests, we should be within the expected error bound
    avg_err = err / n_tests
    self.assertGreaterEqual(
        expected_error,
        avg_err,
        msg="Accuracy test failed. (avg error: %f)" % avg_err)
def test_same_set(self):
    """Adding one set twice must produce exactly one cluster."""
    sample = randset()
    cluster = Cluster(width=10, bandwidth=2)

    # Insert the identical set two times
    for _ in range(2):
        cluster.add_set(sample)

    clusters = cluster.get_clusters()
    self.assertEqual(len(clusters), 1)
def test_same_set(self):
    """The same set added twice should end up in a single cluster."""
    sample = randset()
    cluster = Cluster(width=10, bandwidth=2)

    # Add the very same object twice
    for item in (sample, sample):
        cluster.add_item(item)

    cluster_count = len(cluster.get_clusters())
    self.assertEqual(cluster_count, 1)
def test_consistent_signature(self):
    """Hashing the same set twice must give equal signatures."""
    hasher = MinHashSignature(10 * 10)
    sample = randset()

    # Two independent calls on the same hasher and the same input
    first = hasher.get_signature(sample)
    second = hasher.get_signature(sample)

    self.assertEqual(first, second)
def test_signature_length(self):
    """A signature's length must equal the requested dimension (100)."""
    hasher = MinHashSignature(10 * 10)
    signature = hasher.get_signature(randset())

    self.assertEqual(100, len(signature))