Esempio n. 1
0
    def test_cluster_threshold(self):
        """Average gap between Jaccard similarity and split threshold is small.

        For each random pair of sets, thresholds 0.01, 0.06, ..., 0.96 are
        tried in increasing order.  The first threshold at which the pair ends
        up in two separate clusters is compared with the pair's Jaccard
        similarity.  A pair that never splits adds nothing to the accumulated
        error.  The mean over all trials must not exceed 0.20.
        """
        trials = 50
        dim = 15
        max_avg_error = 0.20
        accumulated = 0

        for _ in range(trials):
            # Draw a random pair and measure its true similarity
            first, second = randset(), randset()
            similarity = jaccard_sim(first, second)

            # Scan thresholds upward until the pair no longer shares a cluster
            for percent in range(1, 100, 5):
                threshold = float(percent) / 100
                bandwidth = get_bandwidth(dim, threshold)
                # Width is rounded down to a whole number of bands
                band_count = int(dim / bandwidth)
                clusterer = Cluster(width=band_count * bandwidth,
                                    bandwidth=bandwidth)
                for member in (first, second):
                    clusterer.add_set(member)
                if len(clusterer.get_clusters()) != 2:
                    continue
                accumulated += abs(similarity - threshold)
                break

        self.assertLessEqual(float(accumulated) / trials, max_avg_error,
                             "Error too large")
Esempio n. 2
0
    def test_cluster_threshold(self):
        """Average gap between Jaccard similarity and split threshold is small.

        For each random pair of sets, thresholds 0.01, 0.06, ..., 0.96 are
        tried in increasing order.  The first threshold at which the pair ends
        up in two separate clusters is compared with the pair's Jaccard
        similarity.  A pair that never splits adds nothing to the accumulated
        error.  The mean over all trials must not exceed 0.20.
        """
        trials = 50
        dim = 15
        max_avg_error = 0.20
        accumulated = 0

        for _ in range(trials):
            # Draw a random pair and measure its true similarity
            first, second = randset(), randset()
            similarity = jaccard_sim(first, second)

            # Scan thresholds upward until the pair no longer shares a cluster
            for percent in range(1, 100, 5):
                threshold = float(percent) / 100
                bandwidth = get_bandwidth(dim, threshold)
                # Width is rounded down to a whole number of bands
                band_count = int(dim / bandwidth)
                clusterer = Cluster(width=band_count * bandwidth,
                                    bandwidth=bandwidth)
                for member in (first, second):
                    clusterer.add_item(member)
                if len(clusterer.get_clusters()) != 2:
                    continue
                accumulated += abs(similarity - threshold)
                break

        self.assertLessEqual(float(accumulated) / trials, max_avg_error,
                             "Error too large")
Esempio n. 3
0
    def test_signature_similarity(self):
        """Signature similarity should approximate the Jaccard similarity.

        The property under test: the probability that two sets' signatures
        match at a given index equals the Jaccard similarity of the sets.
        The mean absolute difference between ``jaccard_sim`` of the sets and
        ``sigsim`` of their signatures, over 100 random pairs, must be at
        most 0.1.
        """
        trials = 100
        tolerance = 1.0 / 10  # Expected error is O(1/sqrt(dim))
        hasher = MinHashSignature(10 * 10)
        total_gap = 0.0

        for _ in range(trials):
            # Random pair of sets and the signatures derived from them
            pair = (randset(), randset())
            signatures = map(hasher.get_signature, pair)

            # Exact similarity versus the signature-based estimate
            exact = jaccard_sim(*pair)
            estimate = sigsim(*signatures, dim=100)

            total_gap += abs(exact - estimate)

        # Averaged over many trials the gap should stay under the bound
        avg_err = total_gap / trials
        failure_note = "Accuracy test failed. (avg error: %f)" % avg_err
        self.assertGreaterEqual(tolerance, avg_err, msg=failure_note)
Esempio n. 4
0
    def test_signature_similarity(self):
        """Signature similarity should track the true Jaccard similarity.

        The probability that two sets' signatures match at a given index is
        expected to equal the Jaccard similarity between the two sets, so
        the average absolute difference between ``jaccard_sim`` of the sets
        and ``sigsim`` of their signatures must not exceed
        ``expected_error``.
        """
        n_tests = 100
        dim = 10 * 10  # Signature length; shared by the hasher and sigsim
        expected_error = 1.0 / 10  # Expected error is O(1/sqrt(dim))
        mh = MinHashSignature(dim)
        err = 0.0

        # ``range`` rather than ``xrange``: the latter does not exist on
        # Python 3, and for 100 iterations both behave the same on Python 2.
        for _ in range(n_tests):
            # Create random sets and their signatures
            sets = (randset(), randset())
            sigs = map(mh.get_signature, sets)

            # Calculate true Jaccard similarity, and sim of signatures
            jsim = jaccard_sim(*sets)
            ssim = sigsim(*sigs, dim=dim)

            # Accumulate error
            err += abs(jsim - ssim)

        # Over n_tests large, we should be within upper bound of expected error
        avg_err = err / n_tests
        self.assertGreaterEqual(expected_error,
                                avg_err,
                                msg="Accuracy test failed. (avg error: %f)" %
                                avg_err)
Esempio n. 5
0
 def test_same_set(self):
     """Adding one set twice must leave exactly one cluster."""
     sample = randset()
     clusterer = Cluster(width=10, bandwidth=2)
     # Insert the very same set two times
     for _ in range(2):
         clusterer.add_set(sample)
     cluster_count = len(clusterer.get_clusters())
     self.assertEqual(cluster_count, 1)
Esempio n. 6
0
 def test_same_set(self):
     """Adding one item twice must leave exactly one cluster."""
     sample = randset()
     clusterer = Cluster(width=10, bandwidth=2)
     # Insert the very same item two times
     for _ in range(2):
         clusterer.add_item(sample)
     cluster_count = len(clusterer.get_clusters())
     self.assertEqual(cluster_count, 1)
Esempio n. 7
0
 def test_consistent_signature(self):
     """Hashing the same set twice must give equal signatures."""
     hasher = MinHashSignature(10 * 10)
     sample = randset()
     first_pass = hasher.get_signature(sample)
     second_pass = hasher.get_signature(sample)
     self.assertEqual(first_pass, second_pass)
Esempio n. 8
0
 def test_signature_length(self):
     """A hasher built with 100 must produce signatures of length 100."""
     hasher = MinHashSignature(10 * 10)
     signature = hasher.get_signature(randset())
     self.assertEqual(100, len(signature))
Esempio n. 9
0
 def test_consistent_signature(self):
     """Two signatures of one set from the same hasher must be equal."""
     hasher = MinHashSignature(10 * 10)
     sample = randset()
     first_pass = hasher.get_signature(sample)
     second_pass = hasher.get_signature(sample)
     self.assertEqual(first_pass, second_pass)
Esempio n. 10
0
 def test_signature_length(self):
     """Signature length must equal the 100 passed to the hasher."""
     hasher = MinHashSignature(10 * 10)
     signature = hasher.get_signature(randset())
     self.assertEqual(100, len(signature))