def test_trigram(self):
        """Check parse ids, overlap and Jaccard in an ``NGramSpace(3)``.

        The two four-word sentences are expected to parse to ``[1, 2]``
        and ``[3, 4]``: no id is shared, so both similarity measures
        must be zero regardless of argument order.
        """
        space = NGramSpace(3)

        # Parse order matters: the expected ids below depend on the first
        # sentence being seen by the space before the second one.
        first = space.parse("This is a sentence")
        second = space.parse("This is another sentence")

        self.assertEqual([1, 2], first)
        self.assertEqual([3, 4], second)

        # Both measures are checked in both argument orders.
        both_orders = ((first, second), (second, first))

        for left, right in both_orders:
            self.assertEqual(0, overlap(left, right))

        for left, right in both_orders:
            self.assertEqual(0, jaccard(left, right))
    def test_bigram(self):
        """Check parse ids, overlap and Jaccard in an ``NGramSpace(2)``.

        The two sentences are expected to parse to ``[1, 2, 3]`` and
        ``[1, 4, 5]``: exactly one id is shared out of five distinct ids,
        so the overlap is 1 and the Jaccard value is 1/5 in either
        argument order.
        """
        space = NGramSpace(2)

        # Parse order matters: the expected ids below depend on the first
        # sentence being seen by the space before the second one.
        first = space.parse("This is a sentence")
        second = space.parse("This is another sentence")

        self.assertEqual([1, 2, 3], first)
        self.assertEqual([1, 4, 5], second)

        # Both measures are checked in both argument orders.
        both_orders = ((first, second), (second, first))

        for left, right in both_orders:
            self.assertEqual(1, overlap(left, right))

        expected_jaccard = 1.0 / 5.0
        for left, right in both_orders:
            self.assertEqual(expected_jaccard, jaccard(left, right))
    def test_trigram(self):
        """Check parse ids, overlap and Jaccard in an ``NGramSpace(3)``.

        The two four-word sentences are expected to parse to ``[1, 2]``
        and ``[3, 4]``: no id is shared, so both similarity measures
        must be zero regardless of argument order.
        """
        space = NGramSpace(3)

        # Parse order matters: the expected ids below depend on the first
        # sentence being seen by the space before the second one.
        first = space.parse('This is a sentence')
        second = space.parse('This is another sentence')

        self.assertEqual([1, 2], first)
        self.assertEqual([3, 4], second)

        # Both measures are checked in both argument orders.
        both_orders = ((first, second), (second, first))

        for left, right in both_orders:
            self.assertEqual(0, overlap(left, right))

        for left, right in both_orders:
            self.assertEqual(0, jaccard(left, right))
    def test_bigram(self):
        """Check parse ids, overlap and Jaccard in an ``NGramSpace(2)``.

        The two sentences are expected to parse to ``[1, 2, 3]`` and
        ``[1, 4, 5]``: exactly one id is shared out of five distinct ids,
        so the overlap is 1 and the Jaccard value is 1/5 in either
        argument order.
        """
        space = NGramSpace(2)

        # Parse order matters: the expected ids below depend on the first
        # sentence being seen by the space before the second one.
        first = space.parse('This is a sentence')
        second = space.parse('This is another sentence')

        self.assertEqual([1, 2, 3], first)
        self.assertEqual([1, 4, 5], second)

        # Both measures are checked in both argument orders.
        both_orders = ((first, second), (second, first))

        for left, right in both_orders:
            self.assertEqual(1, overlap(left, right))

        expected_jaccard = 1.0 / 5.0
        for left, right in both_orders:
            self.assertEqual(expected_jaccard, jaccard(left, right))
 def __init__(self, docs):
     """Build the pairwise distance matrix and merge zero-distance pairs.

     docs -- indexable collection supporting ``len()``; its elements are
             handed pairwise to ``jaccard``.

     Sets ``num_docs``, ``assignments`` (document ``i`` starts out
     assigned to ``i``) and ``distance`` (``1.0 - jaccard`` for every
     pair, diagonal included).  Afterwards every pair whose distance is
     exactly 0 and whose assignments still differ is passed to
     ``self.merge``.
     """
     self.num_docs = len(docs)
     self.assignments = range(self.num_docs)

     self.distance = SymmetricMatrix(self.num_docs)

     # Denominator shown in the progress message.  It is only approximate:
     # the loops below visit n * (n + 1) / 2 pairs, diagonal included.
     reported_total = self.num_docs * self.num_docs / 2

     computed = 0
     for row in range(self.num_docs):
         # Only the lower triangle (col <= row) is filled in.
         for col in range(row + 1):
             similarity = jaccard(docs[row], docs[col])
             self.distance[row, col] = 1.0 - similarity

             computed += 1
             if computed % 1000000 == 0:
                 print("Computed %d distances out of %d..." % (computed, reported_total))

     # Second pass over the strict lower triangle: merge pairs at distance 0.
     for row in range(self.num_docs):
         for col in range(row):
             if self.distance[row, col] == 0:
                 # Assignments are re-read on every iteration because
                 # earlier merge() calls may have changed them.
                 if self.assignments[row] != self.assignments[col]:
                     self.merge(row, col)
    def __init__(self, docs):
        """Build the pairwise distance matrix and merge zero-distance pairs.

        docs -- indexable collection supporting ``len()``; its elements are
                handed pairwise to ``jaccard``.

        Sets ``num_docs``, ``assignments`` (document ``i`` starts out
        assigned to ``i``) and ``distance`` (``1.0 - jaccard`` for every
        pair, diagonal included).  Afterwards every pair whose distance is
        exactly 0 and whose assignments still differ is passed to
        ``self.merge``.
        """
        self.num_docs = len(docs)
        self.assignments = range(self.num_docs)

        self.distance = SymmetricMatrix(self.num_docs)

        # Denominator shown in the progress message.  It is only
        # approximate: the loops below visit n * (n + 1) / 2 pairs,
        # diagonal included.
        reported_total = self.num_docs * self.num_docs / 2

        computed = 0
        for row in range(self.num_docs):
            # Only the lower triangle (col <= row) is filled in.
            for col in range(row + 1):
                similarity = jaccard(docs[row], docs[col])
                self.distance[row, col] = 1.0 - similarity

                computed += 1
                if computed % 1000000 == 0:
                    print("Computed %d distances out of %d..." % (
                        computed, reported_total))

        # Second pass over the strict lower triangle: merge pairs at
        # distance 0.
        for row in range(self.num_docs):
            for col in range(row):
                if self.distance[row, col] == 0:
                    # Assignments are re-read on every iteration because
                    # earlier merge() calls may have changed them.
                    if self.assignments[row] != self.assignments[col]:
                        self.merge(row, col)