Esempio n. 1
0
 def process(self, documents):
     """Yield n-gram hashes for each item in ``documents``.

     Each item may be a ``Document`` or a plain ``str``; a ``str`` is wrapped
     in a ``Document`` before its n-grams are taken. The n-grams of size
     ``self.ngram_size`` are de-duplicated and every unique n-gram is hashed
     with the built-in ``hash``. The order of the resulting hashes follows
     set iteration order and carries no meaning.

     If ``self.attribute`` is truthy, the hash list is stored on the document
     via ``set(self.attribute, hashes)`` and the document itself is yielded
     (for ``str`` input, the wrapping ``Document``). Otherwise the bare list
     of hashes is yielded.

     Raises:
         TypeError: if an item is neither a ``Document`` nor a ``str``.
     """
     for document in documents:
         if isinstance(document, Document):
             doc = document
         elif isinstance(document, str):
             # Keep the wrapper: the attribute branch below needs a real
             # Document to call ``set`` on, which a raw str does not provide.
             doc = Document(document)
         else:
             # Fail loudly; otherwise ``ngrams`` would be unbound on the
             # first item or silently reused from the previous item.
             raise TypeError(
                 "expected Document or str, got %s" % type(document).__name__
             )

         # The builtin frozenset replaces ``sets.ImmutableSet`` (the ``sets``
         # module no longer exists in Python 3).
         unique_ngrams = frozenset(doc.to_ngrams(self.ngram_size))
         hashes = [hash(ng) for ng in unique_ngrams]

         if self.attribute:
             doc.set(self.attribute, hashes)
             yield doc
         else:
             yield hashes
Esempio n. 2
0
class DocumentTestCase(unittest.TestCase):
    """Unit tests for ``Document``: construction, text access and tokenising."""

    def setUp(self):
        """Create the fixture documents shared by the tests below."""
        self.d1 = Document({"id": "1", "body": "This this."})
        self.d2 = Document({"id": "2", "body": "This is another test document."})
        self.d3 = Document({"id": "3", "body": "Two words."})
        # NOTE(review): d4 reuses id "3" from d3 -- possibly a copy/paste slip;
        # no test here reads d4's id, so it is left unchanged. Confirm intent.
        self.d4 = Document({"id": "3", "body": "Derp a derp.\nA derp derp."})

    def test_init(self):
        """A Document can be built from a dict or from a plain string."""
        d1 = Document({"id": "1", "body": "This this."})
        self.assertEqual(d1.doc_id, "1")
        self.assertEqual(str(d1), "This this.")
        # Check the string-constructed document itself; the earlier version
        # re-asserted on d1 and so never exercised this path.
        d2 = Document("This this.")
        self.assertEqual(str(d2), "This this.")

    def test___str__(self):
        """``str(document)`` returns the document body."""
        self.assertEqual(str(self.d1), "This this.")
        self.assertEqual(str(self.d2), "This is another test document.")
        self.assertEqual(str(self.d3), "Two words.")

    def test_update_text(self):
        """``update_text`` replaces the text returned by ``str``."""
        self.assertEqual(str(self.d1), "This this.")
        self.d1.update_text("more words")
        self.assertEqual(str(self.d1), "more words")

    def test_words(self):
        """``words`` yields the expected token sequence for the d4 fixture."""
        words = list(self.d4.words())
        # One list comparison checks both the length and every position.
        self.assertEqual(
            words,
            ["derp", "a", "derp", ".", "a", "derp", "derp", "."],
        )

    def test_to_ngrams(self):
        """``to_ngrams(2)`` returns consecutive token pairs as tuples."""
        ngrams = self.d4.to_ngrams(2)
        self.assertEqual(ngrams, [("derp", "a"), ("a", "derp"), ("derp", "."),
                                  (".", "a"), ("a", "derp"), ("derp", "derp"),
                                  ("derp", ".")])