Exemple #1
0
class TeeTestCase(unittest.TestCase):
    """Checks for ``Tee`` wrapped around an ``Identity`` stage."""

    def setUp(self):
        """Build the single fixture document used by every test."""
        body = "A test of a unique filter pipeline."
        self.d1 = Document({"id": "1", "body": body})

    def test_document(self):
        """The raw fixture yields eight tokens, the second being ``"test"``."""
        tokens = list(self.d1.words())
        self.assertEqual(len(tokens), 8)
        self.assertEqual(tokens[1], "test")

    def document_test(self, words):
        """Shared assertion helper for a token sequence.

        The name does not start with ``test``, so it only runs when a test
        method calls it explicitly.

        :param words: sized, indexable sequence of tokens to verify; it must
            hold eight items, the first four being "a", "test", "of", "a".
        """
        self.assertEqual(len(words), 8)
        for position, expected in enumerate(("a", "test", "of", "a")):
            self.assertEqual(words[position], expected)

    def test_tee(self):
        """Both ``process()`` and ``alternate()`` must pass ``document_test``."""
        tee = Tee(Identity())
        # Drain the main stream completely before reading the alternate one.
        primary = list(tee.process(self.d1.words()))
        self.document_test(primary)
        secondary = list(tee.alternate())
        self.document_test(secondary)
class HtmlCleanerTestCase(unittest.TestCase):
    """Checks for ``HtmlCleaner`` on both documents and raw strings."""

    def setUp(self):
        """Create one document containing a ``<br/>`` tag, plus a list holding it."""
        body = "A test of a html cleaner pipeline.<br/> Another sentence."
        self.d1 = Document({"id": "1", "body": body})
        self.docs = [self.d1]

    def test_document(self):
        """Before cleaning, the fixture yields 13 tokens."""
        tokens = list(self.d1.words())
        self.assertEqual(len(tokens), 13)
        self.assertEqual(tokens[1], "test")

    def test_html_cleaner_docs(self):
        """Cleaning a list of documents leaves 11 tokens in the first result."""
        cleaned = list(HtmlCleaner().process(self.docs))
        tokens = list(cleaned[0].words())
        self.assertEqual(len(tokens), 11)
        for position, expected in ((6, "pipeline"), (7, "."), (8, "another")):
            self.assertEqual(tokens[position], expected)

    def test_html_cleaner_strings(self):
        """Cleaning a raw body string yields a 53-item result.

        Positions 33-35 are expected to be ".", a newline and a space, which
        is consistent with the ``<br/>`` tag being replaced by one newline.
        """
        cleaned = list(HtmlCleaner().process([self.d1["body"]]))
        text = cleaned[0]
        self.assertEqual(len(text), 53)
        for position, expected in ((33, "."), (34, "\n"), (35, " ")):
            self.assertEqual(text[position], expected)
Exemple #3
0
class PipelineTestCase(unittest.TestCase):
    """
    Run a ``GeneratorPipeline`` made of several chained stages and check
    both its final output and what an intermediate ``Recorder`` captured.
    """

    def setUp(self):
        """Create two fixture documents and a corpus containing both."""
        self.d1 = Document({"id": "1", "body": "A stopword test."})
        self.d2 = Document({"id": "2", "body": "A stopword test and a unique test."})
        self.corpus = Corpus([self.d1, self.d2])

    def test_pipeline(self):
        """Uniq -> Recorder -> StopwordFilter over the second document."""
        recorder = Recorder([])
        stages = [Uniq(), recorder, StopwordFilter()]
        pipeline = GeneratorPipeline(stages)

        # Materialise the output so the whole stream has passed through the
        # recorder before its data is inspected below.
        final_tokens = list(pipeline.process(self.d2.words()))
        self.assertEqual(len(final_tokens), 4)
        for position, expected in enumerate(("stopword", "test", "unique", ".")):
            self.assertEqual(final_tokens[position], expected)

        # The recorder sits between Uniq and StopwordFilter in the stage list.
        recorded = recorder.get_data()
        self.assertEqual(len(recorded), 6)
        expected_recorded = ("a", "stopword", "test", "and", "unique", ".")
        for position, expected in enumerate(expected_recorded):
            self.assertEqual(recorded[position], expected)
Exemple #4
0
class StopwordFilterTestCase(unittest.TestCase):
    """Checks a ``Document`` built with a ``StopwordFilter`` attached."""

    def setUp(self):
        """Create a document whose second constructor argument is a filter list."""
        filters = [StopwordFilter()]
        self.d1 = Document({"id": "1", "body": "A stopword test."}, filters)

    def test_document(self):
        """The filtered document yields three tokens, starting with "stopword"."""
        tokens = list(self.d1.words())
        self.assertEqual(len(tokens), 3)
        self.assertEqual(tokens[0], "stopword")
Exemple #5
0
class RedisUniqTestCase(unittest.TestCase):
    """Checks for the ``RedisUniq`` pipeline stage."""

    def setUp(self):
        """Create a fixture document whose body repeats the word "a"."""
        body = "A test of a unique filter pipeline."
        self.d1 = Document({"id": "1", "body": body})

    def test_document(self):
        """The raw fixture yields eight tokens, the second being ``"test"``."""
        tokens = list(self.d1.words())
        self.assertEqual(len(tokens), 8)
        self.assertEqual(tokens[1], "test")

    def test_redis_uniq(self):
        """After ``RedisUniq`` seven tokens remain and the second "a" is gone."""
        stage = RedisUniq()
        deduplicated = list(stage.process(self.d1.words()))
        self.assertEqual(len(deduplicated), 7)
        for position, expected in enumerate(("a", "test", "of", "unique")):
            self.assertEqual(deduplicated[position], expected)
class StdoutWriterTestCase(unittest.TestCase):
    """Fixture sanity check; no writer object is exercised in this class."""

    def setUp(self):
        """Create the single fixture document."""
        body = "A test of a unique filter pipeline."
        self.d1 = Document({"id": "1", "body": body})

    def test_document(self):
        """The fixture yields eight tokens, the second being ``"test"``."""
        tokens = list(self.d1.words())
        self.assertEqual(len(tokens), 8)
        self.assertEqual(tokens[1], "test")
Exemple #7
0
class DocumentTestCase(unittest.TestCase):
    """Checks for ``Document``: construction, ``str()``, text updates,
    tokenisation and n-gram generation."""

    def setUp(self):
        """Create the fixture documents shared by the tests."""
        self.d1 = Document({"id": "1", "body": "This this."})
        self.d2 = Document({"id": "2", "body": "This is another test document."})
        self.d3 = Document({"id": "3", "body": "Two words."})
        # NOTE(review): d4 reuses id "3" (same as d3) -- possibly meant "4".
        # Left unchanged because no test here reads d4's id.
        self.d4 = Document({"id": "3", "body": "Derp a derp.\nA derp derp."})

    def test_init(self):
        """A document can be built from a dict or from a plain string."""
        d1 = Document({"id": "1", "body": "This this."})
        self.assertEqual(d1.doc_id, "1")
        self.assertEqual(str(d1), "This this.")

        # Previously this re-checked d1, leaving the plain-string
        # constructor path unverified.
        d2 = Document("This this.")
        self.assertEqual(str(d2), "This this.")

    def test___str__(self):
        """``str(document)`` returns the body text it was built with."""
        self.assertEqual(str(self.d1), "This this.")
        self.assertEqual(str(self.d2), "This is another test document.")
        self.assertEqual(str(self.d3), "Two words.")

    def test_update_text(self):
        """``update_text`` replaces what ``str(document)`` returns."""
        self.assertEqual(str(self.d1), "This this.")
        self.d1.update_text("more words")
        self.assertEqual(str(self.d1), "more words")

    def test_words(self):
        """``words()`` yields lower-cased tokens with "." as its own token,
        continuing across the newline in the fixture body."""
        words = list(self.d4.words())
        self.assertEqual(len(words), 8)
        expected_tokens = ("derp", "a", "derp", ".", "a", "derp", "derp", ".")
        for position, expected in enumerate(expected_tokens):
            self.assertEqual(words[position], expected)

    def test_to_ngrams(self):
        """``to_ngrams(2)`` returns the list of adjacent token pairs."""
        ngrams = self.d4.to_ngrams(2)
        self.assertEqual(ngrams, [("derp", "a"), ("a", "derp"), ("derp", "."),
                                  (".", "a"), ("a", "derp"), ("derp", "derp"),
                                  ("derp", ".")])