Example #1
class TeeTestCase(unittest.TestCase):
    def setUp(self):
        self.d1 = Document({"id": "1",
                            "body": "A test of a unique filter pipeline."})

    def test_document(self):
        words = []
        for word in self.d1.words():
            words.append(word)
        self.assertEqual(len(words), 8)
        self.assertEqual(words[1], "test")

    def document_test(self, words):
        self.assertEqual(len(words), 8)
        self.assertEqual(words[0], "a")
        self.assertEqual(words[1], "test")
        self.assertEqual(words[2], "of")
        self.assertEqual(words[3], "a")

    def test_tee(self):
        identity = Identity()
        t = Tee(identity)
        words = list(t.process(self.d1.words()))
        self.document_test(words)
        words2 = list(t.alternate())
        self.document_test(words2)
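
Judging by the assertions, Tee buffers the tokens it passes through so alternate() can replay the same stream, much like itertools.tee splits one iterator into two. A self-contained standard-library analogue (not the project's Tee class):

import itertools

stream = iter(["a", "test", "of", "a"])
main, alternate = itertools.tee(stream)
print(list(main))       # ['a', 'test', 'of', 'a']
print(list(alternate))  # ['a', 'test', 'of', 'a']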
Example #2
 def setUp(self):
     self.d1 = Document({"id": "1", "body": "A stopword test."})
     self.d1.set("categories", ["stopwords"])
     self.d2 = Document({"id": "2", "body": "A stopword test and a unique test."})
     self.d2.set("categories", ["stopwords"])
     self.d3 = Document({"id": "3", "body": "A parsing test."})
     self.d3.set("categories", ["parsing"])
     self.docs = [self.d1, self.d2, self.d3]
Example #3
 def process(self, documents):
     for document in documents:
         if isinstance(document, Document):
             ngrams = document.to_ngrams(self.ngram_size)
         elif isinstance(document, str):
             # Wrap bare strings in a Document so set() below works on them
             document = Document(document)
             ngrams = document.to_ngrams(self.ngram_size)
         else:
             # Skip anything that is neither a Document nor a string;
             # otherwise ngrams would be unbound below
             continue
         # frozenset replaces the old sets.ImmutableSet, removed in Python 3
         uniq = frozenset(ngrams)
         hashes = [hash(ng) for ng in uniq]
         if self.attribute:
             document.set(self.attribute, hashes)
             yield document
         else:
             yield hashes
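
To make the dedupe-then-hash step concrete, here is what frozenset plus hash does to a bigram list (plain Python, no project classes):

ngrams = [("derp", "a"), ("a", "derp"), ("derp", "."), ("a", "derp")]
uniq = frozenset(ngrams)           # drops the duplicate ("a", "derp")
hashes = [hash(ng) for ng in uniq]
print(len(ngrams), len(uniq))      # 4 3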
Example #4
class HtmlCleanerTestCase(unittest.TestCase):
    def setUp(self):
        self.d1 = Document({"id": "1",
                            "body": "A test of a html cleaner pipeline.<br/> Another sentence."})
        self.docs = [self.d1]

    def test_document(self):
        words = []
        for word in self.d1.words():
            words.append(word)
        self.assertEqual(len(words), 13)
        self.assertEqual(words[1], "test")

    def test_html_cleaner_docs(self):
        r = HtmlCleaner()
        docs = list(r.process(self.docs))
        words = list(docs[0].words())
        self.assertEqual(len(words), 11)
        self.assertEqual(words[6], "pipeline")
        self.assertEqual(words[7], ".")
        self.assertEqual(words[8], "another")

    def test_html_cleaner_strings(self):
        r = HtmlCleaner()
        docs = list(r.process([self.d1["body"]]))
        words = docs[0]
        self.assertEqual(len(words), 53)
        self.assertEqual(words[33], ".")
        self.assertEqual(words[34], "\n")
        self.assertEqual(words[35], " ")
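
The string test shows HtmlCleaner replaces markup such as <br/> with a newline rather than deleting it outright (hence the "\n" expected at index 34, and 53 characters in total). A crude regex stand-in reproduces those numbers; real HTML cleaning generally warrants a proper parser:

import re

body = "A test of a html cleaner pipeline.<br/> Another sentence."
cleaned = re.sub(r"<br\s*/?>", "\n", body)
print(len(cleaned))          # 53
print(repr(cleaned[33:36]))  # '.\n '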
Example #5
class PipelineTestCase(unittest.TestCase):
    """
    Test a simple pipeline with several pipeline modules
    """
    def setUp(self):
        self.d1 = Document({"id": "1", "body": "A stopword test."})
        self.d2 = Document({"id": "2", "body": "A stopword test and a unique test."})
        self.corpus = Corpus([self.d1, self.d2])

    def test_pipeline(self):
        recorder = Recorder([])
        pipeline = GeneratorPipeline([Uniq(), recorder, StopwordFilter()])
        words = pipeline.process(self.d2.words())
        word_list = list(words)
        self.assertEqual(len(word_list), 4)
        self.assertEqual(word_list[0], "stopword")
        self.assertEqual(word_list[1], "test")
        self.assertEqual(word_list[2], "unique")
        self.assertEqual(word_list[3], ".")

        word_list = recorder.get_data()
        self.assertEqual(len(word_list), 6)
        self.assertEqual(word_list[0], "a")
        self.assertEqual(word_list[1], "stopword")
        self.assertEqual(word_list[2], "test")
        self.assertEqual(word_list[3], "and")
        self.assertEqual(word_list[4], "unique")
        self.assertEqual(word_list[5], ".")
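
The assertions pin down the ordering: Recorder sits after Uniq but before StopwordFilter, so it records six tokens while the final output keeps four. A minimal sketch of how a generator pipeline like this can chain stages (the real GeneratorPipeline may differ):

class MiniPipeline:
    def __init__(self, modules):
        self.modules = modules

    def process(self, stream):
        # Each stage's process() wraps the previous stage's generator,
        # so tokens are pulled lazily through the whole chain.
        for module in self.modules:
            stream = module.process(stream)
        return stream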
Example #6
 def __init__(self, data):
     self.length = 0
     # "+0000" is matched literally rather than via %z, so the parsed
     # datetime is naive; the timedelta then shifts it to UTC-8 by hand
     data["created_time"] = datetime.datetime.strptime(
         data["created_time"], "%Y-%m-%dT%H:%M:%S+0000") + \
         datetime.timedelta(hours=-8)
     time_as_int = self.time_to_int(data["created_time"])
     d = {}
     d["time_as_int"] = time_as_int
     if "message" in data:
         d["message"] = data["message"]
         self.length = len(data["message"])
     d["metadata"] = data.copy()
     self.document = d.copy()
     if "id" in data:
         self.doc_id = data["id"]
     Document.__init__(self, data)
     self.body_attribute = FacebookDocument.BodyAttribute
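
A standalone check of that timestamp handling (standard-library datetime only; the values are illustrative):

import datetime

ts = datetime.datetime.strptime("2018-10-10T20:19:24+0000",
                                "%Y-%m-%dT%H:%M:%S+0000")
print(ts + datetime.timedelta(hours=-8))  # 2018-10-10 12:19:24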
Example #7
class StopwordFilterTestCase(unittest.TestCase):
    def setUp(self):
        self.d1 = Document({"id": "1", "body": "A stopword test."},
                           [StopwordFilter()])

    def test_document(self):
        words = []
        for word in self.d1.words():
            words.append(word)
        self.assertEqual(len(words), 3)
        self.assertEqual(words[0], "stopword")
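
The filter drops the stopword "a", leaving three tokens. A minimal stand-in built on NLTK's English stopword list (assuming that is what StopwordFilter uses; requires nltk.download("stopwords")):

from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words("english"))

def stopword_filter(tokens):
    return [t for t in tokens if t not in STOPWORDS]

print(stopword_filter(["a", "stopword", "test", "."]))
# ['stopword', 'test', '.']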
Example #8
class RedisUniqTestCase(unittest.TestCase):
    def setUp(self):
        self.d1 = Document({"id": "1",
                            "body": "A test of a unique filter pipeline."})

    def test_document(self):
        words = []
        for word in self.d1.words():
            words.append(word)
        self.assertEqual(len(words), 8)
        self.assertEqual(words[1], "test")

    def test_redis_uniq(self):
        r = RedisUniq()
        words = list(r.process(self.d1.words()))
        self.assertEqual(len(words), 7)
        self.assertEqual(words[0], "a")
        self.assertEqual(words[1], "test")
        self.assertEqual(words[2], "of")
        self.assertEqual(words[3], "unique")
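
The expected output is order-preserving deduplication: the second "a" is dropped and the survivors keep their relative order. RedisUniq presumably tracks seen tokens in Redis; the core idea in plain Python is:

def uniq(tokens):
    seen = set()
    for token in tokens:
        if token not in seen:
            seen.add(token)
            yield token

print(list(uniq(["a", "test", "of", "a", "unique", "filter", "pipeline", "."])))
# ['a', 'test', 'of', 'unique', 'filter', 'pipeline', '.']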
Example #9
class StdoutWriterTestCase(unittest.TestCase):
    def setUp(self):
        self.d1 = Document({"id": "1",
                            "body": "A test of a unique filter pipeline."})

    def test_document(self):
        words = []
        for word in self.d1.words():
            words.append(word)
        self.assertEqual(len(words), 8)
        self.assertEqual(words[1], "test")
Example #10
class DocumentTestCase(unittest.TestCase):
    def setUp(self):
        self.d1 = Document({"id": "1", "body": "This this."})
        self.d2 = Document({"id": "2", "body": "This is another test document."})
        self.d3 = Document({"id": "3", "body": "Two words."})
        self.d4 = Document({"id": "4", "body": "Derp a derp.\nA derp derp."})

    def test_init(self):
        d1 = Document({"id": "1", "body": "This this."})
        self.assertEqual(d1.doc_id, "1")
        self.assertEqual(str(d1), "This this.")
        d2 = Document("This this.")
        self.assertEqual(str(d2), "This this.")

    def test___str__(self):
        self.assertEqual(str(self.d1), "This this.")
        self.assertEqual(str(self.d2), "This is another test document.")
        self.assertEqual(str(self.d3), "Two words.")

    def test_update_text(self):
        self.assertEqual(str(self.d1), "This this.")
        self.d1.update_text("more words")
        self.assertEqual(str(self.d1), "more words")

    def test_words(self):
        words = list(self.d4.words())
        self.assertEqual(len(words), 8)
        self.assertEqual(words[0], "derp")
        self.assertEqual(words[1], "a")
        self.assertEqual(words[2], "derp")
        self.assertEqual(words[3], ".")
        self.assertEqual(words[4], "a")
        self.assertEqual(words[5], "derp")
        self.assertEqual(words[6], "derp")
        self.assertEqual(words[7], ".")

    def test_to_ngrams(self):
        ngrams = self.d4.to_ngrams(2)
        self.assertEqual(ngrams, [("derp", "a"), ("a", "derp"), ("derp", "."),
                                  (".", "a"), ("a", "derp"), ("derp", "derp"),
                                  ("derp", ".")])
Example #11
 def __init__(self, data):
     self.length = 0
     d = {}
     if "created_at" in data:
         # dateutil's parser handles Twitter's created_at format directly,
         # with no explicit format string
         data["created_time"] = parser.parse(data["created_at"])
         d["created_time"] = data["created_time"]
     if "text" in data:
         d["text"] = data["text"]
         self.length = len(data["text"])
     d["metadata"] = data.copy()
     self.document = d.copy()
     if "id" in data:
         self.doc_id = str(data["id"])
     self.body_attribute = TwitterDocument.BodyAttribute
     Document.__init__(self, data)
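
The parser here is presumably python-dateutil's; a standalone check with Twitter's classic timestamp format:

from dateutil import parser

created = parser.parse("Wed Oct 10 20:19:24 +0000 2018")
print(created.isoformat())  # 2018-10-10T20:19:24+00:00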
Example #12
class UnigramIndexTestCase(unittest.TestCase):
    def setUp(self):
        self.d1 = Document({"id": "1", "body": "This this."})
        self.d2 = Document({"id": "2", "body": "This is another test document."})
        self.d3 = Document({"id": "3", "body": "Two words."})
        self.d4 = Document({"id": "4", "body": "Derp a derp.\nA derp derp."})

    def test_index(self):
        self.assertEqual(self.d1._index, None)
        self.assertEqual(self.d2._index, None)
        self.assertEqual(self.d3._index, None)
        self.d1.index()
        self.d2.index()
        self.d3.index()
        d1_index = self.d1._index
        d2_index = self.d2._index
        d3_index = self.d3._index
        self.assertNotEqual(d1_index._freq_dist, None)
        self.assertNotEqual(d2_index._freq_dist, None)
        self.assertNotEqual(d3_index._freq_dist, None)

        # periods are considered words by default in NLTK
        self.assertEqual(len(d1_index._freq_dist), 2)
        self.assertEqual(d1_index._freq_dist["this"], 2)
        self.assertEqual(d1_index._freq_dist["."], 1)

        self.assertEqual(len(d2_index._freq_dist), 6)
        self.assertEqual(d2_index._freq_dist["this"], 1)
        self.assertEqual(d2_index._freq_dist["is"], 1)
        self.assertEqual(d2_index._freq_dist["another"], 1)
        self.assertEqual(d2_index._freq_dist["test"], 1)
        self.assertEqual(d2_index._freq_dist["document"], 1)
        self.assertEqual(d2_index._freq_dist["."], 1)

        self.assertEqual(len(d3_index._freq_dist), 3)
        self.assertEqual(d3_index._freq_dist["two"], 1)
        self.assertEqual(d3_index._freq_dist["words"], 1)
        self.assertEqual(d3_index._freq_dist["."], 1)
Example #13
class CategoryToCorpusTestCase(unittest.TestCase):
    """
    Test the CategoryToCorpus module in combined and separated modes
    """
    def setUp(self):
        self.d1 = Document({"id": "1", "body": "A stopword test."})
        self.d1.set("categories", ["stopwords"])
        self.d2 = Document({"id": "2", "body": "A stopword test and a unique test."})
        self.d2.set("categories", ["stopwords"])
        self.d3 = Document({"id": "3", "body": "A parsing test."})
        self.d3.set("categories", ["parsing"])
        self.docs = [self.d1, self.d2, self.d3]

    def test_category_to_corpus_combined(self):
        """
        Test combined mode, which concatenates documents in the same category
        into a single document
        """
        category_to_corpus = CategoryToCorpus()
        pipeline = GeneratorPipeline([category_to_corpus])
        docs = pipeline.process(self.docs)
        # TODO: add a sink module or something similar to thread all docs/words
        # through a pipeline
        for doc in docs:
            pass
        # Get the generated corpus
        corpus = category_to_corpus.post_process()
        self.assertEqual(len(corpus.categories()), 2)
        stopwords_docs = corpus["stopwords"]
        parsing_docs = corpus["parsing"]

        word_list = list(stopwords_docs.words())
        self.assertEqual(len(word_list), 12)
        self.assertEqual(word_list[0], "a")
        self.assertEqual(word_list[1], "stopword")
        self.assertEqual(word_list[2], "test")
        self.assertEqual(word_list[3], ".")
        self.assertEqual(word_list[4], "a")
        self.assertEqual(word_list[5], "stopword")
        self.assertEqual(word_list[6], "test")
        self.assertEqual(word_list[7], "and")
        self.assertEqual(word_list[8], "a")
        self.assertEqual(word_list[9], "unique")
        self.assertEqual(word_list[10], "test")
        self.assertEqual(word_list[11], ".")

        word_list = list(parsing_docs.words())
        self.assertEqual(len(word_list), 4)
        self.assertEqual(word_list[0], "a")
        self.assertEqual(word_list[1], "parsing")
        self.assertEqual(word_list[2], "test")
        self.assertEqual(word_list[3], ".")

    def test_category_to_corpus_separated(self):
        """
        Test separated mode, which puts documents in the same category into different
        corpora
        """
        category_to_corpus = CategoryToCorpus(None, None, "categories", None,
                                              "separated")
        pipeline = GeneratorPipeline([category_to_corpus])
        docs = pipeline.process(self.docs)
        # TODO: add a sink module or something similar to thread all docs/words
        # through a pipeline
        for doc in docs:
            pass
        # Get the generated corpus
        corpora = category_to_corpus.post_process()
        self.assertEqual(len(corpora), 2)
        self.assertEqual(len(corpora["stopwords"]), 2)
        self.assertEqual(len(corpora["parsing"]), 1)
Example #14
 def setUp(self):
     self.d1 = Document({"id": "1", "body": "A stopword test."},
                        [StopwordFilter()])
Example #15
 def setUp(self):
     self.d1 = Document({"id": "1", "body": "This this."})
     self.d2 = Document({"id": "2", "body": "This is another test document."})
     self.d3 = Document({"id": "3", "body": "Two words."})
     self.d4 = Document({"id": "4", "body": "Derp a derp.\nA derp derp."})
Example #16
 def setUp(self):
     self.d1 = Document({"id": "1",
                         "body": "A test of a unique filter pipeline."})
Example #17
 def setUp(self):
     self.d1 = Document({"id": "1", "body": "A stopword test."})
     self.d2 = Document({"id": "2", "body": "A stopword test and a unique test."})
     self.corpus = Corpus([self.d1, self.d2])
Example #18
 def setUp(self):
     self.d1 = Document({"id": "1",
                         "body": "A test of a html cleaner pipeline.<br/> Another sentence."})
     self.docs = [self.d1]
Example #19
 def setUp(self):
     self.d1 = Document({"id": "1", "body": "A stopword test."},
                        [RegexpFilter("^.$")])