class TeeTestCase(unittest.TestCase):
    """Check that Tee.process and Tee.alternate both yield the fixture's words."""

    def setUp(self):
        """Create the single fixture document used by every test."""
        payload = {"id": "1", "body": "A test of a unique filter pipeline."}
        self.d1 = Document(payload)

    def test_document(self):
        """The unprocessed fixture yields eight words, the second being "test"."""
        words = list(self.d1.words())
        self.assertEqual(len(words), 8)
        self.assertEqual(words[1], "test")

    def document_test(self, words):
        """Assert that *words* has eight entries and starts "a test of a".

        Shared helper (not collected as a test: the name does not start
        with ``test``).
        """
        self.assertEqual(len(words), 8)
        for position, expected in enumerate(("a", "test", "of", "a")):
            self.assertEqual(words[position], expected)

    def test_tee(self):
        """Both the processed stream and the alternate stream pass document_test."""
        passthrough = Identity()
        tee = Tee(passthrough)
        primary = list(tee.process(self.d1.words()))
        self.document_test(primary)
        secondary = list(tee.alternate())
        self.document_test(secondary)
def setUp(self):
    """Create three documents, each tagged through its "categories" attribute.

    Documents 1 and 2 are tagged ["stopwords"], document 3 is tagged
    ["parsing"]; all three are also collected in ``self.docs``.
    """
    first = Document({"id": "1", "body": "A stopword test."})
    first.set("categories", ["stopwords"])
    self.d1 = first

    second = Document({"id": "2", "body": "A stopword test and a unique test."})
    second.set("categories", ["stopwords"])
    self.d2 = second

    third = Document({"id": "3", "body": "A parsing test."})
    third.set("categories", ["parsing"])
    self.d3 = third

    self.docs = [first, second, third]
def process(self, documents):
    """Hash the distinct n-grams of every incoming item.

    Each item of ``documents`` is either a ``Document`` or a plain string;
    strings are wrapped in a ``Document`` first. The item is split into
    n-grams of size ``self.ngram_size``, duplicate n-grams are dropped, and
    each remaining n-gram is hashed with the built-in ``hash``. The order
    of the resulting hash list follows set iteration order and is therefore
    not meaningful.

    Yields:
        If ``self.attribute`` is truthy: the document, after the hash list
        has been stored on it under that attribute name (for string input
        this is the wrapping ``Document``).
        Otherwise: the list of hashes itself.

    Raises:
        TypeError: if an item is neither a ``Document`` nor a ``str``.
    """
    for document in documents:
        if isinstance(document, Document):
            doc = document
        elif isinstance(document, str):
            doc = Document(document)
        else:
            # Previously this fell through and either raised
            # UnboundLocalError or silently reused the n-grams of the
            # preceding item.
            raise TypeError(
                "expected a Document or str, got %s" % type(document).__name__)

        ngrams = doc.to_ngrams(self.ngram_size)
        # The built-in frozenset replaces sets.ImmutableSet; the ``sets``
        # module no longer exists in Python 3.
        hashes = [hash(ngram) for ngram in frozenset(ngrams)]

        if self.attribute:
            # Store on ``doc`` rather than ``document``: a raw string has
            # no ``set`` method, so string input used to fail here.
            doc.set(self.attribute, hashes)
            yield doc
        else:
            yield hashes
class HtmlCleanerTestCase(unittest.TestCase):
    """Run HtmlCleaner over a Document and over a plain string."""

    def setUp(self):
        """Create one document whose body contains a ``<br/>`` tag."""
        payload = {
            "id": "1",
            "body": "A test of a html cleaner pipeline.<br/> Another sentence.",
        }
        self.d1 = Document(payload)
        self.docs = [self.d1]

    def test_document(self):
        """Without cleaning, the fixture yields thirteen words."""
        words = list(self.d1.words())
        self.assertEqual(len(words), 13)
        self.assertEqual(words[1], "test")

    def test_html_cleaner_docs(self):
        """After cleaning, the document yields eleven words."""
        cleaner = HtmlCleaner()
        cleaned = list(cleaner.process(self.docs))
        words = list(cleaned[0].words())
        self.assertEqual(len(words), 11)
        for position, expected in ((6, "pipeline"), (7, "."), (8, "another")):
            self.assertEqual(words[position], expected)

    def test_html_cleaner_strings(self):
        """A raw string body is cleaned to a 53-item result.

        The expected items at 33-35 (".", newline, space) correspond to the
        ``<br/>`` tag having been replaced by a newline.
        """
        cleaner = HtmlCleaner()
        raw_body = self.d1["body"]
        cleaned = list(cleaner.process([raw_body]))
        text = cleaned[0]
        self.assertEqual(len(text), 53)
        for position, expected in ((33, "."), (34, "\n"), (35, " ")):
            self.assertEqual(text[position], expected)
class PipelineTestCase(unittest.TestCase):
    """
    Test a simple pipeline with several pipeline modules
    """

    def setUp(self):
        """Create two documents and a corpus holding both."""
        short_doc = Document({"id": "1", "body": "A stopword test."})
        long_doc = Document({"id": "2", "body": "A stopword test and a unique test."})
        self.d1 = short_doc
        self.d2 = long_doc
        self.corpus = Corpus([short_doc, long_doc])

    def test_pipeline(self):
        """Chain Uniq -> Recorder -> StopwordFilter over the second document.

        The pipeline output is checked first; the recorder, which sits
        between Uniq and StopwordFilter, is then checked for the words it
        captured.
        """
        recorder = Recorder([])
        pipeline = GeneratorPipeline([Uniq(), recorder, StopwordFilter()])

        emitted = list(pipeline.process(self.d2.words()))
        self.assertEqual(len(emitted), 4)
        for position, expected in enumerate(("stopword", "test", "unique", ".")):
            self.assertEqual(emitted[position], expected)

        recorded = recorder.get_data()
        self.assertEqual(len(recorded), 6)
        recorded_expected = ("a", "stopword", "test", "and", "unique", ".")
        for position, expected in enumerate(recorded_expected):
            self.assertEqual(recorded[position], expected)
def __init__(self, data):
    """Build the document from a dict that must contain "created_time".

    ``data["created_time"]`` is parsed with the format
    ``%Y-%m-%dT%H:%M:%S+0000``, shifted back by eight hours, and written
    back into ``data`` (the caller's dict is modified in place).

    ``self.document`` receives "time_as_int", the optional "message", and a
    shallow copy of ``data`` under "metadata". ``self.length`` is the
    message length, or 0 when there is no message. ``self.doc_id`` is set
    only when ``data`` has an "id".
    """
    self.length = 0

    parsed = datetime.datetime.strptime(
        data["created_time"], "%Y-%m-%dT%H:%M:%S+0000")
    # NOTE(review): fixed -8h offset, presumably a local-time conversion -- confirm.
    data["created_time"] = parsed + datetime.timedelta(hours=-8)

    fields = {"time_as_int": self.time_to_int(data["created_time"])}
    if "message" in data:
        message = data["message"]
        fields["message"] = message
        self.length = len(message)
    fields["metadata"] = data.copy()
    self.document = dict(fields)

    if "id" in data:
        self.doc_id = data["id"]

    Document.__init__(self, data)
    # Assigned after the base initializer has run.
    self.body_attribute = FacebookDocument.BodyAttribute
class StopwordFilterTestCase(unittest.TestCase):
    """Check the words of a Document constructed with a StopwordFilter."""

    def setUp(self):
        """Create a document whose second constructor argument is [StopwordFilter()]."""
        payload = {"id": "1", "body": "A stopword test."}
        filters = [StopwordFilter()]
        self.d1 = Document(payload, filters)

    def test_document(self):
        """Three words remain and the first is "stopword"."""
        words = list(self.d1.words())
        self.assertEqual(len(words), 3)
        self.assertEqual(words[0], "stopword")
class RedisUniqTestCase(unittest.TestCase):
    """Check that RedisUniq drops the repeated word from the fixture."""

    def setUp(self):
        """Create the single fixture document."""
        payload = {"id": "1", "body": "A test of a unique filter pipeline."}
        self.d1 = Document(payload)

    def test_document(self):
        """The unprocessed fixture yields eight words."""
        words = list(self.d1.words())
        self.assertEqual(len(words), 8)
        self.assertEqual(words[1], "test")

    def test_redis_uniq(self):
        """After RedisUniq seven words remain, starting "a test of unique"."""
        uniq = RedisUniq()
        words = list(uniq.process(self.d1.words()))
        self.assertEqual(len(words), 7)
        for position, expected in enumerate(("a", "test", "of", "unique")):
            self.assertEqual(words[position], expected)
class StdoutWriterTestCase(unittest.TestCase):
    """Fixture sanity check; no writer module is exercised in this class."""

    def setUp(self):
        """Create the single fixture document."""
        payload = {"id": "1", "body": "A test of a unique filter pipeline."}
        self.d1 = Document(payload)

    def test_document(self):
        """The fixture yields eight words, the second being "test"."""
        words = list(self.d1.words())
        self.assertEqual(len(words), 8)
        self.assertEqual(words[1], "test")
class DocumentTestCase(unittest.TestCase):
    """Tests for Document construction, string conversion, words and n-grams."""

    def setUp(self):
        """Create four fixture documents (d3 and d4 both use id "3")."""
        self.d1 = Document({"id": "1", "body": "This this."})
        self.d2 = Document({"id": "2", "body": "This is another test document."})
        self.d3 = Document({"id": "3", "body": "Two words."})
        self.d4 = Document({"id": "3", "body": "Derp a derp.\nA derp derp."})

    def test_init(self):
        """A Document can be built from a dict or from a plain string."""
        from_dict = Document({"id": "1", "body": "This this."})
        self.assertEqual(from_dict.doc_id, "1")
        self.assertEqual(str(from_dict), "This this.")

        # This used to re-assert the dict-built document, so the
        # string-built one was never checked.
        from_text = Document("This this.")
        self.assertEqual(str(from_text), "This this.")

    def test___str__(self):
        """str() of a document gives back its body text."""
        self.assertEqual(str(self.d1), "This this.")
        self.assertEqual(str(self.d2), "This is another test document.")
        self.assertEqual(str(self.d3), "Two words.")

    def test_update_text(self):
        """update_text replaces the text returned by str()."""
        self.assertEqual(str(self.d1), "This this.")
        self.d1.update_text("more words")
        self.assertEqual(str(self.d1), "more words")

    def test_words(self):
        """words() yields lowercased tokens, with each period as its own token."""
        words = list(self.d4.words())
        expected = ["derp", "a", "derp", ".", "a", "derp", "derp", "."]
        self.assertEqual(len(words), 8)
        for position, token in enumerate(expected):
            self.assertEqual(words[position], token)

    def test_to_ngrams(self):
        """to_ngrams(2) returns every adjacent pair of words as a tuple."""
        ngrams = self.d4.to_ngrams(2)
        self.assertEqual(ngrams, [("derp", "a"),
                                  ("a", "derp"),
                                  ("derp", "."),
                                  (".", "a"),
                                  ("a", "derp"),
                                  ("derp", "derp"),
                                  ("derp", ".")])
def __init__(self, data):
    """Build the document from a dict of tweet fields.

    When ``data`` has "created_at", it is parsed with ``parser.parse`` and
    the result is written back into ``data`` under "created_time" (the
    caller's dict is modified in place) and copied into the document.

    ``self.document`` receives the optional "created_time" and "text", plus
    a shallow copy of ``data`` under "metadata". ``self.length`` is the
    text length, or 0 when there is no text. ``self.doc_id`` is the string
    form of ``data["id"]`` and is set only when that key is present.
    """
    self.length = 0
    fields = {}

    if "created_at" in data:
        created = parser.parse(data["created_at"])
        data["created_time"] = created
        fields["created_time"] = created

    if "text" in data:
        text = data["text"]
        fields["text"] = text
        self.length = len(text)

    fields["metadata"] = data.copy()
    self.document = dict(fields)

    if "id" in data:
        self.doc_id = str(data["id"])

    # Assigned before the base initializer runs.
    self.body_attribute = TwitterDocument.BodyAttribute
    Document.__init__(self, data)
class UnigramIndexTestCase(unittest.TestCase):
    """Check the per-document frequency distribution built by Document.index()."""

    def setUp(self):
        """Create four fixture documents (d3 and d4 both use id "3")."""
        bodies = {
            "d1": {"id": "1", "body": "This this."},
            "d2": {"id": "2", "body": "This is another test document."},
            "d3": {"id": "3", "body": "Two words."},
            "d4": {"id": "3", "body": "Derp a derp.\nA derp derp."},
        }
        self.d1 = Document(bodies["d1"])
        self.d2 = Document(bodies["d2"])
        self.d3 = Document(bodies["d3"])
        self.d4 = Document(bodies["d4"])

    def test_index(self):
        """Documents start unindexed; index() populates ``_index._freq_dist``."""
        documents = (self.d1, self.d2, self.d3)

        for document in documents:
            self.assertEqual(document._index, None)

        for document in documents:
            document.index()

        indexes = [document._index for document in documents]
        for index in indexes:
            self.assertNotEqual(index._freq_dist, None)

        # periods are considered words by default in NLTK
        expectations = (
            (2, (("this", 2), (".", 1))),
            (6, (("this", 1), ("is", 1), ("another", 1),
                 ("test", 1), ("document", 1), (".", 1))),
            (3, (("two", 1), ("words", 1), (".", 1))),
        )
        for index, (distinct, counts) in zip(indexes, expectations):
            self.assertEqual(len(index._freq_dist), distinct)
            for token, count in counts:
                self.assertEqual(index._freq_dist[token], count)
class CategoryToCorpusTestCase(unittest.TestCase):
    """
    Test the CategoryToCorpus pipeline module in combined and separated mode
    """

    def setUp(self):
        """Create two "stopwords" documents and one "parsing" document."""
        self.d1 = Document({"id": "1", "body": "A stopword test."})
        self.d1.set("categories", ["stopwords"])
        self.d2 = Document({"id": "2", "body": "A stopword test and a unique test."})
        self.d2.set("categories", ["stopwords"])
        self.d3 = Document({"id": "3", "body": "A parsing test."})
        self.d3.set("categories", ["parsing"])
        self.docs = [self.d1, self.d2, self.d3]

    # NOTE: this test used to be defined twice with identical bodies; the
    # second definition silently replaced the first, so only one is kept.
    def test_category_to_corpus_combined(self):
        """
        Test combined mode, which concatenates documents in the same category
        into a single document
        """
        category_to_corpus = CategoryToCorpus()
        pipeline = GeneratorPipeline([category_to_corpus])
        docs = pipeline.process(self.docs)

        # TODO: add a sink module or something similar to thread all docs/words
        # through a pipeline
        # The loop only drains ``docs`` so every document has passed through
        # the module before post_process() is called.
        for doc in docs:
            continue

        # Get the generated corpus
        corpus = category_to_corpus.post_process()
        self.assertEqual(len(corpus.categories()), 2)

        stopwords_docs = corpus["stopwords"]
        parsing_docs = corpus["parsing"]

        word_list = list(stopwords_docs.words())
        expected_stopwords = ["a", "stopword", "test", ".",
                              "a", "stopword", "test", "and",
                              "a", "unique", "test", "."]
        self.assertEqual(len(word_list), 12)
        for position, token in enumerate(expected_stopwords):
            self.assertEqual(word_list[position], token)

        word_list = list(parsing_docs.words())
        expected_parsing = ["a", "parsing", "test", "."]
        self.assertEqual(len(word_list), 4)
        for position, token in enumerate(expected_parsing):
            self.assertEqual(word_list[position], token)

    def test_category_to_corpus_separated(self):
        """
        Test separated mode, which puts documents in the same category into
        different corpora
        """
        category_to_corpus = CategoryToCorpus(None, None, "categories", None,
                                              "separated")
        pipeline = GeneratorPipeline([category_to_corpus])
        docs = pipeline.process(self.docs)

        # TODO: add a sink module or something similar to thread all docs/words
        # through a pipeline
        for doc in docs:
            continue

        # Get the generated corpus
        corpora = category_to_corpus.post_process()
        self.assertEqual(len(corpora), 2)
        self.assertEqual(len(corpora["stopwords"]), 2)
        self.assertEqual(len(corpora["parsing"]), 1)
def setUp(self):
    """Create a document whose second constructor argument is [StopwordFilter()]."""
    payload = {"id": "1", "body": "A stopword test."}
    filters = [StopwordFilter()]
    self.d1 = Document(payload, filters)
def setUp(self):
    """Create four fixture documents (the third and fourth both use id "3")."""
    first = {"id": "1", "body": "This this."}
    second = {"id": "2", "body": "This is another test document."}
    third = {"id": "3", "body": "Two words."}
    fourth = {"id": "3", "body": "Derp a derp.\nA derp derp."}
    self.d1 = Document(first)
    self.d2 = Document(second)
    self.d3 = Document(third)
    self.d4 = Document(fourth)
def setUp(self):
    """Create the single fixture document."""
    payload = {"id": "1", "body": "A test of a unique filter pipeline."}
    self.d1 = Document(payload)
def setUp(self):
    """Create two documents and a corpus holding both."""
    short_doc = Document({"id": "1", "body": "A stopword test."})
    long_doc = Document({"id": "2", "body": "A stopword test and a unique test."})
    self.d1 = short_doc
    self.d2 = long_doc
    self.corpus = Corpus([short_doc, long_doc])
def setUp(self):
    """Create one document whose body contains a ``<br/>`` tag."""
    payload = {
        "id": "1",
        "body": "A test of a html cleaner pipeline.<br/> Another sentence.",
    }
    self.d1 = Document(payload)
    self.docs = [self.d1]
def setUp(self):
    """Create a document whose second constructor argument is [RegexpFilter("^.$")]."""
    payload = {"id": "1", "body": "A stopword test."}
    filters = [RegexpFilter("^.$")]
    self.d1 = Document(payload, filters)