Example #1
0
    def test_munge_complex(self):
        """Tests munge.corpus_to_documents and the munged output file on a
        real-world example corpus (Don Quixote). checks:
                -every document except the last is between 250 and 500 words
                -every document except the last either ends on punctuation or is 500 words
                -every line of test_files/munged_quixote.txt has three tab-separated
                 fields, the first two matching the ids and names in self.sample_metadata"""

        # corpus_to_documents tests
        corpus = munge.import_corpus("test_files/quixote.txt")
        for doc in munge.corpus_to_documents(corpus)[:-1]:
            word_count = len(doc.split())
            # all documents (except last) are between 250 and 500 words
            self.assertGreaterEqual(word_count, 250)
            self.assertLessEqual(word_count, 500)
            # all documents (except last) either end on punctuation or are 500 words.
            # look 5 characters back to allow for trailing quotes and parentheses
            tail = doc[-5:]
            self.assertTrue(
                word_count == 500 or any(mark in tail for mark in ".!?"),
                "document neither ends on punctuation nor is 500 words long")

        # write_clean_corpus tests
        # NOTE(review): this only reads an existing file; presumably it is produced
        # by munge.write_clean_corpus during test setup -- confirm.
        quixote_meta = self.sample_metadata["quixote"]
        with open("test_files/munged_quixote.txt", "r") as in_file:
            for i, line in enumerate(in_file):
                features = line.split("\t")
                # check the field count first so a malformed line fails the
                # assertion instead of raising IndexError below
                self.assertEqual(len(features), 3)
                self.assertEqual(features[0], str(quixote_meta["ids"][i]))
                self.assertEqual(features[1], quixote_meta["names"][i])
Example #2
0
    def test_write_clean_corpus_simple(self):
        """Tests munge.write_clean_corpus on simple test files. checks:
                -every line of out file has three tab-separated fields:
                 unique_id, orig_doc_id, text
                -AssertionError is raised when list of unique_ids are not unique
                -AssertionError is raised when the id list is shorter than the
                 document and name lists"""
        angel_meta = self.sample_metadata["angel"]

        # every line has correct formatting: unique_id, orig_doc_id, text (tab-separated)
        with open("test_files/munged_angel_50.txt", "r") as in_file:
            for i, line in enumerate(in_file):
                features = line.split("\t")
                # check the field count first so a malformed line fails the
                # assertion instead of raising IndexError below
                self.assertEqual(len(features), 3)
                self.assertEqual(features[0], str(angel_meta["ids"][i]))
                self.assertEqual(features[1], angel_meta["names"][i])

        corpus = munge.import_corpus("test_files/simple_angel_50.txt")
        split_angels = munge.corpus_to_documents(corpus)

        # ids are unique
        # copy of the ids (slicing already copies) whose last entry is replaced
        # by a duplicate of the second-to-last one; same length as the original
        duplicate_ids = angel_meta["ids"][:-1]
        duplicate_ids.append(duplicate_ids[-1])
        with self.assertRaises(AssertionError):  # last id is repeated twice
            munge.write_clean_corpus(split_angels, duplicate_ids, angel_meta["names"],
                                     "test_files/angels_nonunique.txt")

        # doc names, doc ids, document lists are same length
        # dropping the duplicate leaves unique ids, but one fewer than names
        with self.assertRaises(AssertionError):  # id list is one element short
            munge.write_clean_corpus(split_angels, duplicate_ids[:-1], angel_meta["names"],
                                     "test_files/angels_nonunique.txt")
Example #3
0
 def generate_metadata(self, keys, texts):
     """Builds the per-corpus metadata used by the tests.

     keys and texts are parallel sequences: texts[i] is the path handed to
     munge.import_corpus for the corpus named keys[i]. Returns a dictionary
     mapping each key to {"text": documents, "ids": 0..n-1, "names": key + id}."""
     result = {}
     for position, corpus_name in enumerate(keys):
         imported = munge.import_corpus(texts[position])
         documents = munge.corpus_to_documents(imported)
         # ids are simply the positions of the documents within the corpus
         doc_ids = list(range(len(documents)))
         result[corpus_name] = {
             "text": documents,
             "ids": doc_ids,
             "names": [corpus_name + str(doc_id) for doc_id in doc_ids],
         }
     return result
Example #4
0
 def test_corpus_to_doc_tokens_simple(self):
     """Tests munge.corpus_to_doc_tokens on simple test corpora. checks:
     -function can handle text files and directories of text files
     -each element (document) is a sequence of between 250 and 500 tokens
     -no token contains punctuation"""
     # handles text files and directories containing txt files and other file types
     corpora = ["test_files/simple_whale_100.txt",
                "test_files/simple_angel_50.txt", "test_files/"]
     # translation table that deletes ASCII punctuation and the em dash;
     # built once because it does not change between tokens
     strip_punctuation = str.maketrans('', '', string.punctuation + "—")
     for filename in corpora:
         corpus = munge.import_corpus(filename)
         for doc in munge.corpus_to_doc_tokens(corpus):
             # every document, including the last, is between 250 and 500 tokens
             # NOTE(review): the corpus_to_documents tests skip the last
             # document; confirm the last one is not meant to be exempt here too.
             self.assertGreaterEqual(len(doc), 250)
             self.assertLessEqual(len(doc), 500)
             # no punctuation in any tokens: stripping it must leave the token unchanged
             for token in doc:
                 self.assertEqual(token, token.translate(strip_punctuation))
Example #5
0
    def test_corpus_to_documents_simple(self):
        """Tests munge.corpus_to_documents on simple test corpora. checks:
                -function can handle text files and directories of text files
                -each line (document) except the last is between 250 and 500 words
                -each line except the last either ends on punctuation or is 500 words"""

        # import_corpus handles text files and directories containing txt files and other file types
        corpora = ["test_files/simple_whale_100.txt",
                   "test_files/simple_angel_50.txt", "test_files/"]
        for filename in corpora:
            corpus = munge.import_corpus(filename)
            for doc in munge.corpus_to_documents(corpus)[:-1]:
                word_count = len(doc.split())
                # all documents (except last) are between 250 and 500 words
                self.assertGreaterEqual(word_count, 250)
                self.assertLessEqual(word_count, 500)
                # all documents (except last) either end on punctuation or are 500 words.
                # searching 5 characters back to account for extra characters like quotes and parentheses
                tail = doc[-5:]
                self.assertTrue(
                    word_count == 500 or any(mark in tail for mark in ".!?"),
                    "document neither ends on punctuation nor is 500 words long")