def test_munge_complex(self):
    """Tests munge.corpus_to_documents and munge.write_clean_corpus on a
    real-world example corpus (Don Quixote). Applies the same checks as
    test_corpus_to_documents_simple and test_write_clean_corpus_simple."""
    # corpus_to_documents tests
    corpus = munge.import_corpus("test_files/quixote.txt")
    for doc in munge.corpus_to_documents(corpus)[:-1]:
        # all documents (except the last) are between 250 and 500 words
        self.assertTrue(len(doc.split()) >= 250)
        self.assertTrue(len(doc.split()) <= 500)
        # all documents (except the last) either end on punctuation or are
        # exactly 500 words; search the last 5 characters to accommodate
        # trailing characters like quotes and parentheses
        self.assertTrue(
            len(doc.split()) == 500
            or "." in doc[-5:]
            or "!" in doc[-5:]
            or "?" in doc[-5:])

    # write_clean_corpus tests
    with open("test_files/munged_quixote.txt", "r") as in_file:
        for i, line in enumerate(in_file):
            features = line.split("\t")
            self.assertEqual(
                features[0], str(self.sample_metadata["quixote"]["ids"][i]))
            self.assertEqual(
                features[1], self.sample_metadata["quixote"]["names"][i])
            self.assertEqual(len(features), 3)

def test_write_clean_corpus_simple(self):
    """Tests munge.write_clean_corpus on simple test files.
    checks:
    -every line of the out file has the format <unique_id>\t<orig_doc_id>\t<text>
    -AssertionError is raised when the list of unique_ids is not unique
    -AssertionError is raised when doc names, doc ids, and document lists
     differ in length"""
    # every line has correct formatting: <unique_id>\t<orig_doc_id>\t<text>
    with open("test_files/munged_angel_50.txt", "r") as in_file:
        for i, line in enumerate(in_file):
            features = line.split("\t")
            self.assertEqual(
                features[0], str(self.sample_metadata["angel"]["ids"][i]))
            self.assertEqual(
                features[1], self.sample_metadata["angel"]["names"][i])
            self.assertEqual(len(features), 3)

    # ids must be unique
    corpus = munge.import_corpus("test_files/simple_angel_50.txt")
    split_angels = munge.corpus_to_documents(corpus)
    # slicing copies the list, so the fixture itself is not mutated
    angel_ids_shallow = self.sample_metadata["angel"]["ids"][:-1]
    angel_ids_shallow.append(angel_ids_shallow[-1])  # last id now appears twice
    with self.assertRaises(AssertionError):
        munge.write_clean_corpus(split_angels, angel_ids_shallow,
                                 self.sample_metadata["angel"]["names"],
                                 "test_files/angels_nonunique.txt")

    # doc names, doc ids, and document lists must be the same length
    with self.assertRaises(AssertionError):
        # ids list is one element shorter than the documents and names
        munge.write_clean_corpus(split_angels, angel_ids_shallow[:-1],
                                 self.sample_metadata["angel"]["names"],
                                 "test_files/angels_nonunique.txt")

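# A minimal illustration of the <unique_id>\t<orig_doc_id>\t<text> line format
# that the checks above expect. This helper is hypothetical (not part of the
# original suite); it exists only to name the three tab-separated fields that
# the len(features) == 3 assertion relies on.
@staticmethod
def _parse_munged_line(line):
    """Splits one line of a munged corpus file into its three fields, e.g.
    "0\tangel0\tAnd the angel said..." -> ("0", "angel0", "And the angel
    said...")."""
    unique_id, orig_doc_id, text = line.rstrip("\n").split("\t", 2)
    return unique_id, orig_doc_id, text
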
def generate_metadata(self, keys, texts):
    """Creates a dictionary mapping each corpus key in keys to a dictionary
    of the documents, ids, and names generated for that corpus."""
    metadata = {}
    for i, corpus in enumerate(keys):
        text = munge.corpus_to_documents(munge.import_corpus(texts[i]))
        ids = list(range(len(text)))
        names = [corpus + str(x) for x in ids]
        metadata[corpus] = {"text": text, "ids": ids, "names": names}
    return metadata

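# A minimal sketch of a setUp that could produce the fixtures the tests above
# rely on: it builds self.sample_metadata with generate_metadata and writes
# the munged files that test_write_clean_corpus_simple and test_munge_complex
# read. The pairing of keys to files mirrors the paths used in the tests, but
# the fixture-writing step itself is an assumption, not part of the original
# suite.
def setUp(self):
    keys = ["angel", "quixote"]
    texts = ["test_files/simple_angel_50.txt", "test_files/quixote.txt"]
    out_files = ["test_files/munged_angel_50.txt",
                 "test_files/munged_quixote.txt"]
    self.sample_metadata = self.generate_metadata(keys, texts)
    for key, out_path in zip(keys, out_files):
        meta = self.sample_metadata[key]
        munge.write_clean_corpus(
            meta["text"], meta["ids"], meta["names"], out_path)
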
def test_corpus_to_documents_simple(self):
    """Tests munge.corpus_to_documents on simple test corpora.
    checks:
    -function can handle text files and directories of text files
    -each line (document) is between 250 and 500 words
    -each line either ends on punctuation or is 500 words"""
    # import_corpus handles single text files as well as directories
    # containing txt files and other file types
    corpora = ["test_files/simple_whale_100.txt",
               "test_files/simple_angel_50.txt",
               "test_files/"]
    for filename in corpora:
        corpus = munge.import_corpus(filename)
        for doc in munge.corpus_to_documents(corpus)[:-1]:
            # all documents (except the last) are between 250 and 500 words
            self.assertTrue(len(doc.split()) >= 250)
            self.assertTrue(len(doc.split()) <= 500)
            # all documents (except the last) either end on punctuation or
            # are exactly 500 words; search the last 5 characters to account
            # for trailing characters like quotes and parentheses
            self.assertTrue(
                len(doc.split()) == 500
                or "." in doc[-5:]
                or "!" in doc[-5:]
                or "?" in doc[-5:])

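# The length and end-of-document checks above also appear verbatim in
# test_munge_complex. A shared helper like this sketch (hypothetical, not in
# the original suite) would let both tests assert the same invariants in one
# place.
def _assert_document_invariants(self, doc):
    """Asserts that a non-final document is 250-500 words and either ends
    on punctuation (within its last 5 characters) or is exactly 500 words."""
    self.assertTrue(250 <= len(doc.split()) <= 500)
    self.assertTrue(
        len(doc.split()) == 500
        or any(p in doc[-5:] for p in ".!?"))
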
def _make_mallet_model(self, corpus_filepath, path_to_mallet,
                       remove_stopwords, corpus_language, num_topics,
                       **kwargs):
    """Returns a gensim-created topic model (class LdaMallet) and assigns
    the class attributes _docs (an OrderedDict of the preprocessed corpus
    documents), _full_docs (an OrderedDict of the unprocessed documents),
    and _vocabulary (the corpus vocabulary, an iterable of str). Lowercases
    all words in the corpus and removes stopwords if remove_stopwords is
    True. The keys of both document dictionaries are unique document ids of
    the format "doc<i>", where <i> is the index of the document in the
    corpus."""
    munged_corpus = munge.corpus_to_doc_tokens(corpus_filepath)
    # make the corpus lowercase and, if requested, remove stopwords
    if remove_stopwords:
        stop_words = set(stopwords.words(corpus_language))
        prepped_corpus = [
            [word.lower() for word in doc if word.lower() not in stop_words]
            for doc in munged_corpus]
    else:
        prepped_corpus = [[word.lower() for word in doc]
                          for doc in munged_corpus]
    # TODO (7/12/19 faunam): make lowercasing the corpus optional
    id_to_word = corpora.Dictionary(prepped_corpus)
    term_document_frequency = [
        id_to_word.doc2bow(doc) for doc in prepped_corpus]
    mallet_model = LdaMallet(path_to_mallet, corpus=term_document_frequency,
                             id2word=id_to_word, num_topics=num_topics,
                             **kwargs)
    docs = OrderedDict(("doc" + str(i), " ".join(doc))
                       for i, doc in enumerate(prepped_corpus))
    full_corpus = munge.corpus_to_documents(corpus_filepath)
    full_docs = OrderedDict(("doc" + str(i), doc)
                            for i, doc in enumerate(full_corpus))
    self._docs = docs
    self._full_docs = full_docs
    self._vocabulary = list(id_to_word.values())
    return mallet_model
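
# Hedged usage sketch (not part of the original module): one way
# _make_mallet_model might be invoked. The mallet path is an assumption about
# the local install, and `iterations` is a standard gensim LdaMallet keyword
# passed through via **kwargs.
def _example_make_mallet_model(self):
    return self._make_mallet_model(
        "test_files/munged_quixote.txt",  # cleaned corpus written by munge
        "mallet/bin/mallet",              # assumed path to the mallet binary
        remove_stopwords=True,
        corpus_language="english",
        num_topics=20,
        iterations=500)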