# Module-level imports this method depends on:
import os
from glob import glob
from time import sleep

def __init__(self):
    # Assumes self.master_dir is set elsewhere (e.g., as a class attribute);
    # keep any vectors loaded earlier, otherwise start with an empty map.
    self.essay_vectors = getattr(self, 'essay_vectors', {})
    for dir_ in glob(self.master_dir + "/*"):
        print "\nProcessing", dir_
        for essay in glob(dir_ + "/*"):  # essays nested in subdirs
            if essay in self.essay_vectors:
                continue  # already vectorized on an earlier pass
            print "\nDoubleChecking", essay
            doc = Document(essay, "Wil")
            # Pass just the filename, not the full path, as the first argument.
            doc.document_to_text(os.path.basename(essay), essay)
            doc.preprocess_text()
            doc.statistics()
            errors = doc.proofread()
            err_stats = {'grammar': 0, 'suggestion': 0, 'spelling': 0}
            try:
                for err in errors:
                    err_stats[err["type"]] += 1
            except TypeError:  # proofread() returns None when the essay is clean
                print "No errors!"
            # Float division so the ratio isn't truncated under Python 2.
            token_sentence_ratio = float(doc.stats['tokens']) / doc.stats['sentences']
            self.essay_vectors[essay] = [
                err_stats['grammar'],
                err_stats['suggestion'],
                err_stats['spelling'],
                token_sentence_ratio,
            ]
            print "Completed " + essay + ". Sleeping..."
            sleep(10)  # pause between essays to space out the proofread calls
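# For reference, a minimal sketch of the Document interface the constructor
# above relies on. The method names come from the calls above; the bodies,
# docstrings, and key names are inferred, not taken from Document's source.
class Document(object):
    def __init__(self, filename, username):
        self.stats = {}         # filled by statistics(); at least 'tokens' and 'sentences'
        self.preprocessed = {}  # filled by preprocess_text()

    def document_to_text(self, filename, filepath):
        """Extract raw text from the essay file."""

    def preprocess_text(self):
        """Tokenize and normalize the extracted text."""

    def statistics(self):
        """Populate self.stats with token and sentence counts."""

    def proofread(self):
        """Return a list of {'type': ...} dicts ('grammar', 'suggestion',
        or 'spelling'), or None when the essay has no errors."""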
def test_word_tokenizing(self):
    text = "This is a test sentence."
    # Write into process/, where Document presumably looks up files by bare filename.
    with open("../process/tmp_test_file.txt", "w") as test_file:
        test_file.write(text)
    d = Document("tmp_test_file.txt", "testuser")
    d.preprocess_text()
    # Five words plus the trailing period, assuming the tokenizer splits off punctuation.
    self.assertEqual(d.preprocessed['tokens'], 6,
                     "word tokenizing failed, incorrect number of tokens")
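# If more Document tests need the same scratch file, the write/remove steps
# could move into setUp/tearDown. A minimal sketch, assuming the same relative
# path as above; the class name DocumentTestCase is hypothetical.
import os
import unittest

TMP_PATH = "../process/tmp_test_file.txt"

class DocumentTestCase(unittest.TestCase):
    def setUp(self):
        # Recreate the scratch file before each test.
        with open(TMP_PATH, "w") as test_file:
            test_file.write("This is a test sentence.")

    def tearDown(self):
        # Remove it afterwards so repeated runs start clean.
        if os.path.exists(TMP_PATH):
            os.remove(TMP_PATH)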