def test_016(self):
    """ Document document setter - valid text file

    Assigning a text file to ``document`` on an empty Document should give
    a one-page document named after the file, with the file's text as the
    single entry of ``text``.
    """
    document = Document()
    document.document = "test.txt"
    try:
        self.assertEqual(document.name, "test")
        self.assertEqual(len(document), 1)
        self.assertEqual(document.text, ["foo"])
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        os.remove("test1.txt")
        # Sibling tests show a test1.json written next to test1.txt; remove
        # it if this code path produced one too (guarded: not verified here).
        if os.path.isfile("test1.json"):
            os.remove("test1.json")
def test_023(self):
    """ Document text setter

    Smoke test: item assignment on ``document.text`` must not raise.
    """
    doc = Document("files/4page.pdf", "./")
    doc.text[0] = "goo"
    # TODO: the read-back check (document.text[0] == "goo") is still disabled.

    # Remove the per-page artifacts written to the current directory.
    for page_no in range(1, 5):
        for suffix in (".pdf", ".txt", ".json"):
            os.remove("4page" + str(page_no) + suffix)
def test_064(self):
    """ config stem is valid

    Smoke test: constructing a Document with each accepted ``stem=``
    setting must not raise.
    """
    stem_settings = (
        'stem=gap',
        'stem=porter',
        'stem=snowball',
        'stem=lancaster',
        'stem=lemma',
    )
    for setting in stem_settings:
        document = Document("files/4page.pdf", "./", config=[setting])

    # Every run writes the same page files, so one cleanup pass suffices.
    for page_no in range(1, 5):
        for suffix in (".txt", ".pdf", ".json"):
            os.remove("4page" + str(page_no) + suffix)
def test_018(self):
    """ Document document setter - valid PDF file with page directory

    After setting ``dir`` and then ``document`` to a 4-page PDF, the
    document should report 4 pages and a .pdf/.txt/.json file per page
    should exist in the page directory.
    """
    document = Document()
    document.dir = "./"
    document.document = "files/4page.pdf"
    try:
        self.assertEqual(document.name, "4page")
        self.assertEqual(len(document), 4)
        for i in range(1, 5):
            self.assertTrue(os.path.isfile("4page" + str(i) + ".pdf"))
            self.assertTrue(os.path.isfile("4page" + str(i) + ".txt"))
            self.assertTrue(os.path.isfile("4page" + str(i) + ".json"))
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        for i in range(1, 5):
            os.remove("4page" + str(i) + ".pdf")
            os.remove("4page" + str(i) + ".txt")
            os.remove("4page" + str(i) + ".json")
def test_074(self):
    """ Document - lang type, page 1

    Each ``files/lang-<code>.txt`` sample should be detected as language
    ``<code>``.
    """
    for lang in ['en', 'es', 'fr', 'de', 'it']:
        document = Document("files/lang-" + lang + ".txt", "./")
        try:
            self.assertEqual(document.lang, lang)
        finally:
            # Remove this sample's page files right away so a failure on
            # one language does not leave artifacts behind.
            os.remove("lang-" + lang + "1.txt")
            os.remove("lang-" + lang + "1.json")
def test_008(self):
    """ Document constructor - keyword argument: document

    Passing the file through the ``document`` keyword should produce a
    one-page document named after the file.
    """
    document = Document(document="test.txt")
    try:
        self.assertEqual(document.name, "test")
        self.assertEqual(len(document), 1)
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        os.remove("test1.txt")
        os.remove("test1.json")
def test_011(self):
    """ Document constructor - store single page text file for raw text document

    A raw text document given a page directory should be stored as
    ``test1.txt`` in that directory.
    """
    document = Document("test.txt", "./")
    try:
        self.assertEqual(document.name, "test")
        self.assertTrue(os.path.isfile("test1.txt"))
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        os.remove("test1.txt")
        os.remove("test1.json")
def test_026(self):
    """ Document [] setter

    Assigning a Page at index 0 should make that page readable back
    through the [] getter.
    """
    document = Document("test.txt")
    try:
        page = Page(text='hello world')
        document[0] = page
        self.assertEqual(document[0].text, "hello world")
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        os.remove("test1.txt")
        # Sibling tests show Document("test.txt") also writes test1.json,
        # which this test used to leave behind; guarded in case it is absent.
        if os.path.isfile("test1.json"):
            os.remove("test1.json")
def test_067(self):
    """ config segment image

    With the ``segment`` option, the first page of the sample image is
    expected to expose 7 entries in ``words``.
    """
    document = Document('files/text.png', './', config=['segment'])
    try:
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(len(document[0].words), 7)
    finally:
        # Clean up even when the assertion fails so later tests start clean.
        os.remove('text1.png')
        os.remove('text1.txt')
        os.remove('text1.json')
def test_066(self):
    """ config segment pdf

    With the ``segment`` option, the first page of the invoice PDF is
    expected to expose 15 entries in ``words``.
    """
    document = Document('files/invoice.pdf', './', config=['segment'])
    try:
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(len(document[0].words), 15)
    finally:
        # Clean up even when the assertion fails so later tests start clean.
        os.remove('invoice1.pdf')
        os.remove('invoice1.txt')
        os.remove('invoice1.json')
def test_028(self):
    """ Document [] setter - not an int index

    Using a non-integer key with the [] setter must raise TypeError.
    """
    document = Document("test.txt")
    try:
        page = Page(text='hello world')
        with pytest.raises(TypeError):
            document['abc'] = page
    finally:
        # Clean up even when the expectation fails so later tests start clean.
        os.remove("test1.txt")
        # Sibling tests show Document("test.txt") also writes test1.json,
        # which this test used to leave behind; guarded in case it is absent.
        if os.path.isfile("test1.json"):
            os.remove("test1.json")
def test_053(self):
    """ Page path for a .txt file

    The first page of a text document should report ``./test1.txt`` as
    its path, and the page's JSON file should exist.
    """
    document = Document("test.txt")
    try:
        self.assertTrue(os.path.isfile("test1.json"))
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(document[0].path, "./test1.txt")
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        os.remove("test1.txt")
        os.remove("test1.json")
def test_012(self):
    """ Document constructor - non-ascii characters in document (UTF-8 encoding)

    The first page's text of the 7-page sample should start with
    "MEDICAL" once surrounding whitespace is stripped.
    """
    document = Document("files/7page.pdf", "./")
    try:
        self.assertEqual(document[0].text.strip()[0:7], "MEDICAL")
    finally:
        # Clean up even when the assertion fails so later tests start clean.
        for i in range(1, 8):
            os.remove("7page" + str(i) + ".pdf")
            os.remove("7page" + str(i) + ".txt")
            os.remove("7page" + str(i) + ".json")
def test_037(self):
    """ Document type getter - PDF

    A document built from a .pdf file should report its type as "pdf".
    """
    document = Document("files/4page.pdf", "./")
    try:
        self.assertEqual(document.type, "pdf")
    finally:
        # Clean up even when the assertion fails so later tests start clean.
        for i in range(1, 5):
            os.remove("4page" + str(i) + ".pdf")
            os.remove("4page" + str(i) + ".txt")
            os.remove("4page" + str(i) + ".json")
def test_035(self):
    """ Document size getter - non-zero

    The 4-page sample PDF is expected to report a size of 32667.
    """
    document = Document("files/4page.pdf", "./")
    try:
        self.assertEqual(document.size, 32667)
    finally:
        # Clean up even when the assertion fails so later tests start clean.
        for i in range(1, 5):
            os.remove("4page" + str(i) + ".pdf")
            os.remove("4page" + str(i) + ".txt")
            os.remove("4page" + str(i) + ".json")
def test_058(self):
    """ config is empty

    An empty ``config`` list must still yield a document whose
    ``bagOfWords`` is populated (not None).
    """
    document = Document("files/4page.pdf", "./", config=[])
    try:
        # Identity check against None instead of the `!= None` comparison.
        self.assertIsNotNone(document.bagOfWords)
    finally:
        # Clean up even when the assertion fails so later tests start clean.
        for i in range(1, 5):
            os.remove("4page" + str(i) + ".txt")
            os.remove("4page" + str(i) + ".pdf")
            os.remove("4page" + str(i) + ".json")
def test_051(self):
    """ async processing

    Passes ``self.done`` as the third constructor argument (presumably a
    completion callback - defined outside this view), waits a fixed six
    seconds and then expects ``self.isdone`` to be truthy.
    """
    # Keep a reference to the document for the whole wait period.
    doc = Document("files/invoice.pdf", "./", self.done)
    time.sleep(6)
    self.assertTrue(self.isdone)

    # Remove the single-page artifacts written to the current directory.
    for suffix in (".pdf", ".txt", ".json"):
        os.remove("invoice1" + suffix)
def test_025(self):
    """ Document [] getter - index out of range

    Indexing one past the last page of a 4-page document is expected to
    return None.
    """
    document = Document("files/4page.pdf", "./")
    try:
        # Identity check against None instead of an equality comparison.
        self.assertIsNone(document[4])
    finally:
        # Clean up even when the assertion fails so later tests start clean.
        for i in range(1, 5):
            os.remove("4page" + str(i) + ".pdf")
            os.remove("4page" + str(i) + ".txt")
            os.remove("4page" + str(i) + ".json")
def test_077(self):
    """ Document - French Scanned PDF

    The sample is expected to be detected as French and to report
    ``scanned`` as ``(False, 0)``.
    """
    document = Document("files/french-scan.pdf", "./")
    try:
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(document.lang, 'fr')
        self.assertEqual(document.scanned, (False, 0))
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        for i in range(1, 9):
            os.remove("french-scan" + str(i) + ".pdf")
            os.remove("french-scan" + str(i) + ".txt")
            os.remove("french-scan" + str(i) + ".json")
def test_075(self):
    """ Document - Spanish PDF

    The 2-page Spanish sample should be detected as language 'es'.
    """
    document = Document("files/spanish.pdf", "./")
    try:
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(document.lang, 'es')
        self.assertEqual(len(document), 2)
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        for i in range(1, 3):
            os.remove("spanish" + str(i) + ".pdf")
            os.remove("spanish" + str(i) + ".txt")
            os.remove("spanish" + str(i) + ".json")
def test_076(self):
    """ Document - French PDF

    The 2-page French sample should be detected as language 'fr'.
    """
    document = Document("files/french.pdf", "./")
    try:
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(document.lang, 'fr')
        self.assertEqual(len(document), 2)
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        for i in range(1, 3):
            os.remove("french" + str(i) + ".pdf")
            os.remove("french" + str(i) + ".txt")
            os.remove("french" + str(i) + ".json")
def test_054(self):
    """ bag of words / freqDist

    A processed document must expose both ``bagOfWords`` and ``freqDist``
    (neither may be None).
    """
    document = Document("files/4page.pdf", "./")
    try:
        # Identity checks against None instead of `!= None` comparisons.
        self.assertIsNotNone(document.bagOfWords)
        self.assertIsNotNone(document.freqDist)
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        for i in range(1, 5):
            os.remove("4page" + str(i) + ".txt")
            os.remove("4page" + str(i) + ".pdf")
            os.remove("4page" + str(i) + ".json")
def test_013(self):
    """ Document constructor - create page directory

    Naming a page directory in the constructor should leave that
    directory present on disk.
    """
    page_dir = "tests2"
    doc = Document("files/4page.pdf", page_dir)
    self.assertTrue(os.path.isdir(page_dir))

    # Empty the page directory first, then remove the directory itself.
    for page_no in range(1, 5):
        for suffix in (".pdf", ".txt", ".json"):
            os.remove("tests2/4page" + str(page_no) + suffix)
    os.removedirs(page_dir)
def test_071(self):
    """ config - spell checker - norvig

    Smoke test: with ``Document.WORDDICT`` set to 'norvig', building a
    document from text containing misspellings and accessing its first
    page must not raise. No output is asserted.
    """
    # Remember the class-level setting so it can be restored afterwards and
    # does not leak into other tests.
    previous_worddict = Document.WORDDICT
    with open("spell.txt", "w") as f:
        f.write("mispell speling similiar")
    try:
        Document.WORDDICT = 'norvig'
        document = Document('spell.txt')
        document[0]  # accessing the first page must not raise
    finally:
        Document.WORDDICT = previous_worddict
        # The input file is created by this test, so always remove it.
        os.remove('spell.txt')
    os.remove('spell1.txt')
    os.remove('spell1.json')
def test_041(self):
    """ Document - invoice PDF

    The invoice sample should be a single page with its .txt, .pdf and
    .json page files written to the page directory.
    """
    document = Document("files/invoice.pdf", "./")
    try:
        self.assertEqual(len(document), 1)
        self.assertTrue(os.path.isfile("invoice1.txt"))
        self.assertTrue(os.path.isfile("invoice1.pdf"))
        self.assertTrue(os.path.isfile("invoice1.json"))
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        os.remove("invoice1.txt")
        os.remove("invoice1.pdf")
        os.remove("invoice1.json")
def test_059(self):
    """ config has multiple entries

    Several ``config`` options given together must still yield a
    document whose ``bagOfWords`` is populated (not None).
    """
    document = Document("files/4page.pdf", "./", config=['bare', 'pos', 'roman'])
    try:
        # Identity check against None instead of the `!= None` comparison.
        self.assertIsNotNone(document.bagOfWords)
    finally:
        # Clean up even when the assertion fails so later tests start clean.
        for i in range(1, 5):
            os.remove("4page" + str(i) + ".txt")
            os.remove("4page" + str(i) + ".pdf")
            os.remove("4page" + str(i) + ".json")
def test_040(self):
    """ Document - color PDF with overlay

    The 5-page sample should produce five pages, each with a .txt and a
    .json page file.
    """
    document = Document("files/5page.pdf", "./")
    try:
        self.assertEqual(len(document), 5)
        for i in range(1, 6):
            self.assertTrue(os.path.isfile("5page" + str(i) + ".txt"))
            self.assertTrue(os.path.isfile("5page" + str(i) + ".json"))
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        for i in range(1, 6):
            os.remove("5page" + str(i) + ".txt")
            os.remove("5page" + str(i) + ".pdf")
            os.remove("5page" + str(i) + ".json")
def test_065(self):
    """ config segment txt

    With the ``segment`` option, the two-paragraph sample should keep its
    raw text and size, and expose 2 entries in ``words``.
    """
    document = Document('files/segment_para.txt', './', config=['segment'])
    try:
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(document[0].size, 91)
        self.assertEqual(
            document[0].text,
            'This is a first paragraph\nand continues to next line.\n\nThen this is the second\nparagraph.'
        )
        self.assertEqual(len(document[0].words), 2)
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        os.remove('segment_para1.txt')
        os.remove('segment_para1.json')
def test_022(self):
    """ Document text getter - PDF file

    ``text`` should hold one entry per page; each page's stripped text is
    checked against its expected leading characters.
    """
    document = Document("files/4page.pdf", "./")
    try:
        self.assertEqual(document.text[0].strip()[0:6], "TIER 1")
        self.assertEqual(document.text[1].strip()[0:15], "COVERED MEDICAL")
        self.assertEqual(document.text[2].strip()[0:14], "Emergency mean")
        self.assertEqual(document.text[3].strip()[0:15], "Maximum Benefit")
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        for i in range(1, 5):
            os.remove("4page" + str(i) + ".pdf")
            os.remove("4page" + str(i) + ".txt")
            os.remove("4page" + str(i) + ".json")
def test_005(self):
    """ Document Constructor - document = valid text document

    A text file passed positionally should yield a one-page document that
    remembers the source path, is named after the file, exposes the
    file's text, and writes test1.txt / test1.json.
    """
    document = Document("test.txt")
    try:
        self.assertEqual(document.document, "test.txt")
        self.assertEqual(document.name, "test")
        self.assertEqual(len(document), 1)
        self.assertEqual(document.text, ["foo"])
        self.assertTrue(os.path.isfile("test1.txt"))
        self.assertTrue(os.path.isfile("test1.json"))
    finally:
        # Clean up even when an assertion fails so later tests start clean.
        os.remove("test1.txt")
        os.remove("test1.json")