Example #1
0
 def test_016(self):
     """ Document document setter - valid text file

     Assigns "test.txt" through the ``document`` property of an empty
     Document and checks the resulting name, page count and text.
     """
     document = Document()
     document.document = "test.txt"
     self.assertEqual(document.name, "test")
     self.assertEqual(len(document), 1)
     self.assertEqual(document.text, ["foo"])
     os.remove("test1.txt")
     # Other tests that load "test.txt" find a "test1.json" next to the
     # page text; remove it here as well when present so it does not
     # linger for later tests.
     if os.path.isfile("test1.json"):
         os.remove("test1.json")
Example #2
0
 def test_023(self):
     """ Document text setter: overwrite the first entry of document.text """
     doc = Document("files/4page.pdf", "./")
     doc.text[0] = "goo"
     # TODO: the read-back assertion below is still disabled
     #self.assertEqual(doc.text[0], "goo")
     # remove the per-page files written to "./"
     for page_no in range(1, 5):
         for ext in (".pdf", ".txt", ".json"):
             os.remove("4page%d%s" % (page_no, ext))
Example #3
0
 def test_064(self):
     """ config stem is valid: build a Document once per stem option """
     # Nothing is asserted; each construction simply has to succeed.
     for stemmer in ('gap', 'porter', 'snowball', 'lancaster', 'lemma'):
         document = Document("files/4page.pdf", "./", config=['stem=' + stemmer])
     # remove the per-page files written to "./"
     for page_no in range(1, 5):
         for ext in (".txt", ".pdf", ".json"):
             os.remove("4page%d%s" % (page_no, ext))
Example #4
0
 def test_018(self):
     """ Document document setter - valid PDF file with page directory """
     doc = Document()
     doc.dir = "./"
     doc.document = "files/4page.pdf"
     self.assertEqual(doc.name, "4page")
     self.assertEqual(len(doc), 4)
     # pdf, txt and json file names for pages 1..4, in page order
     page_files = [
         "4page%d.%s" % (page_no, ext)
         for page_no in range(1, 5)
         for ext in ("pdf", "txt", "json")
     ]
     for path in page_files:
         self.assertTrue(os.path.isfile(path))
     for path in page_files:
         os.remove(path)
Example #5
0
 def test_074(self):
     """ Document - lang type, page 1

     Loads one sample text file per language code and checks that the
     Document reports that code in ``lang``; afterwards removes the
     page files written to "./".
     """
     languages = ['en', 'es', 'fr', 'de', 'it']
     for lang in languages:
         document = Document("files/lang-" + lang + ".txt", "./")
         # assertEqual, not the deprecated assertEquals alias
         # (the alias was removed in Python 3.12)
         self.assertEqual(document.lang, lang)
     for lang in languages:
         os.remove("lang-" + lang + "1.txt")
         os.remove("lang-" + lang + "1.json")
Example #6
0
 def test_008(self):
     """ Document constructor - document given as a keyword argument """
     doc = Document(document="test.txt")
     self.assertEqual(doc.name, "test")
     self.assertEqual(len(doc), 1)
     # remove the page files produced for the single page
     for leftover in ("test1.txt", "test1.json"):
         os.remove(leftover)
Example #7
0
 def test_011(self):
     """ Document constructor - raw text document is stored as a single page text file """
     doc = Document("test.txt", "./")
     self.assertEqual(doc.name, "test")
     page_text_exists = os.path.isfile("test1.txt")
     self.assertTrue(page_text_exists)
     # remove the page files produced for the single page
     for leftover in ("test1.txt", "test1.json"):
         os.remove(leftover)
Example #8
0
 def test_026(self):
     """ Document [] setter

     Replaces page 0 of a Document with a new Page and checks that the
     replacement's text is returned when indexing the document.
     """
     document = Document("test.txt")
     page = Page(text='hello world')
     document[0] = page
     self.assertEqual(document[0].text, "hello world")
     os.remove("test1.txt")
     # Other tests that load "test.txt" find a "test1.json" next to the
     # page text; remove it here as well when present so it does not
     # linger for later tests.
     if os.path.isfile("test1.json"):
         os.remove("test1.json")
Example #9
0
 def test_067(self):
     """ config segment image

     Loads a PNG with the 'segment' config option and checks the number
     of entries in the first page's ``words``.
     """
     document = Document('files/text.png', './', config=['segment'])
     # assertEqual, not the deprecated assertEquals alias
     # (the alias was removed in Python 3.12)
     self.assertEqual(len(document[0].words), 7)
     os.remove('text1.png')
     os.remove('text1.txt')
     os.remove('text1.json')
Example #10
0
 def test_066(self):
     """ config segment pdf

     Loads a one-page PDF with the 'segment' config option and checks
     the number of entries in the first page's ``words``.
     """
     document = Document('files/invoice.pdf', './', config=['segment'])
     # assertEqual, not the deprecated assertEquals alias
     # (the alias was removed in Python 3.12)
     self.assertEqual(len(document[0].words), 15)
     os.remove('invoice1.pdf')
     os.remove('invoice1.txt')
     os.remove('invoice1.json')
Example #11
0
 def test_028(self):
     """ Document [] setter - not an int index

     Assigning a Page under a string key must raise TypeError.
     """
     document = Document("test.txt")
     page = Page(text='hello world')
     with pytest.raises(TypeError):
         document['abc'] = page
     os.remove("test1.txt")
     # Other tests that load "test.txt" find a "test1.json" next to the
     # page text; remove it here as well when present so it does not
     # linger for later tests.
     if os.path.isfile("test1.json"):
         os.remove("test1.json")
Example #12
0
 def test_053(self):
     """ page.path for .txt file

     Checks that loading "test.txt" leaves a "test1.json" on disk and
     that the first page reports "./test1.txt" as its path.
     """
     document = Document("test.txt")
     self.assertTrue(os.path.isfile("test1.json"))
     # assertEqual, not the deprecated assertEquals alias
     # (the alias was removed in Python 3.12)
     self.assertEqual(document[0].path, "./test1.txt")
     os.remove("test1.txt")
     os.remove("test1.json")
Example #13
0
 def test_012(self):
     """ Document constructor - non-ascii characters in document (UTF-8 encoding) """
     doc = Document("files/7page.pdf", "./")
     first_page_start = doc[0].text.strip()[0:7]
     self.assertEqual(first_page_start, "MEDICAL")
     # remove the per-page files for pages 1..7
     for page_no in range(1, 8):
         for ext in (".pdf", ".txt", ".json"):
             os.remove("7page%d%s" % (page_no, ext))
Example #14
0
 def test_037(self):
     """ Document type getter - a PDF input reports type "pdf" """
     doc = Document("files/4page.pdf", "./")
     self.assertEqual(doc.type, "pdf")
     # remove the per-page files for pages 1..4
     for page_no in range(1, 5):
         for ext in (".pdf", ".txt", ".json"):
             os.remove("4page%d%s" % (page_no, ext))
Example #15
0
 def test_035(self):
     """ Document size getter - non-zero value for the 4-page PDF """
     doc = Document("files/4page.pdf", "./")
     expected_size = 32667
     self.assertEqual(doc.size, expected_size)
     # remove the per-page files for pages 1..4
     for page_no in range(1, 5):
         for ext in (".pdf", ".txt", ".json"):
             os.remove("4page%d%s" % (page_no, ext))
Example #16
0
 def test_058(self):
     """ config is empty

     An empty config list must still yield a Document whose
     ``bagOfWords`` is not None.
     """
     document = Document("files/4page.pdf", "./", config=[])
     # identity check against None instead of the `!= None` comparison
     self.assertIsNotNone(document.bagOfWords)
     for i in range(1, 5):
         os.remove("4page" + str(i) + ".txt")
         os.remove("4page" + str(i) + ".pdf")
         os.remove("4page" + str(i) + ".json")
Example #17
0
 def test_051(self):
     """ async processing: self.isdone must be true 6 seconds after construction """
     # self.done is handed to Document as its third positional argument
     doc = Document("files/invoice.pdf", "./", self.done)
     wait_seconds = 6
     time.sleep(wait_seconds)
     self.assertTrue(self.isdone)
     # remove the page files for the single page
     for ext in (".pdf", ".txt", ".json"):
         os.remove("invoice1" + ext)
Example #18
0
 def test_025(self):
     """ Document [] getter - index past the last page yields None """
     doc = Document("files/4page.pdf", "./")
     beyond_last = doc[4]
     self.assertEqual(beyond_last, None)
     # remove the per-page files for pages 1..4
     for page_no in range(1, 5):
         for ext in (".pdf", ".txt", ".json"):
             os.remove("4page%d%s" % (page_no, ext))
Example #19
0
 def test_077(self):
     """ Document - French Scanned PDF

     Checks the reported language and the ``scanned`` value, then
     removes the page files for pages 1..8.
     """
     document = Document("files/french-scan.pdf", "./")
     # assertEqual, not the deprecated assertEquals alias
     # (the alias was removed in Python 3.12)
     self.assertEqual(document.lang, 'fr')
     # NOTE(review): the fixture is named "scan" yet (False, 0) is the
     # expected value - confirm this is intended.
     self.assertEqual(document.scanned, (False, 0))
     for i in range(1, 9):
         os.remove("french-scan" + str(i) + ".pdf")
         os.remove("french-scan" + str(i) + ".txt")
         os.remove("french-scan" + str(i) + ".json")
Example #20
0
 def test_075(self):
     """ Document - Spanish PDF

     Checks the reported language code and the page count, then
     removes the page files for pages 1..2.
     """
     document = Document("files/spanish.pdf", "./")
     # assertEqual, not the deprecated assertEquals alias
     # (the alias was removed in Python 3.12)
     self.assertEqual(document.lang, 'es')
     self.assertEqual(len(document), 2)
     for i in range(1, 3):
         os.remove("spanish" + str(i) + ".pdf")
         os.remove("spanish" + str(i) + ".txt")
         os.remove("spanish" + str(i) + ".json")
Example #21
0
 def test_076(self):
     """ Document - French PDF

     Checks the reported language code and the page count, then
     removes the page files for pages 1..2.
     """
     document = Document("files/french.pdf", "./")
     # assertEqual, not the deprecated assertEquals alias
     # (the alias was removed in Python 3.12)
     self.assertEqual(document.lang, 'fr')
     self.assertEqual(len(document), 2)
     for i in range(1, 3):
         os.remove("french" + str(i) + ".pdf")
         os.remove("french" + str(i) + ".txt")
         os.remove("french" + str(i) + ".json")
Example #22
0
 def test_054(self):
     """ bag of words / freqDist

     Both ``bagOfWords`` and ``freqDist`` must be set (not None) on a
     Document built from the 4-page PDF.
     """
     document = Document("files/4page.pdf", "./")
     # identity checks against None instead of `!= None` comparisons
     self.assertIsNotNone(document.bagOfWords)
     self.assertIsNotNone(document.freqDist)
     for i in range(1, 5):
         os.remove("4page" + str(i) + ".txt")
         os.remove("4page" + str(i) + ".pdf")
         os.remove("4page" + str(i) + ".json")
Example #23
0
 def test_013(self):
     """ Document constructor - the page directory exists after construction """
     page_dir = "tests2"
     doc = Document("files/4page.pdf", page_dir)
     self.assertTrue(os.path.isdir("tests2"))
     # empty the directory first, then remove it
     for page_no in range(1, 5):
         for ext in (".pdf", ".txt", ".json"):
             os.remove("tests2/4page" + str(page_no) + ext)
     os.removedirs("tests2")
Example #24
0
 def test_071(self):
     """ config - spell checker - norvig

     Writes a small file of misspelled words and builds a Document from
     it while Document.WORDDICT is set to 'norvig'. Nothing is asserted:
     construction and access to the first page only have to succeed.
     """
     with open("spell.txt", "w") as f:
         f.write("mispell speling similiar")
     # WORDDICT is set on the class, so it would stay 'norvig' for every
     # test that runs afterwards; remember the previous state and put it
     # back once this test is done with it.
     had_worddict = hasattr(Document, 'WORDDICT')
     previous_worddict = getattr(Document, 'WORDDICT', None)
     Document.WORDDICT = 'norvig'
     try:
         document = Document('spell.txt')
         document[0]  # first page must be retrievable
     finally:
         if had_worddict:
             Document.WORDDICT = previous_worddict
         else:
             del Document.WORDDICT
     os.remove('spell.txt')
     os.remove('spell1.txt')
     os.remove('spell1.json')
Example #25
0
 def test_041(self):
     """ Document - invoice PDF: one page, stored as txt, pdf and json """
     doc = Document("files/invoice.pdf", "./")
     self.assertEqual(len(doc), 1)
     outputs = ("invoice1.txt", "invoice1.pdf", "invoice1.json")
     for path in outputs:
         self.assertTrue(os.path.isfile(path))
     for path in outputs:
         os.remove(path)
Example #26
0
 def test_059(self):
     """ config has multiple entries

     A config list with several options must still yield a Document
     whose ``bagOfWords`` is not None.
     """
     document = Document("files/4page.pdf",
                         "./",
                         config=['bare', 'pos', 'roman'])
     # identity check against None instead of the `!= None` comparison
     self.assertIsNotNone(document.bagOfWords)
     for i in range(1, 5):
         os.remove("4page" + str(i) + ".txt")
         os.remove("4page" + str(i) + ".pdf")
         os.remove("4page" + str(i) + ".json")
Example #27
0
 def test_040(self):
     """ Document - color PDF with overlay: five pages, each with txt and json """
     doc = Document("files/5page.pdf", "./")
     self.assertEqual(len(doc), 5)
     page_numbers = range(1, 6)
     # only the txt and json outputs are checked for existence
     for page_no in page_numbers:
         for ext in (".txt", ".json"):
             self.assertTrue(os.path.isfile("5page%d%s" % (page_no, ext)))
     # cleanup also removes the per-page pdf
     for page_no in page_numbers:
         for ext in (".txt", ".pdf", ".json"):
             os.remove("5page%d%s" % (page_no, ext))
Example #28
0
 def test_065(self):
     """ config segment txt

     Loads a two-paragraph text file with the 'segment' config option
     and checks the first page's size, its text and the number of
     entries in ``words``.
     """
     document = Document('files/segment_para.txt', './', config=['segment'])
     # assertEqual, not the deprecated assertEquals alias
     # (the alias was removed in Python 3.12)
     self.assertEqual(document[0].size, 91)
     self.assertEqual(
         document[0].text,
         'This is a first paragraph\nand continues to next line.\n\nThen this is the second\nparagraph.'
     )
     self.assertEqual(len(document[0].words), 2)
     os.remove('segment_para1.txt')
     os.remove('segment_para1.json')
Example #29
0
 def test_022(self):
     """ Document text getter - PDF file: each page starts with the expected words """
     doc = Document("files/4page.pdf", "./")
     # expected leading characters of the stripped text, one per page
     expected_starts = (
         "TIER 1",
         "COVERED MEDICAL",
         "Emergency mean",
         "Maximum Benefit",
     )
     for index, start in enumerate(expected_starts):
         self.assertEqual(doc.text[index].strip()[0:len(start)], start)
     # remove the per-page files for pages 1..4
     for page_no in range(1, 5):
         for ext in (".pdf", ".txt", ".json"):
             os.remove("4page%d%s" % (page_no, ext))
Example #30
0
 def test_005(self):
     """ Document Constructor - a valid text document passed positionally """
     doc = Document("test.txt")
     self.assertEqual(doc.document, "test.txt")
     self.assertEqual(doc.name, "test")
     self.assertEqual(len(doc), 1)
     self.assertEqual(doc.text, ["foo"])
     outputs = ("test1.txt", "test1.json")
     # both page files must exist before they are removed
     for path in outputs:
         self.assertTrue(os.path.isfile(path))
     for path in outputs:
         os.remove(path)