Esempio n. 1
0
 def test_read_pdf_as_frame(self):
     """Download a PDF and check that it is read into a non-trivial frame.

     The PDF is fetched through ``Storage.download`` into the ``../_data``
     directory next to this test file, parsed with
     ``PDFReader.read_to_frame``, and the result must have a length
     greater than 1.
     """
     path = os.path.join(os.path.dirname(__file__), "../_data")
     storage = Storage(path)
     url = "https://global.toyota/pages/global_toyota/ir/library/annual/2019_001_annual_jp.pdf"
     file_path = storage.download(url, "./test_pdf.pdf")
     try:
         reader = PDFReader()
         df = reader.read_to_frame(file_path)
         self.assertGreater(len(df), 1)
     finally:
         # Remove the downloaded file even when parsing or the assertion
         # fails, so a failed run does not leave the PDF behind.
         os.remove(file_path)
Esempio n. 2
0
 def test_download(self):
     """Check that ``Storage.download`` saves a file under the storage root.

     Downloads an image to ``raw/image.png`` relative to the storage root
     and verifies that the returned path exists and resolves to the same
     location as ``<root>/raw/image.png``.
     """
     url = "https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png"
     root = os.path.join(os.path.dirname(__file__), "./data")
     storage = Storage(root)
     path = storage.download(url, "raw/image.png")
     try:
         self.assertTrue(os.path.exists(path))
         correct_path = os.path.join(root, "raw/image.png")
         self.assertEqual(resolve(path), resolve(correct_path))
     finally:
         # Clean up the downloaded file even if an assertion above fails.
         # If the file was never created, os.remove raises here, exactly
         # as the original unconditional cleanup would have.
         os.remove(path)
Esempio n. 3
0
 def test_preprocess(self):
     """Download a PDF, read it to a frame, and preprocess that frame.

     The frame returned by ``PDFReader.read_to_frame`` is passed through
     ``PDFReader.preprocess_frame``; the preprocessed result is written to
     ``sample.csv`` and must have a length greater than 1.
     """
     path = os.path.join(os.path.dirname(__file__), "../_data")
     storage = Storage(path)
     url = "https://global.toyota/pages/global_toyota/ir/library/annual/2019_001_annual_en.pdf"
     file_path = storage.download(url, "./test_pdf_preprocess.pdf")
     try:
         reader = PDFReader()
         df = reader.read_to_frame(file_path)
         df = reader.preprocess_frame(df)
         # NOTE(review): this writes sample.csv into the current working
         # directory and never removes it; kept as-is in case it is used
         # for manual inspection -- confirm whether it should be cleaned up.
         df.to_csv("sample.csv", index=False)
         self.assertGreater(len(df), 1)
     finally:
         # Remove the downloaded PDF even when reading, preprocessing or
         # the assertion fails.
         os.remove(file_path)
Esempio n. 4
0
    def test_read_pdf_text(self):
        """Check ``PDFReader.read`` in both HTML and default modes.

        With ``html=True`` the output must begin with ``<html>``; with the
        default arguments the first six characters must contain "目次".
        """
        path = os.path.join(os.path.dirname(__file__), "../_data")
        storage = Storage(path)
        url = "https://global.toyota/pages/global_toyota/ir/library/annual/2019_001_annual_jp.pdf"
        file_path = storage.download(url, "./test_pdf.pdf")
        try:
            reader = PDFReader()
            text = reader.read(file_path, html=True)
            self.assertEqual(text[:6], "<html>")

            text = reader.read(file_path)
            # assertIn reports both operands on failure, unlike
            # assertTrue(x in y) which only reports "False is not true".
            self.assertIn("目次", text[:6])
        finally:
            # Remove the downloaded file even when a read or an assertion
            # fails, so a failed run does not leave the PDF behind.
            os.remove(file_path)