Beispiel #1
0
    def test_pdf_extraction(self):
        """Check the title and text content extracted from ``testpdf.pdf``.

        The test is skipped when ``PdfFileReader`` is falsy (the skip
        message reports that pyPdf is not installed).
        """
        if not PdfFileReader:
            self.skipTest("pyPdf is not installed")

        # PDF is binary data: read it in 'rb' mode so no newline translation
        # or text decoding is applied, and use a context manager so the file
        # handle is always closed, even if reading raises.
        with open(os.path.join(my_path, 'testpdf.pdf'), 'rb') as pdf:
            pdf_data = pdf.read()

        extractor = BaseExtractor(pdf_data)
        self.assertEqual(extractor.get_title(), u'This is a test PDF file')

        expected_content = [
            'universal file format', 'fonts, formatting, colours and graphics',
            'regardless of the application and platform'
        ]

        content = extractor.get_content()
        for c in expected_content:
            # assertIn reports both operands on failure, unlike assertTrue.
            self.assertIn(c, content)
Beispiel #2
0
    def test_extractor(self):
        """Check title, content and headings extracted from ``self.html``.

        Verifies that:
        * the title equals ``u'MY TITLE'``;
        * every expected text fragment appears in the content
          (compared case-insensitively);
        * every expected heading appears in the headings
          (compared case-insensitively);
        * none of the markup/attribute/script/style fragments appear in
          either the content or the headings (compared case-sensitively).
        """
        extractor = BaseExtractor(self.html)
        self.assertEqual(extractor.get_title(), u'MY TITLE')

        expected_content = [
            u'MY TITLE',
            u'Relative Link',
            u'Absolute Link',
            u'Offsite Link',
            u'THIS IS A H1 HEADER',
            u'Content in a paragraph',
            u'THIS IS A H2 HEADER',
            u'Content In A Span',
        ]
        content = extractor.get_content()
        # Lower-case once instead of on every loop iteration.
        content_lower = content.lower()
        for c in expected_content:
            self.assertIn(c.lower(), content_lower)

        expected_headings = [
            u'THIS IS A H1 HEADER',
            u'THIS IS A H2 HEADER',
        ]

        headings = extractor.get_headings()
        headings_lower = headings.lower()
        for h in expected_headings:
            self.assertIn(h.lower(), headings_lower)

        unexpected_content = [
            'meta', 'Content-Type', 'stylesheet', 'script', 'style',
            'relative_link', 'absolute_link', 'offsite_link', 'pagelocation',
            'THIS IS A SCRIPT TAG', 'THIS_IS_A_STYLE_TAG'
        ]

        # These negative checks are case-sensitive and run against the
        # original (not lower-cased) extracted text.
        for u in unexpected_content:
            self.assertNotIn(u, content)
            self.assertNotIn(u, headings)