Exemple #1
0
    def test_pdf_extraction(self):
        """Check the title and text content extracted from ``testpdf.pdf``.

        The test is skipped when ``PdfFileReader`` is falsy, which this test
        reports as "pyPdf is not installed".
        """
        if not PdfFileReader:
            self.skipTest("pyPdf is not installed")

        # A PDF is binary data: read it in 'rb' mode so no newline translation
        # or text decoding touches the bytes, and use a context manager so the
        # file handle is closed even if the read fails.
        with open(os.path.join(my_path, 'testpdf.pdf'), 'rb') as pdf:
            extractor = BaseExtractor(pdf.read())

        self.assertEqual(extractor.get_title(), u'This is a test PDF file')

        expected_content = [
            'universal file format', 'fonts, formatting, colours and graphics',
            'regardless of the application and platform',
        ]

        content = extractor.get_content()
        for c in expected_content:
            # assertIn names the missing fragment on failure, unlike
            # assertTrue(c in content), which only reports "False is not true".
            self.assertIn(c, content)
    def test_pdf_extraction(self):
        """Verify the extractor's title and content for ``testpdf.pdf``.

        Skipped (with the message "pyPdf is not installed") when
        ``PdfFileReader`` is falsy.
        """
        if not PdfFileReader:
            self.skipTest("pyPdf is not installed")

        pdf_path = os.path.join(my_path, 'testpdf.pdf')
        # Open in binary mode because PDF content is not text, and close the
        # handle deterministically via the context manager.
        with open(pdf_path, 'rb') as pdf:
            extractor = BaseExtractor(pdf.read())

        self.assertEqual(extractor.get_title(), u'This is a test PDF file')

        expected_content = [
            'universal file format',
            'fonts, formatting, colours and graphics',
            'regardless of the application and platform',
        ]

        content = extractor.get_content()
        for c in expected_content:
            # On failure assertIn reports which fragment was not found.
            self.assertIn(c, content)
    def test_extractor(self):
        """Check the title, content and headings extracted from ``self.html``.

        Strings in ``expected_content`` / ``expected_headings`` are matched
        case-insensitively. Strings in ``unexpected_content`` are matched
        case-sensitively and must appear in neither the content nor the
        headings.
        """
        extractor = BaseExtractor(self.html)
        self.assertEqual(extractor.get_title(), u'MY TITLE')

        expected_content = [
            u'MY TITLE',
            u'Relative Link',
            u'Absolute Link',
            u'Offsite Link',
            u'THIS IS A H1 HEADER',
            u'Content in a paragraph',
            u'THIS IS A H2 HEADER',
            u'Content In A Span',
        ]
        content = extractor.get_content()
        # Lower-case the haystack once instead of on every iteration.
        content_lower = content.lower()
        for c in expected_content:
            # assertIn reports the missing string when the check fails.
            self.assertIn(c.lower(), content_lower)

        expected_headings = [
            u'THIS IS A H1 HEADER',
            u'THIS IS A H2 HEADER',
        ]

        headings = extractor.get_headings()
        headings_lower = headings.lower()
        for h in expected_headings:
            self.assertIn(h.lower(), headings_lower)

        unexpected_content = [
            'meta',
            'Content-Type',
            'stylesheet',
            'script',
            'style',
            'relative_link',
            'absolute_link',
            'offsite_link',
            'pagelocation',
            'THIS IS A SCRIPT TAG',
            'THIS_IS_A_STYLE_TAG',
        ]

        for u in unexpected_content:
            # These checks are deliberately case-sensitive, as in the
            # original: the raw (not lower-cased) text is searched.
            self.assertNotIn(u, content)
            self.assertNotIn(u, headings)
Exemple #4
0
    def test_extractor(self):
        """Verify title, content and headings extraction for ``self.html``.

        Expected strings are compared case-insensitively against the
        extracted content and headings. The strings in ``unexpected_content``
        are compared case-sensitively and must be absent from both.
        """
        extractor = BaseExtractor(self.html)
        self.assertEqual(extractor.get_title(), u'MY TITLE')

        expected_content = [
            u'MY TITLE',
            u'Relative Link',
            u'Absolute Link',
            u'Offsite Link',
            u'THIS IS A H1 HEADER',
            u'Content in a paragraph',
            u'THIS IS A H2 HEADER',
            u'Content In A Span',
        ]
        content = extractor.get_content()
        # Compute the lower-cased text once; it does not change in the loop.
        lowered_content = content.lower()
        for c in expected_content:
            # assertIn shows the missing string in the failure message.
            self.assertIn(c.lower(), lowered_content)

        expected_headings = [
            u'THIS IS A H1 HEADER',
            u'THIS IS A H2 HEADER',
        ]

        headings = extractor.get_headings()
        lowered_headings = headings.lower()
        for h in expected_headings:
            self.assertIn(h.lower(), lowered_headings)

        unexpected_content = [
            'meta', 'Content-Type', 'stylesheet', 'script', 'style',
            'relative_link', 'absolute_link', 'offsite_link', 'pagelocation',
            'THIS IS A SCRIPT TAG', 'THIS_IS_A_STYLE_TAG',
        ]

        for u in unexpected_content:
            # Case-sensitive on purpose: the original searched the raw text.
            self.assertNotIn(u, content)
            self.assertNotIn(u, headings)