def testPdfSearchNoResult(self):
     """Check search when there's no result"""
     pdf_processor = PdfProcessor(pdf_file_name)
     result = pdf_processor.search(query='camembert')
     self.assertEqual(result['max_reached'], 0)
     self.assertEqual(len(result['file_position']['results']),
                      0)  # number of search results
    def testPdfSearchBooleanAnd2(self):
        """Check regular boolean search with 2 terms"""
        pdf_processor = PdfProcessor(pdf_file_name)

        result1 = pdf_processor.search(query='multivio and context')
        self.assertEqual(result1['max_reached'], 0)
        n1 = len(result1['file_position']['results'])
        self.assertEqual(n1, 7)  # number of search results

        result2 = pdf_processor.search(query='context and multivio')
        self.assertEqual(result2['max_reached'], 0)
        n2 = len(result2['file_position']['results'])
        self.assertEqual(n2, n1)  # number of search results

        # both search results should be the same (but not necessarily in the same order)
        for res in result2['file_position']['results']:
            self.assertTrue(res in result1['file_position']['results'])

        # there should be no result on page 2
        result0 = pdf_processor.search(query='context and multivio',
                                       from_=2,
                                       to_=2)
        self.assertEqual(result0['max_reached'], 0)
        n0 = len(result0['file_position']['results'])
        self.assertEqual(n0, 0)  # number of search results
    def testPdfSearchRotation(self):
        """Check search results and context with different rotation angles"""
        pdf_processor = PdfProcessor(pdf_file_name)
        result1 = pdf_processor.search(query='user', angle= 0,  context_size=15)
        result2 = pdf_processor.search(query='user', angle= 90, context_size=15)
        result3 = pdf_processor.search(query='user', angle=180, context_size=15)
        result4 = pdf_processor.search(query='user', angle=270, context_size=15)

        # number of results should always be the same
        n1 = len(result1['file_position']['results'])
        n2 = len(result2['file_position']['results'])
        n3 = len(result3['file_position']['results'])
        n4 = len(result4['file_position']['results'])

        # context preview should be the same everywhere
        c1 = result1['file_position']['results'][0]['preview']
        c2 = result2['file_position']['results'][0]['preview']
        c3 = result3['file_position']['results'][0]['preview']
        c4 = result4['file_position']['results'][0]['preview']

        self.assertEqual(n1, n2)
        self.assertEqual(n2, n3)
        self.assertEqual(n3, n4)
        self.assertEqual(c1, c2)
        self.assertEqual(c2, c3)
        self.assertEqual(c3, c4)
 def testPdfGetPageIndexing(self):
     """Check page indexing"""
     pdf_processor = PdfProcessor(pdf_file_name)
     result = pdf_processor.get_indexing(index={'page_number':2})
     self.assertEqual(result.has_key('pages'), True)
     self.assertEqual(len(result['pages']), 1)
     # NOTE: 'pages' is a dictionary with page number as key (here this number is 2, as defined above)
     self.assertEqual(result['pages'][2].has_key('lines'), True)
     self.assertEqual(len(result['pages'][2]['lines']), 39) # number of lines on page
     self.assertEqual(len(result['pages'][2]['lines'][0]['x']), 13) # number of words on page
 def testPdfGetPageIndexing(self):
     """Check page indexing"""
     pdf_processor = PdfProcessor(pdf_file_name)
     result = pdf_processor.get_indexing(index={'page_number': 2})
     self.assertEqual(result.has_key('pages'), True)
     self.assertEqual(len(result['pages']), 1)
     # NOTE: 'pages' is a dictionary with page number as key (here this number is 2, as defined above)
     self.assertEqual(result['pages'][2].has_key('lines'), True)
     self.assertEqual(len(result['pages'][2]['lines']),
                      39)  # number of lines on page
     self.assertEqual(len(result['pages'][2]['lines'][0]['x']),
                      13)  # number of words on page
 def testPdfSearch(self):
     """Check regular search"""
     pdf_processor = PdfProcessor(pdf_file_name)
     result = pdf_processor.search(query='multivio', context_size=12)
     self.assertEqual(result.has_key('file_position'), True)
     self.assertEqual(len(result['file_position']['results']), 9) # number of search results
     first_result = result['file_position']['results'][0]['index']['bounding_box']
     self.assertAlmostEqual(first_result['x1'], 196)
     self.assertAlmostEqual(first_result['y1'], 166)
     self.assertAlmostEqual(first_result['x2'], 255)
     self.assertAlmostEqual(first_result['y2'], 181)
     # checking context preview. It should contain the searched term 'multivio'
     first_result = result['file_position']['results'][0]
     self.assertNotEqual(first_result['preview'].lower().find('multivio'), -1)
    def testPdfSearchMaxNumberOfResults(self):
        """Check search with a limit on the number of search results"""
        pdf_processor = PdfProcessor(pdf_file_name)
        result = pdf_processor.search(query='e', max_results=1)
        self.assertEqual(result['max_reached'], 1)
        self.assertEqual(len(result['file_position']['results']), 1) # number of search results

        result = pdf_processor.search(query='e', max_results=14)
        self.assertEqual(result['max_reached'], 14)
        self.assertEqual(len(result['file_position']['results']), 14) # number of search results

        result = pdf_processor.search(query='e', max_results=88)
        self.assertEqual(result['max_reached'], 88)
        self.assertEqual(len(result['file_position']['results']), 88) # number of search results
 def testPdfGetText(self):
     """Check text extraction"""
     pdf_processor = PdfProcessor(pdf_file_name)
     result = pdf_processor.get_text(
         index={
             'page_number': 1,
             'bounding_box': {
                 'x1': 147,
                 'x2': 239,
                 'y1': 260,
                 'y2': 277
             }
         })
     self.assertEqual(result.has_key('text'), True)
     self.assertEqual(result['text'], u'Introduction')
 def testPdfSearch(self):
     """Check regular search"""
     pdf_processor = PdfProcessor(pdf_file_name)
     result = pdf_processor.search(query='multivio', context_size=12)
     self.assertEqual(result.has_key('file_position'), True)
     self.assertEqual(len(result['file_position']['results']),
                      9)  # number of search results
     first_result = result['file_position']['results'][0]['index'][
         'bounding_box']
     self.assertAlmostEqual(first_result['x1'], 196)
     self.assertAlmostEqual(first_result['y1'], 166)
     self.assertAlmostEqual(first_result['x2'], 255)
     self.assertAlmostEqual(first_result['y2'], 181)
     # checking context preview. It should contain the searched term 'multivio'
     first_result = result['file_position']['results'][0]
     self.assertNotEqual(first_result['preview'].lower().find('multivio'),
                         -1)
    def testPdfSearchBooleanAndNone(self):
        """Check boolean with missing second term"""
        pdf_processor = PdfProcessor(pdf_file_name)

        result1 = pdf_processor.search(query='multivio and')
        self.assertEqual(result1['max_reached'], 0)
        n1 = len(result1['file_position']['results'])
        self.assertEqual(n1, 9)  # number of search results

        result2 = pdf_processor.search(query='multivio')
        self.assertEqual(result2['max_reached'], 0)
        n2 = len(result2['file_position']['results'])
        self.assertEqual(n2, n1)  # number of search results

        # both search results should be the same (but not necessarily in the same order)
        for res in result2['file_position']['results']:
            self.assertTrue(res in result1['file_position']['results'])
    def testPdfSearchMaxNumberOfResults(self):
        """Check search with a limit on the number of search results"""
        pdf_processor = PdfProcessor(pdf_file_name)
        result = pdf_processor.search(query='e', max_results=1)
        self.assertEqual(result['max_reached'], 1)
        self.assertEqual(len(result['file_position']['results']),
                         1)  # number of search results

        result = pdf_processor.search(query='e', max_results=14)
        self.assertEqual(result['max_reached'], 14)
        self.assertEqual(len(result['file_position']['results']),
                         14)  # number of search results

        result = pdf_processor.search(query='e', max_results=88)
        self.assertEqual(result['max_reached'], 88)
        self.assertEqual(len(result['file_position']['results']),
                         88)  # number of search results
Exemple #12
0
    def testPdfSearchBooleanAndNone(self):
        """Check boolean with missing second term"""
        pdf_processor = PdfProcessor(pdf_file_name)
        
        result1 = pdf_processor.search(query='multivio and')
        self.assertEqual(result1['max_reached'], 0)
        n1 = len(result1['file_position']['results'])
        self.assertEqual(n1, 9) # number of search results

        result2 = pdf_processor.search(query='multivio')
        self.assertEqual(result2['max_reached'], 0)
        n2 = len(result2['file_position']['results'])
        self.assertEqual(n2, n1) # number of search results

        # both search results should be the same (but not necessarily in the same order)
        for res in result2['file_position']['results']:
            self.assertTrue(res in result1['file_position']['results'])
    def testPdfSearchBooleanAnd34(self):
        """Check regular boolean search with 3 and 4 terms"""
        pdf_processor = PdfProcessor(pdf_file_name)

        ## 3 terms
        result1 = pdf_processor.search(
            query='multivio and context and introduction')
        self.assertEqual(result1['max_reached'], 0)
        n1 = len(result1['file_position']['results'])
        self.assertEqual(n1, 5)  # number of search results

        result2 = pdf_processor.search(
            query='context and multivio and introduction')
        self.assertEqual(result2['max_reached'], 0)
        n2 = len(result2['file_position']['results'])
        self.assertEqual(n1, n2)  # number of search results

        # both search results should be the same (but not necessarily in the same order)
        for res in result2['file_position']['results']:
            self.assertTrue(res in result1['file_position']['results'])

        # there should be no result on pages 2 and 3
        result0 = pdf_processor.search(
            query='context and multivio and introduction', from_=2, to_=3)
        self.assertEqual(result0['max_reached'], 0)
        n0 = len(result0['file_position']['results'])
        self.assertEqual(n0, 0)  # number of search results

        ## 4 terms
        result1 = pdf_processor.search(
            query='multivio and context and introduction and software')
        self.assertEqual(result1['max_reached'], 0)
        n1 = len(result1['file_position']['results'])
        self.assertEqual(n1, 9)  # number of search results

        result2 = pdf_processor.search(
            query='context and software and multivio and introduction')
        self.assertEqual(result2['max_reached'], 0)
        n2 = len(result2['file_position']['results'])
        self.assertEqual(n1, n2)  # number of search results

        # both search results should be the same (but not necessarily in the same order)
        for res in result2['file_position']['results']:
            self.assertTrue(res in result1['file_position']['results'])

        # there should be no result on pages 2 and 3
        result0 = pdf_processor.search(
            query='context and software and multivio and introduction',
            from_=2,
            to_=3)
        self.assertEqual(result0['max_reached'], 0)
        n0 = len(result0['file_position']['results'])
        self.assertEqual(n0, 0)  # number of search results
    def testPdfSearchRotation(self):
        """Check search results and context with different rotation angles"""
        pdf_processor = PdfProcessor(pdf_file_name)
        result1 = pdf_processor.search(query='user', angle=0, context_size=15)
        result2 = pdf_processor.search(query='user', angle=90, context_size=15)
        result3 = pdf_processor.search(query='user',
                                       angle=180,
                                       context_size=15)
        result4 = pdf_processor.search(query='user',
                                       angle=270,
                                       context_size=15)

        # number of results should always be the same
        n1 = len(result1['file_position']['results'])
        n2 = len(result2['file_position']['results'])
        n3 = len(result3['file_position']['results'])
        n4 = len(result4['file_position']['results'])

        # context preview should be the same everywhere
        c1 = result1['file_position']['results'][0]['preview']
        c2 = result2['file_position']['results'][0]['preview']
        c3 = result3['file_position']['results'][0]['preview']
        c4 = result4['file_position']['results'][0]['preview']

        self.assertEqual(n1, n2)
        self.assertEqual(n2, n3)
        self.assertEqual(n3, n4)
        self.assertEqual(c1, c2)
        self.assertEqual(c2, c3)
        self.assertEqual(c3, c4)
Exemple #15
0
    def testPdfSearchBooleanAnd2(self):
        """Check regular boolean search with 2 terms"""
        pdf_processor = PdfProcessor(pdf_file_name)
        
        result1 = pdf_processor.search(query='multivio and context')
        self.assertEqual(result1['max_reached'], 0)
        n1 = len(result1['file_position']['results'])
        self.assertEqual(n1, 7) # number of search results

        result2 = pdf_processor.search(query='context and multivio')
        self.assertEqual(result2['max_reached'], 0)
        n2 = len(result2['file_position']['results'])
        self.assertEqual(n2, n1) # number of search results

        # both search results should be the same (but not necessarily in the same order)
        for res in result2['file_position']['results']:
            self.assertTrue(res in result1['file_position']['results'])
        
        # there should be no result on page 2
        result0 = pdf_processor.search(query='context and multivio', from_=2, to_=2)
        self.assertEqual(result0['max_reached'], 0)
        n0 = len(result0['file_position']['results'])
        self.assertEqual(n0, 0) # number of search results
Exemple #16
0
    def testPdfSearchBooleanAnd34(self):
        """Check regular boolean search with 3 and 4 terms"""
        pdf_processor = PdfProcessor(pdf_file_name)
        
        
        ## 3 terms
        result1 = pdf_processor.search(query='multivio and context and introduction')
        self.assertEqual(result1['max_reached'], 0)
        n1 = len(result1['file_position']['results'])
        self.assertEqual(n1, 5) # number of search results

        result2 = pdf_processor.search(query='context and multivio and introduction')
        self.assertEqual(result2['max_reached'], 0)
        n2 = len(result2['file_position']['results'])
        self.assertEqual(n1, n2) # number of search results
        
        
        # both search results should be the same (but not necessarily in the same order)
        for res in result2['file_position']['results']:
            self.assertTrue(res in result1['file_position']['results'])
        
        # there should be no result on pages 2 and 3
        result0 = pdf_processor.search(query='context and multivio and introduction', from_=2, to_=3)
        self.assertEqual(result0['max_reached'], 0)
        n0 = len(result0['file_position']['results'])
        self.assertEqual(n0, 0) # number of search results
        
        
        ## 4 terms
        result1 = pdf_processor.search(query='multivio and context and introduction and software')
        self.assertEqual(result1['max_reached'], 0)
        n1 = len(result1['file_position']['results'])
        self.assertEqual(n1, 9) # number of search results

        result2 = pdf_processor.search(query='context and software and multivio and introduction')
        self.assertEqual(result2['max_reached'], 0)
        n2 = len(result2['file_position']['results'])
        self.assertEqual(n1, n2) # number of search results
        
        # both search results should be the same (but not necessarily in the same order)
        for res in result2['file_position']['results']:
            self.assertTrue(res in result1['file_position']['results'])

        # there should be no result on pages 2 and 3
        result0 = pdf_processor.search(query='context and software and multivio and introduction', from_=2, to_=3)
        self.assertEqual(result0['max_reached'], 0)
        n0 = len(result0['file_position']['results'])
        self.assertEqual(n0, 0) # number of search results        
Exemple #17
0
 def testPdfSearchNoResult(self):
     """Check search when there's no result"""
     pdf_processor = PdfProcessor(pdf_file_name)
     result = pdf_processor.search(query='camembert')
     self.assertEqual(result['max_reached'], 0)
     self.assertEqual(len(result['file_position']['results']), 0) # number of search results
 def testPdfProcessor(self):
     """Check PdfProcessor instance."""
     pdf_processor = PdfProcessor(pdf_file_name)
     self.assert_(pdf_processor, "Can not create simple Parser Object")
 def testPdfProcess(self):
     """Check Page extraction."""
     pdf_processor = PdfProcessor(pdf_file_name)
     (mime, content) = pdf_processor.render(index={'page_number': 1},
                                            max_output_size=(100, 100))
Exemple #20
0
 def testPdfGetText(self):
     """Check text extraction"""
     pdf_processor = PdfProcessor(pdf_file_name)
     result = pdf_processor.get_text(index={'page_number':1, 'bounding_box':{'x1':147,'x2':239,'y1':260,'y2':277}})
     self.assertEqual(result.has_key('text'), True)
     self.assertEqual(result['text'], u'Introduction')
Exemple #21
0
 def testPdfProcess(self):
     """Check Page extraction."""
     pdf_processor = PdfProcessor(pdf_file_name)
     (mime, content) = pdf_processor.render(index={'page_number':1},
     max_output_size = (100,100))