def test_document_find_simple(simple_document_path):
    """Search the simple document for 'Multivio' and check both hits."""
    title_hit = {
        'BBox': {
            'x1': 196.189,
            'x2': 254.84358934,
            'y1': 165.65651239999994,
            'y2': 180.96100299999992,
        },
        'page': 1,
        'text': "Multivio: Project description",
    }
    paragraph_hit = {
        'BBox': {
            'x1': 124.80199999999999,
            'x2': 161.88678224000006,
            'y1': 287.6239556,
            'y2': 296.4707444,
        },
        'page': 1,
        'text': (
            'Multivio is an Internet-based application '
            'for browsing and accessing digital doc-'
        ),
    }

    document = PDF(simple_document_path, 1)
    document.load()
    hits = document.find_text_page('Multivio')

    # The hits are compared as an ordered list: title first, paragraph second.
    assert hits == [title_hit, paragraph_hit]
def extract_text(file, outfilename=None):
    """Extract the full text of a PDF file, optionally dumping it to disk.

    :param file: path of the PDF, passed to ``PDF(path=...)``.
    :param outfilename: optional path of a file to write the text to. It is
        opened in binary mode and receives the text pieces joined by single
        spaces, encoded as UTF-8.
    :returns: the value of ``PDF.get_text_page()`` when no ``outfilename`` is
        given; the return value of the file ``write`` call when the text was
        written to ``outfilename``; the empty (falsy) text when nothing could
        be extracted.
    """
    from invenio_multivio.pdf.api import PDF

    text = []
    try:
        document = PDF(path=file)
        document.load()
        text = document.get_text_page()
    except Exception:
        # Best effort: report the failure and continue with whatever `text`
        # holds, so the "no text" branch below deals with it.
        error('text generation failed')

    if not text:
        warning('%s: do not contains text' % file)
        return text

    if not outfilename:
        return text

    # assumes `text` is an iterable of strings -- TODO confirm against
    # PDF.get_text_page()
    payload = " ".join(text).encode('utf-8')
    with open(outfilename, 'wb') as of:
        return of.write(payload)
def test_document_no_toc(document_no_toc_path, json_toc_res):
    """Check that a document without a table of contents yields ``None``.

    ``json_toc_res`` is not used by the test body; it is kept in the
    signature so the set of requested fixtures stays unchanged.
    """
    pdf = PDF(document_no_toc_path)
    pdf.load()
    # Assert on the value itself rather than branching into
    # `assert True` / `assert False`, so a failure shows what was returned.
    assert pdf.get_toc() is None
def test_document_metadata(simple_document_path):
    """Check the metadata dictionary reported for the simple document."""
    expected_metadata = {
        'title': 'Multivio: Project description',
        'creator': 'Miguel Moreira',
        'mime': 'application/pdf',
        'nPages': 3,
        'fileSize': 70909,
        'nativeSize': ((595.2760000000001, 841.89), {}),
    }

    document = PDF(simple_document_path)
    document.load()

    assert document.get_metadata() == expected_metadata
def test_document_render(simple_document_path):
    """Render a page at its own size, then rotate it by 90 degrees."""
    document = PDF(simple_document_path, 2)
    document.load()
    document.render_page(document.get_width(), document.get_height())

    assert document
    assert round(document.get_scale(), 2) == 1
    assert document.get_width() == 595
    assert document.get_height() == 841

    # The exact byte count depends on the poppler version, so only check
    # that a non-trivial amount of JPEG data was produced.
    jpeg_size = len(document.jpeg.read())
    assert jpeg_size > 100

    # After the rotation the image dimensions are expected to be swapped.
    document.rotate(90)
    rotated_size = document.pil_image.size
    assert rotated_size[0] == 841
    assert rotated_size[1] == 595
def test_document_find(simple_document_path):
    """Search the simple document for a two-word phrase and check the hit."""
    expected_hit = {
        'BBox': {
            'x1': 124.80199999999999,
            'x2': 171.2336935600001,
            'y1': 287.6239556,
            'y2': 296.4707444,
        },
        'page': 1,
        'text': (
            'Multivio is an Internet-based '
            'application for browsing and accessing digital doc-'
        ),
    }

    document = PDF(simple_document_path, 1)
    document.load()
    query = str('Multivio is')
    hits = document.find_text_page(query)

    # Exactly one hit is expected for the phrase.
    assert hits == [expected_hit]
def generate_thumbnail(filename, outfilename=None):
    """Generate a thumbnail for a given PDF file.

    The page is rendered with ``max_width=80`` and ``max_height=80``.

    :param filename: path of the PDF, passed to ``PDF(path=...)``.
    :param outfilename: optional path; when given, the rendered image is
        saved there.
    :returns: ``None`` if rendering failed; the result of the image's
        ``save`` call when ``outfilename`` is given; otherwise the object
        returned by ``PDF.render_page``.
    """
    from invenio_multivio.pdf.api import PDF

    try:
        document = PDF(path=filename, page_nr=0)
        document.load()
        thumbnail = document.render_page(max_width=80, max_height=80)
    except Exception:
        # Best effort: report the failure and signal it with None.
        error('image generation failed')
        return None

    if not outfilename:
        return thumbnail

    # NOTE(review): confirm that the object returned by `render_page`
    # accepts a `filename` keyword in `save` (PIL's `Image.save` takes `fp`).
    return thumbnail.save(filename=outfilename)
def test_document_indexing(simple_document_path):
    """Check that ``get_indexing`` returns the "NotImplemented" marker."""
    document = PDF(simple_document_path, 1)
    document.load()

    assert document.get_indexing() == "NotImplemented"
def test_document_sizes(simple_document_path):
    """Check the width and height reported by ``get_sizes``."""
    expected_sizes = {'width': 595, 'height': 841}

    document = PDF(simple_document_path, 1)
    document.load()

    assert document.get_sizes() == expected_sizes
def test_document_toc(simple_document_path, json_toc_res):
    """Check that the table of contents matches the ``json_toc_res`` fixture."""
    document = PDF(simple_document_path)
    document.load()

    assert document.get_toc() == json_toc_res
def test_document_find_max_result(simple_document_path):
    """Check the number of hits returned when searching for the letter 'a'."""
    document = PDF(simple_document_path, 1)
    document.load()
    hits = document.find_text_page('a')

    assert len(hits) == 127
def test_document_text(simple_document_path, text_page):
    """Check that the extracted text matches the ``text_page`` fixture."""
    document = PDF(simple_document_path, 1)
    document.load()

    assert document.get_text_page() == text_page