def get_tika_content(d, a):
    """Call TIKA api for rmeta content of a Mayan document.

    Fetches the page list and the downloadable file of the document's
    latest version, then hands the downloaded payload to
    ``unpack.from_buffer`` to extract file metadata and content.

    Input:
        d: Mayan document API output. Must provide ``uuid`` and a
           ``latest_version`` mapping with ``download_url``,
           ``pages_url`` and ``checksum``.
        a: Mayan authorization, forwarded unchanged to ``get_resp``.
    Output:
        (c, f): ``c`` is the dictionary of document metadata and content,
        extended with ``page_no``, ``uuid``, ``checksum`` and ``success``
        (1 when extraction worked, 0 otherwise). ``f`` is whatever
        ``get_resp`` returned for the download URL.

    Errors raised by ``get_resp`` or by missing keys in ``d`` propagate to
    the caller; only page-number lookup and extraction are best-effort.
    """
    latest = d['latest_version']
    down_url = latest['download_url']
    pages = get_resp(latest['pages_url'], a)['results']

    # Best-effort: a document without pages (or with an unparsable first
    # page) is reported as page 0. ``Exception`` rather than a bare except
    # so KeyboardInterrupt / SystemExit are not swallowed.
    try:
        page_no = get_page_num(pages[0])
    except Exception:
        page_no = 0

    # NOTE(review): the third positional argument presumably asks get_resp
    # for the raw payload instead of parsed JSON -- confirm against get_resp.
    f = get_resp(down_url, a, True)

    try:
        c = remove_key_periods(unpack.from_buffer(f))
        c['page_no'] = page_no
        success = 1
    except Exception:
        # Extraction failed: return a stub record. page_no is deliberately
        # reset to 0 here, as the failure path has always reported.
        c = {'page_no': 0}
        success = 0

    c['uuid'] = d['uuid']
    c['checksum'] = latest['checksum']
    c['success'] = success
    return (c, f)
def test_unpack_pdf_from_buffer(self):
    """Unpack the bundled ``rwservlet.pdf`` fixture from an in-memory buffer.

    Checks that the extracted content contains a known sentence, that
    metadata is non-empty and that no attachments are reported.
    """
    fixture_dir = os.path.join(os.path.dirname(__file__), 'files')
    pdf_path = os.path.join(fixture_dir, 'rwservlet.pdf')
    with open(pdf_path, 'rb') as handle:
        raw_bytes = handle.read()
        result = unpack.from_buffer(raw_bytes)

    expected_sentence = "On the $5 menu, the consumer advisory is missing for eggs"
    self.assertIn(expected_sentence, result['content'])
    self.assertTrue(result['metadata'])
    self.assertFalse(result['attachments'])
def get_tika_content_stream(f):
    """Call TIKA api for rmeta content of a file stream.

    Passes ``f`` to ``unpack.from_buffer`` to extract file metadata and
    content, then runs the result through ``remove_key_periods``.

    Input:
        f: file stream
    Output:
        c: Dictionary of document metadata and content with an added
           ``success`` key set to 1. If extraction fails for any reason the
           result is just ``{'success': 0}``.
    """
    try:
        c = remove_key_periods(unpack.from_buffer(f))
        c['success'] = 1
    except Exception:
        # Best-effort extraction: report failure through the flag instead of
        # raising. ``Exception`` rather than a bare except so that
        # KeyboardInterrupt / SystemExit still reach the caller.
        c = {'success': 0}
    return c
def test_ascii_frombuffer(self):
    """Unpacking the ASCII fixture text must return the same text as content."""
    extracted = unpack.from_buffer(self.text_ascii)["content"]
    # Surrounding whitespace in the extracted content is ignored.
    self.assertEqual(extracted.strip(), self.text_ascii)
def test_utf8_frombuffer(self):
    """Unpacking UTF-8 encoded bytes must return the original text as content."""
    encoded = self.text_utf8.encode('utf8')
    extracted = unpack.from_buffer(encoded)["content"]
    # Surrounding whitespace in the extracted content is ignored.
    self.assertEqual(extracted.strip(), self.text_utf8)