Beispiel #1
0
def get_tika_content(d, a):
    """Download a Mayan document and extract its content via Tika.

    Fetches the page listing and the file of the document's latest
    version, then passes the downloaded file to ``unpack.from_buffer``
    and post-processes the result with ``remove_key_periods``.

    Input:
        d: Mayan document API output. The keys read here are ``uuid``
           and ``latest_version`` (with ``download_url``, ``pages_url``
           and ``checksum``).
        a: Mayan authorization, forwarded unchanged to ``get_resp``.
    Output:
        (c, f): ``c`` is a dictionary of document metadata and content,
        extended with ``page_no``, ``uuid``, ``checksum`` and a
        ``success`` flag (1 if extraction worked, 0 otherwise); ``f`` is
        whatever ``get_resp`` returned for the download URL.
    """
    latest = d['latest_version']
    down_url = latest['download_url']
    pages = get_resp(latest['pages_url'], a)['results']

    # Best effort: a document without pages (or with an unparsable first
    # page) is reported with page number 0 rather than failing.
    try:
        page_no = get_page_num(pages[0])
    except Exception:
        page_no = 0

    # NOTE(review): the third argument presumably asks get_resp for the raw
    # file content instead of decoded JSON -- confirm against get_resp.
    f = get_resp(down_url, a, True)

    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit still propagate to the caller.
    try:
        c = remove_key_periods(unpack.from_buffer(f))
        c['page_no'] = page_no
        success = 1
    except Exception:
        # Extraction failed: return a minimal record flagged as failed.
        c = {'page_no': 0}
        success = 0

    c['uuid'] = d['uuid']
    c['checksum'] = latest['checksum']
    c['success'] = success
    return (c, f)
Beispiel #2
0
 def test_unpack_pdf_from_buffer(self):
     """Unpack ``files/rwservlet.pdf`` from an in-memory buffer.

     Checks that the extracted content contains a known sentence, that
     the metadata is truthy and that the attachments entry is falsy.
     """
     here = os.path.dirname(__file__)
     pdf_path = os.path.join(here, 'files', 'rwservlet.pdf')
     with open(pdf_path, 'rb') as handle:
         result = unpack.from_buffer(handle.read())

     expected_snippet = "On the $5 menu, the consumer advisory is missing for eggs"
     self.assertIn(expected_snippet, result['content'])
     self.assertTrue(result['metadata'])
     self.assertFalse(result['attachments'])
Beispiel #3
0
def get_tika_content_stream(f):
    """Extract metadata and content from an already-loaded file via Tika.

    Passes ``f`` to ``unpack.from_buffer`` and post-processes the result
    with ``remove_key_periods``.

    Input:
        f: file stream / buffer handed to ``unpack.from_buffer``
    Output:
        c: Dictionary of document metadata and content with an added
           ``success`` key set to 1. If extraction raises, the result is
           just ``{'success': 0}``.
    """
    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are not silently swallowed.
    try:
        c = remove_key_periods(unpack.from_buffer(f))
        c['success'] = 1
    except Exception:
        c = {'success': 0}
    return c
Beispiel #4
0
 def test_ascii_frombuffer(self):
     """Unpacking the ASCII fixture yields the same text as content."""
     expected = self.text_ascii
     content = unpack.from_buffer(expected)["content"]
     self.assertEqual(content.strip(), expected)
Beispiel #5
0
 def test_utf8_frombuffer(self):
     """Unpacking the UTF-8 encoded fixture round-trips the text."""
     expected = self.text_utf8
     payload = expected.encode('utf8')
     content = unpack.from_buffer(payload)["content"]
     self.assertEqual(content.strip(), expected)