def test_ascii(self):
    """Round-trip plain ASCII text through a temporary ``.txt`` file.

    Writes ``self.text_ascii`` to a named temporary file, hands the file
    name to ``unpack.from_file`` and checks that the stripped extracted
    content equals the original text.
    """
    with NamedTemporaryFile("w+t", prefix='tika-python', suffix='.txt', dir='/tmp') as tmp:
        tmp.write(self.text_ascii)
        # Empty the write buffer: only the file name is passed on below.
        tmp.flush()
        tmp.seek(0)
        result = unpack.from_file(tmp.name)
        extracted = result["content"].strip()
        self.assertEqual(extracted, self.text_ascii)
def test_unpack_email_with_utf_chars_in_headers(self):
    """Unpack an e-mail whose headers contain non-ASCII characters.

    Checks that the expected subject text appears in the unpacked metadata
    and that the ``0.html`` attachment contains the expected marker bytes.
    """
    # This test does not work on tika-python 1.24
    tests_dir = os.path.dirname(__file__)
    eml_path = os.path.join(tests_dir, 'files', 'email_with_utf8chars_in_headers.eml')
    result = unpack.from_file(eml_path)
    expected_subject = 'Sending mails with non us-ascii characters in header (like greek or cyrillic characters - Γιάνης Βαρουφάκης & Гарри Каспаров) break Tika-Python'
    subject = result['metadata']['subject']
    html_part = result['attachments']['0.html']
    self.assertIn(expected_subject, subject)
    self.assertIn(b'Multipart/alternative content', html_part)
def parse(path):
    """Return the preprocessed content of the file at ``path``.

    Any exception raised while unpacking or preprocessing is reported on
    stdout and turned into a ``None`` return value instead of propagating.
    """
    try:
        # unpack is faster in this case
        document = unpack.from_file(path)
        cleaned = preprocess(document['content'])
    except Exception as err:
        print('Exception while reading %s: %s' % (path, err))
        return None
    return cleaned
def __init__(self, db_file, key, number=20):
    """Set up the PDF reader and, if the PDF exists, dump its text to disk.

    When ``bib_files/<key>.pdf`` exists, its unpacked content is written to
    ``bib_files/pdf2txt_<key>.txt`` and ``self.refs()`` followed by
    ``self.refs_parsed('y')`` is run. Otherwise a hint is printed and
    nothing else happens.

    Args:
        db_file (string): file location of database file.
        key (string): in BetterBibTex format [authForeIni][authEtAl][year].
        number (integer): (TODO) number of refs to extract before stopping,
            if not all. Currently unused.
    """
    self.db_file = db_file
    self.key = key
    self.pdf = f'{key}.pdf'
    # TODO: self.number = number
    lit.Text(self.db_file, self.key)

    pdf_path = os.path.join('bib_files', f'{self.pdf}')
    if not os.path.isfile(pdf_path):
        print(f'Could not find {self.pdf}. Is it in the bib_files folder?')
        return

    self.txt = f"pdf2txt_{key}.txt"
    extracted = unpack.from_file(pdf_path)
    txt_path = os.path.join('bib_files', self.txt)
    with codecs.open(txt_path, 'w', 'utf-8') as out:
        out.write(extracted['content'])
    self.refs()
    self.refs_parsed('y')
def test_unpack_remotezip(self):
    """Unpack a zip archive addressed by URL and verify one member's MD5.

    The MD5 digest of the ``tika-python-1.24/LICENSE.txt`` attachment must
    equal the pinned value.
    """
    from hashlib import md5

    archive_url = 'https://github.com/chrismattmann/tika-python/archive/1.24.zip'
    result = unpack.from_file(archive_url)
    license_bytes = result['attachments']['tika-python-1.24/LICENSE.txt']
    digest = md5(license_bytes).hexdigest()
    self.assertEqual(digest, '3b83ef96387f14655fc854ddc3c6bd57')
def test_unpack_email_no_utf_chars_in_headers(self):
    """Unpack a plain-ASCII sample e-mail and check content, subject and HTML part."""
    # Test that works on Tika-Python 1.24
    tests_dir = os.path.dirname(__file__)
    eml_path = os.path.join(tests_dir, 'files', 'sample_email.eml')
    result = unpack.from_file(eml_path)
    # This file has multipart/mixed content and a SVG attachment
    self.assertTrue(result['content'])
    subject = result['metadata']['subject']
    self.assertIn('Simple email with ascii7 characters and an attachment', subject)
    html_part = result['attachments']['0.html']
    self.assertIn(b'Multipart/alternative content', html_part)
def post(self):
    """Extract text from the uploaded file and return it with similar documents.

    The upload found under ``request.files['file']`` is saved to a named
    temporary file, unpacked with ``unpack.from_file`` and the resulting
    content is run through ``preprocess``.

    Returns:
        dict: ``{'text': <preprocessed text>, 'similar': <result of
        get_similar(text, doc_for_api)>}``.
    """
    upload = request.files['file']
    # The context manager closes (and thereby removes) the temporary file
    # even when extraction or preprocessing raises.
    with tempfile.NamedTemporaryFile() as tmp_file:
        upload.save(tmp_file)
        # save() is handed the buffered file object, so bytes may still sit
        # in its buffer; flush because only the file *name* is passed on.
        tmp_file.flush()
        text = preprocess(unpack.from_file(tmp_file.name)['content'])
    return {'text': text, 'similar': get_similar(text, doc_for_api)}
def similar_for_file():
    """Render ``doc.html`` for an uploaded file together with similar documents.

    The upload found under ``request.files['file']`` is saved to a named
    temporary file, unpacked with ``unpack.from_file`` and the resulting
    content is run through ``preprocess``. The similarity lookup maps each
    hit ``sim`` to ``(sim, data_samples[sim], metadata[sim])``.

    Returns:
        The value of ``render_template('doc.html', doc=text, idx=-1,
        **similar)``.
    """
    upload = request.files['file']
    # The context manager closes (and thereby removes) the temporary file
    # even when extraction or preprocessing raises.
    with tempfile.NamedTemporaryFile() as tmp_file:
        upload.save(tmp_file)
        # save() is handed the buffered file object, so bytes may still sit
        # in its buffer; flush because only the file *name* is passed on.
        tmp_file.flush()
        text = preprocess(unpack.from_file(tmp_file.name)['content'])
    similar = get_similar(text, lambda sim: (sim, data_samples[sim], metadata[sim]))
    return render_template('doc.html', doc=text, idx=-1, **similar)
def extract_image_txt(self):
    """Save the attachments of ``self.file`` under ``tmp/`` and list that folder.

    Every attachment returned by ``unpack.from_file(self.file)`` is written
    to ``tmp/<name>``. Files for which ``check_size`` returns a falsy value
    are deleted again.

    Returns:
        tuple: ``(True, paths)`` when at least one attachment was kept, where
        ``paths`` holds every entry of ``tmp/`` prefixed with ``'tmp/'``;
        ``(False, None)`` otherwise.
    """
    kept = 0
    raw = unpack.from_file(self.file)
    images = raw['attachments']
    if images:
        for name, payload in images.items():
            save_path = 'tmp/' + str(name)
            # Close the handle before the size check / removal so the data
            # is fully written and the file is no longer held open.
            with open(save_path, 'wb') as out:
                out.write(payload)
            if check_size(save_path):
                kept += 1
            else:
                os.remove(save_path)
    if kept > 0:
        # NOTE: this lists the whole folder, not only the files saved above.
        return (True, ['tmp/' + entry for entry in os.listdir('tmp/')])
    return (False, None)
def get_tika_content(f):
    """Unpack a file with Tika and flag whether that worked.

    Passes ``f`` to ``unpack.from_file`` and runs the result through
    ``remove_key_periods``.

    Input:
        f: the value handed to ``unpack.from_file``
    Output:
        c: On success, the dictionary returned by ``remove_key_periods``
           with ``c['success'] = 1`` added. On any error, ``{'success': 0}``.
    """
    try:
        c = remove_key_periods(unpack.from_file(f))
        c['success'] = 1
    except Exception:
        # Best effort: report failure instead of raising. ``Exception``
        # (not a bare ``except``) lets KeyboardInterrupt/SystemExit through.
        c = {'success': 0}
    return c
def _first_metadata_value(metadata, keys, default):
    """Return the value of the first key in ``keys`` that differs from ``default``."""
    for key in keys:
        value = metadata.get(key, default)
        if value != default:
            return value
    return default


def prepare_index_record(document_path, tika_url=TIKA_URL):
    """
    Prepares the record object (dict) after querying tika using the unpack.
    Unpack returns metadata and content in the response dict.

    Title, author and keywords are looked up under several alternative
    metadata keys, in order; the first one that is present wins and a
    ``"No..."`` placeholder is used when none is.

    Arguments:
        document_path {str} -- Full Path to the document to be sent to tika
        tika_url {str} -- (optional) full url to tika server

    Returns:
        dict -- record with the keys title, description, author,
        creation_date, content_type, keywords, num_pages, filename, content

    Raises:
        KeyError -- if the metadata lacks "Creation-Date", "Content-Type"
        or "xmpTPg:NPages"
    """
    parsed = unpack.from_file(document_path, tika_url)
    metadata = parsed["metadata"]
    content = parsed["content"]

    # The original if/elif chains re-tested the same condition, so only the
    # first key was ever consulted; these are genuine ordered fallbacks.
    title = _first_metadata_value(
        metadata, ("title", "dc:title", "pdf.docinfo:title"), "NoTitle")
    author = _first_metadata_value(
        metadata, ("Author", "meta:author"), "NoAuthor")
    keywords = _first_metadata_value(
        metadata, ("Keywords", "meta:keyword", "pdf.docinfo:keywords"), "NoKeywords")

    subject = metadata.get("subject", "NoSubject")
    resourcename = metadata.get("resourceName", "NoResourceName")

    record = {
        "title": title,
        "description": subject,
        "author": author,
        "creation_date": metadata["Creation-Date"],
        "content_type": metadata["Content-Type"],
        "keywords": keywords,
        "num_pages": metadata["xmpTPg:NPages"],
        "filename": resourcename,
        "content": content,
    }
    return record
def process_file(self, object_version, **kwargs):
    """Process the file with Tika.

    Opens the storage file of ``object_version`` in binary read mode and
    passes it to ``unpack.from_file`` using the server endpoint and request
    options from the application config. The file is always closed again.

    Args:
        object_version: object whose ``file.storage(**kwargs)`` is opened.
        **kwargs: forwarded to ``object_version.file.storage``.

    Returns:
        The result of ``unpack.from_file``.

    Raises:
        KeyError: if one of the two config keys is missing.
    """
    # Read the configuration first so that a missing key cannot leave an
    # already opened file handle behind.
    server_url = current_app.config['FILES_PROCESSOR_TIKA_SERVER_ENDPOINT']
    req_opts = current_app.config['FILES_PROCESSOR_TIKA_REQUEST_OPTIONS']
    fp = object_version.file.storage(**kwargs).open(mode=READ_MODE_BINARY)
    try:
        return unpack.from_file(
            fp,
            serverEndpoint=server_url,
            requestOptions=req_opts,
        )
    finally:
        fp.close()
def get_index_data(file_path):
    """Unpack ``file_path`` and return a dict describing the outcome.

    On success the dict is the result of ``unpack.from_file`` with
    ``status`` added. On any exception the path and exception class are
    printed and the dict holds ``status='failed'`` plus the error text.
    In both cases ``file_path`` and ``last_scanned`` (current local time,
    ISO format) are added, and attachments whose value is ``bytes`` are
    dropped.

    Returns:
        dict: the index record described above.
    """
    try:
        result = unpack.from_file(file_path)
        # Spelling kept as-is: consumers may match on this exact value.
        result['status'] = 'succeded'
    except Exception as e:
        print(file_path)
        print(e.__class__)
        # Build the failure record in one step: ``result`` may be unbound
        # here, and assigning the error before re-creating the dict lost it.
        result = {'status': 'failed', 'error': str(e)}
    result['file_path'] = file_path
    result['last_scanned'] = datetime.datetime.now().isoformat()
    attachments = result.get('attachments')
    if attachments:
        # Keep only non-binary attachments.
        result['attachments'] = {
            k: v for k, v in attachments.items() if not isinstance(v, bytes)
        }
    return result
def extract(filepath):
    """
    Extract content, metadata and language from the file at filepath.

    Parameters
    ----------
    filepath: str

    Returns
    -------
    dict
        Keys 'text' (the unpacked content), 'metadata' (the unpacked
        metadata) and 'lang' (result of ``language.from_buffer`` on the
        content).
    """
    unpacked = unpack.from_file(filepath)
    content = unpacked.get('content')
    detected = language.from_buffer(content)
    return {
        'text': content,
        'metadata': unpacked.get('metadata'),
        'lang': detected,
    }
for root, dirs, files in os.walk(STATEMENTS_FOLDER): if not output_dirs: # Make transaction dirs if they don't exist # * I have my statements saved in sub dirs by year so this creates those output_dirs = sorted([f"{TRANSACTIONS_FOLDER}/{d}" for d in dirs]) for transaction_dir in output_dirs: if not os.path.isdir(transaction_dir): os.makedirs(transaction_dir) if files: for filename in files: path = f"{root}/{filename}" if os.path.splitext(path)[1] == ".pdf": contents = unpack.from_file(path).get("content", "") iterator = iter(re.split(f"({'|'.join(keywords)})", contents)) file_data = [] for key in iterator: if key in keywords: try: value = next(iterator) if key == TRANSACTIONS_HEADER: # Split by the date format: "Jan 1, 1970" # or 2 new lines split = re.split(
def test_unpack_pdf_from_file(self):
    """Unpack the sample PDF: expected text, some metadata, no attachments."""
    tests_dir = os.path.dirname(__file__)
    pdf_path = os.path.join(tests_dir, 'files', 'rwservlet.pdf')
    result = unpack.from_file(pdf_path)
    expected_snippet = "On the $5 menu, the consumer advisory is missing for eggs"
    self.assertIn(expected_snippet, result['content'])
    self.assertTrue(result['metadata'])
    self.assertFalse(result['attachments'])