Exemple #1
0
 def test_ascii(self):
     """Write ASCII text to a temp file and check ``unpack.from_file`` returns it."""
     with NamedTemporaryFile("w+t", prefix='tika-python', suffix='.txt', dir='/tmp') as tmp:
         tmp.write(self.text_ascii)
         # Flush so the text is on disk before the file is re-read by name.
         tmp.flush()
         tmp.seek(0)
         result = unpack.from_file(tmp.name)
         extracted = result["content"].strip()
         self.assertEqual(extracted, self.text_ascii)
Exemple #2
0
 def test_unpack_email_with_utf_chars_in_headers(self):
     """Unpack an e-mail whose headers contain non US-ASCII characters."""
     # This test does not work on tika-python 1.24
     fixtures_dir = os.path.join(os.path.dirname(__file__), 'files')
     eml_path = os.path.join(fixtures_dir, 'email_with_utf8chars_in_headers.eml')
     result = unpack.from_file(eml_path)
     expected_subject = 'Sending mails with non us-ascii characters in header (like greek or cyrillic characters - Γιάνης Βαρουφάκης & Гарри Каспаров) break Tika-Python'
     # The subject must survive unpacking with its Greek/Cyrillic text intact.
     self.assertIn(expected_subject, result['metadata']['subject'])
     html_part = result['attachments']['0.html']
     self.assertIn(b'Multipart/alternative content', html_part)
 def test_ascii(self):
     """Check that plain ASCII text written to a temp file is returned by ``unpack.from_file``."""
     with NamedTemporaryFile("w+t", prefix='tika-python', suffix='.txt', dir='/tmp') as handle:
         handle.write(self.text_ascii)
         # Make sure the buffered text reaches the file before it is read by path.
         handle.flush()
         handle.seek(0)
         unpacked = unpack.from_file(handle.name)
         body = unpacked["content"].strip()
         self.assertEqual(body, self.text_ascii)
Exemple #4
0
def parse(path):
    """Extract and preprocess the content of the document at ``path``.

    Returns the result of ``preprocess`` applied to the ``'content'`` entry
    of ``unpack.from_file(path)``. If anything raises, a diagnostic is
    printed and ``None`` is returned instead.
    """
    try:
        # unpack is faster in this case
        document = unpack.from_file(path)
        cleaned = preprocess(document['content'])
    except Exception as err:
        print('Exception while reading %s: %s' % (path, err))
        return None
    return cleaned
    def __init__(self, db_file, key, number=20):
        """Set up a PDF reader that extracts and interprets the references of a text.

        If ``bib_files/<key>.pdf`` exists, its content is unpacked and written
        to ``bib_files/pdf2txt_<key>.txt`` (UTF-8), after which ``refs()`` and
        ``refs_parsed('y')`` are run. Otherwise a hint is printed.

        Args:
            db_file (string): file location of database file.
            key (string): in BetterBibTex format [authForeIni][authEtAl][year].
            number (integer): (TODO) number of refs to extract before stopping, if not all.
        """
        self.db_file = db_file
        self.key = key
        self.pdf = f'{key}.pdf'
        # TODO: self.number = number
        lit.Text(self.db_file, self.key)

        pdf_path = os.path.join('bib_files', self.pdf)
        if not os.path.isfile(pdf_path):
            print(f'Could not find {self.pdf}. Is it in the bib_files folder?')
            return

        self.txt = f"pdf2txt_{key}.txt"
        extracted = unpack.from_file(pdf_path)
        txt_path = os.path.join('bib_files', self.txt)
        with codecs.open(txt_path, 'w', 'utf-8') as out:
            out.write(extracted['content'])
        self.refs()
        self.refs_parsed('y')
Exemple #6
0
 def test_unpack_remotezip(self):
     """Unpack a remote zip archive and verify the MD5 of its LICENSE.txt entry."""
     from hashlib import md5
     remote_file = 'https://github.com/chrismattmann/tika-python/archive/1.24.zip'
     archive = unpack.from_file(remote_file)
     license_bytes = archive['attachments']['tika-python-1.24/LICENSE.txt']
     digest = md5(license_bytes).hexdigest()
     self.assertEqual(digest, '3b83ef96387f14655fc854ddc3c6bd57')
Exemple #7
0
 def test_unpack_email_no_utf_chars_in_headers(self):
     """Unpack an e-mail whose headers are plain ASCII."""
     # Test that works on Tika-Python 1.24
     fixtures_dir = os.path.join(os.path.dirname(__file__), 'files')
     eml_path = os.path.join(fixtures_dir, 'sample_email.eml')
     result = unpack.from_file(eml_path)
     # This file has multipart/mixed content and a SVG attachment
     self.assertTrue(result['content'])
     subject = result['metadata']['subject']
     self.assertIn('Simple email with ascii7 characters and an attachment', subject)
     html_part = result['attachments']['0.html']
     self.assertIn(b'Multipart/alternative content', html_part)
Exemple #8
0
    def post(self):
        """Extract text from the uploaded ``file`` and return it with similar documents.

        The upload is copied to a named temporary file because
        ``unpack.from_file`` is given a path (``tmp_file.name``), not the
        upload stream itself.

        Returns:
            dict with ``'text'`` (the preprocessed content) and ``'similar'``
            (the result of ``get_similar(text, doc_for_api)``).
        """
        file = request.files['file']

        # The context manager closes (and thereby removes) the temp file even
        # when extraction or preprocessing raises.
        with tempfile.NamedTemporaryFile() as tmp_file:
            file.save(tmp_file)
            # save() is not guaranteed to flush the buffered handle; without
            # this, a small upload may still sit in memory when the file is
            # re-read by name and would be seen as empty.
            tmp_file.flush()
            text = preprocess(unpack.from_file(tmp_file.name)['content'])

        return {'text': text, 'similar': get_similar(text, doc_for_api)}
Exemple #9
0
def similar_for_file():
    """Extract text from the uploaded ``file`` and render it with its similar documents.

    The upload is copied to a named temporary file because
    ``unpack.from_file`` is given a path (``tmp_file.name``), not the upload
    stream itself.

    Returns:
        The rendered ``doc.html`` template, fed with the preprocessed text and
        the keyword arguments produced by ``get_similar``.
    """
    file = request.files['file']

    # The context manager closes (and thereby removes) the temp file even
    # when extraction or preprocessing raises.
    with tempfile.NamedTemporaryFile() as tmp_file:
        file.save(tmp_file)
        # save() is not guaranteed to flush the buffered handle; without this,
        # a small upload may still sit in memory when the file is re-read by
        # name and would be seen as empty.
        tmp_file.flush()
        text = preprocess(unpack.from_file(tmp_file.name)['content'])

    similar = get_similar(text, lambda sim:
                          (sim, data_samples[sim], metadata[sim]))
    return render_template('doc.html', doc=text, idx=-1, **similar)
Exemple #10
0
	def extract_image_txt(self):
		"""Dump the attachments of ``self.file`` into ``tmp/`` and list that folder.

		Every attachment returned by ``unpack.from_file`` is written to
		``tmp/<name>``; files rejected by ``check_size`` are deleted again.

		Returns:
			``(True, paths)`` listing every entry currently in ``tmp/`` when at
			least one attachment was kept, otherwise ``(False, None)``.
		"""
		kept = 0
		raw = unpack.from_file(self.file)
		images = raw['attachments']
		if images:
			for name, payload in images.items():
				save_path = 'tmp/' + str(name)
				# Close the handle explicitly so the data is fully written
				# before check_size() inspects the file.
				with open(save_path, 'wb') as out:
					out.write(payload)
				if check_size(save_path):
					kept += 1
				else:
					os.remove(save_path)
		if kept > 0:
			# NOTE(review): this lists everything in tmp/, not only the files
			# written above - confirm that is intended.
			return (True, ['tmp/' + entry for entry in os.listdir('tmp/')])
		return (False, None)
Exemple #11
0
def get_tika_content(f):
    """Unpack a document with Tika and flag whether extraction succeeded.

    Passes ``f`` to ``unpack.from_file`` and runs the result through
    ``remove_key_periods``.

    Input:
        f: value handed to ``unpack.from_file``
    Output:
        c: on success, the cleaned unpack result with ``c['success'] == 1``;
           on any error, ``{'success': 0}``
    """
    try:
        c = remove_key_periods(unpack.from_file(f))
        c['success'] = 1
    except Exception:
        # Best-effort extraction: ordinary failures are reported through the
        # flag, but KeyboardInterrupt/SystemExit are no longer swallowed.
        c = {'success': 0}
    return c
def _first_metadata_value(metadata, keys, default):
    """Return the value of the first of ``keys`` present in ``metadata``, else ``default``."""
    for key in keys:
        if key in metadata:
            return metadata[key]
    return default


def prepare_index_record(document_path, tika_url=TIKA_URL):
    """
    Prepares the record object (dict) after querying tika using the unpack.
    Unpack returns metadata and content in the response dict.

    Title, author and keywords are each looked up under several alternative
    metadata keys, in order; the first key present wins and a "No..."
    placeholder is used when none is.

    Arguments:
        document_path {str}  -- Full Path to the document to be sent to tika
        tika_url {str}  -- (optional) full url to tika server

    Returns:
        dict with the keys title, description, author, creation_date,
        content_type, keywords, num_pages, filename and content.

    Raises:
        KeyError -- if the metadata lacks "Creation-Date", "Content-Type"
        or "xmpTPg:NPages" (these are read without a default).
    """
    parsed = unpack.from_file(document_path, tika_url)
    metadata = parsed["metadata"]
    content = parsed["content"]

    # The previous if/elif chains repeated the same condition, so only the
    # first key of each group was ever consulted; the fallbacks are now used.
    title = _first_metadata_value(
        metadata, ("title", "dc:title", "pdf.docinfo:title"), "NoTitle")
    author = _first_metadata_value(
        metadata, ("Author", "meta:author"), "NoAuthor")
    subject = metadata.get("subject", "NoSubject")
    keywords = _first_metadata_value(
        metadata, ("Keywords", "meta:keyword", "pdf.docinfo:keywords"),
        "NoKeywords")
    resourcename = metadata.get("resourceName", "NoResourceName")

    record = {
        "title": title,
        "description": subject,
        "author": author,
        "creation_date": metadata["Creation-Date"],
        "content_type": metadata["Content-Type"],
        "keywords": keywords,
        "num_pages": metadata["xmpTPg:NPages"],
        "filename": resourcename,
        "content": content
    }
    return record
Exemple #13
0
    def process_file(self, object_version, **kwargs):
        """Process the file with Tika.

        Opens the file behind ``object_version`` in binary read mode and hands
        it to ``unpack.from_file`` together with the server endpoint and
        request options taken from the application config.

        Returns:
            Whatever ``unpack.from_file`` returns for the opened file.
        """
        # Read the configuration before opening the file so that a missing
        # config key cannot leave an open handle behind.
        server_url = current_app.config['FILES_PROCESSOR_TIKA_SERVER_ENDPOINT']
        req_opts = current_app.config['FILES_PROCESSOR_TIKA_REQUEST_OPTIONS']

        fp = object_version.file.storage(**kwargs).open(mode=READ_MODE_BINARY)
        try:
            return unpack.from_file(
                fp,
                serverEndpoint=server_url,
                requestOptions=req_opts,
            )
        finally:
            fp.close()
Exemple #14
0
def get_index_data(file_path):
    """Unpack ``file_path`` and return a dict describing the outcome.

    On success the dict is the ``unpack.from_file`` result plus
    ``'status'``; on failure it holds ``'status': 'failed'`` and the error
    message under ``'error'``. In both cases ``'file_path'`` and
    ``'last_scanned'`` (ISO timestamp of now) are added, and attachments
    whose value is ``bytes`` are dropped from ``'attachments'``.
    """
    try:
        result = unpack.from_file(file_path)
        # Spelling kept as-is: consumers may already match on this value.
        result['status'] = 'succeded'
    except Exception as e:
        print(file_path)
        print(e.__class__)
        # Build the failure record in one step; previously the error was
        # written into a not-yet-defined dict and then overwritten.
        result = {'status': 'failed', 'error': str(e)}
    result['file_path'] = file_path
    result['last_scanned'] = datetime.datetime.now().isoformat()
    if result.get('attachments'):
        result['attachments'] = {
            k: v
            for k, v in result['attachments'].items()
            if not isinstance(v, bytes)
        }
    return result
Exemple #15
0
def extract(filepath):
    """
    Extract content, metadata and language from the file at filepath.

    Parameters
    ----------
    filepath: str

    Returns
    -------
    dict with keys 'text', 'metadata' and 'lang'; 'lang' is the result of
    ``language.from_buffer`` applied to the extracted content.
    """
    document = unpack.from_file(filepath)
    content = document.get('content')
    detected = language.from_buffer(content)
    return {
        'text': content,
        'metadata': document.get('metadata'),
        'lang': detected,
    }
for root, dirs, files in os.walk(STATEMENTS_FOLDER):

    if not output_dirs:

        # Make transaction dirs if they don't exist
        # * I have my statements saved in sub dirs by year so this creates those
        output_dirs = sorted([f"{TRANSACTIONS_FOLDER}/{d}" for d in dirs])
        for transaction_dir in output_dirs:
            if not os.path.isdir(transaction_dir):
                os.makedirs(transaction_dir)

    if files:
        for filename in files:
            path = f"{root}/{filename}"
            if os.path.splitext(path)[1] == ".pdf":
                contents = unpack.from_file(path).get("content", "")
                iterator = iter(re.split(f"({'|'.join(keywords)})", contents))

                file_data = []

                for key in iterator:
                    if key in keywords:

                        try:
                            value = next(iterator)

                            if key == TRANSACTIONS_HEADER:

                                # Split by the date format: "Jan 1, 1970"
                                # or 2 new lines
                                split = re.split(
Exemple #17
0
 def test_unpack_pdf_from_file(self):
     """Unpack a PDF fixture and check its content, metadata and (empty) attachments."""
     fixtures_dir = os.path.join(os.path.dirname(__file__), 'files')
     pdf_path = os.path.join(fixtures_dir, 'rwservlet.pdf')
     result = unpack.from_file(pdf_path)
     expected_phrase = "On the $5 menu, the consumer advisory is missing for eggs"
     self.assertIn(expected_phrase, result['content'])
     self.assertTrue(result['metadata'])
     self.assertFalse(result['attachments'])