Example #1
0
    def ingest(self, file_path, entity):
        """Parse a CSV file and emit its rows for ``entity``.

        The encoding is detected from the raw bytes first. The file is then
        re-opened as text (undecodable bytes are replaced) and the dialect
        and header presence are sniffed from the first 40 KiB.

        Raises:
            ProcessingException: if the file cannot be decoded or any other
                error occurs while sniffing or reading the CSV.
        """
        entity.schema = model.get('Table')
        with io.open(file_path, 'rb') as raw:
            encoding = self.detect_stream_encoding(raw)
            log.debug("Detected encoding [%r]: %s", entity, encoding)

        text = io.open(file_path, 'r', encoding=encoding, errors='replace')
        with text:
            try:
                sample = text.read(4096 * 10)
                # Rewind so the reader starts at the first row again.
                text.seek(0)

                sniffer = csv.Sniffer()
                dialect = sniffer.sniff(sample)
                has_header = sniffer.has_header(sample)

                rows = self.generate_rows(csv.reader(text, dialect=dialect),
                                          has_header=has_header)
                self.emit_row_dicts(entity, rows)
            except UnicodeDecodeError as ude:
                log.warning("Encoding error: %r", entity)
                message = "Could not decode CSV (%s)" % encoding
                raise ProcessingException(message) from ude
            except Exception as err:
                log.exception("CSV error: %s", err)
                raise ProcessingException("Invalid CSV: %s" % err) from err
Example #2
0
    def ingest(self, file_path, entity):
        """Read an Excel workbook and emit one ``Table`` entity per sheet.

        Sheets without a ``rows`` attribute (chart sheets) are skipped with
        a warning. A table is only emitted once it carries a ``csvHash``.

        Raises:
            ProcessingException: if the workbook cannot be opened or read.
        """
        entity.schema = model.get("Workbook")
        self.ooxml_extract_metadata(file_path, entity)
        try:
            workbook = load_workbook(file_path, read_only=True)
        except Exception as err:
            raise ProcessingException("Invalid Excel file: %s" % err) from err

        try:
            for title in workbook.sheetnames:
                worksheet = workbook[title]
                if hasattr(worksheet, "rows"):
                    table = self.manager.make_entity("Table", parent=entity)
                    table.make_id(entity.id, title)
                    table.set("title", title)
                    log.debug("Sheet: %s", title)
                    rows = self.generate_rows(worksheet)
                    self.emit_row_tuples(table, rows)
                    if table.has("csvHash"):
                        self.manager.emit_entity(table)
                else:
                    log.warning("Cannot parse chart sheet: %s", title)
        except Exception as err:
            message = "Cannot read Excel file: %s" % err
            raise ProcessingException(message) from err
        finally:
            workbook.close()
Example #3
0
 def _document_to_pdf(self, file_path, entity):
     """Convert an office document to PDF via the remote conversion service.

     The document is uploaded to ``UNOSERVICE_URL`` and the streamed
     response is written to a work file named ``out.pdf``. The upload is
     retried once per value yielded by ``service_retries()``.

     Returns:
         The path of the written PDF, once more than 50 bytes were received.

     Raises:
         RuntimeError: if ``UNOSERVICE_URL`` is not configured.
         ProcessingException: if the service answers with an error status
             or no attempt produced a usable PDF.
     """
     if UNOSERVICE_URL is None:
         raise RuntimeError("No UNOSERVICE_URL for document conversion.")
     log.info('Converting [%s] to PDF...', entity.first('fileName'))
     file_name = entity.first('fileName') or 'data'
     mime_type = entity.first('mimeType') or DEFAULT
     for attempt in service_retries():
         try:
             # Separate names for the upload and download handles: reusing
             # one name made the cleanup close the wrong file and leak the
             # source handle on every attempt.
             with open(file_path, 'rb') as src:
                 files = {'file': (file_name, src, mime_type)}
                 res = requests.post(UNOSERVICE_URL,
                                     files=files,
                                     timeout=(5, 305),
                                     stream=True)
                 if res.status_code > 399:
                     raise ProcessingException(res.text)
                 out_path = self.make_work_file('out.pdf')
                 bytes_written = 0
                 with open(out_path, 'wb') as dst:
                     for chunk in res.iter_content(chunk_size=None):
                         bytes_written += len(chunk)
                         dst.write(chunk)
             # Tiny responses are treated as failed conversions and retried.
             if bytes_written > 50:
                 return out_path
         except RequestException as exc:
             log.error("Conversion failed: %s", exc)
             backoff(failures=attempt)
     raise ProcessingException("Document could not be converted to PDF.")
Example #4
0
 def _document_to_pdf(self, file_path, entity):
     """Convert an office document to PDF via the conversion service.

     The document is posted to ``CONVERT_URL`` and the streamed response is
     written to a work file named ``out.pdf``. Transport errors and non-400
     HTTP errors are retried indefinitely with a growing backoff.

     Returns:
         The path of the written PDF, once more than 50 bytes were received.

     Raises:
         ProcessingException: if the service rejects the document with
             HTTP 400, or returns a response of 50 bytes or less.
     """
     file_name = entity_filename(entity)
     mime_type = entity.first('mimeType')
     log.info('Converting [%s] to PDF...', file_name)
     unavailable = "Converter not availble: %s (attempt: %s)"
     for attempt in count(1):
         try:
             with open(file_path, 'rb') as src:
                 files = {'file': (file_name, src, mime_type)}
                 res = requests.post(CONVERT_URL,
                                     params={'timeout': CONVERT_TIMEOUT},
                                     files=files,
                                     timeout=CONVERT_TIMEOUT + 10,
                                     stream=True)
             res.raise_for_status()
             out_path = self.make_work_file('out.pdf')
             bytes_written = 0
             with open(out_path, 'wb') as dst:
                 for chunk in res.iter_content(chunk_size=None):
                     bytes_written += len(chunk)
                     dst.write(chunk)
             if bytes_written > 50:
                 return out_path
             raise ProcessingException("Could not be converted to PDF.")
         except HTTPError as exc:
             # Read the response from the exception itself so the handler
             # never depends on ``res`` being bound in the try block.
             if exc.response.status_code == 400:
                 raise ProcessingException(exc.response.text) from exc
             log.info(unavailable, exc, attempt)
             backoff(failures=math.sqrt(attempt))
         except RequestException as exc:
             log.error(unavailable, exc, attempt)
             backoff(failures=math.sqrt(attempt))
Example #5
0
    def ingest(self, file_path):
        """Parse a CSV file and emit its rows on ``self.result``.

        The encoding is detected from the raw bytes first. The file is then
        re-opened as text (undecodable bytes are replaced) and the dialect
        and header presence are sniffed from the first 40 KiB.

        Raises:
            ProcessingException: if the file cannot be decoded or any other
                error occurs while reading the CSV. The original error is
                attached as the cause.
        """
        with io.open(file_path, 'rb') as raw:
            encoding = self.detect_stream_encoding(raw)
            log.debug("Detected encoding [%s]: %s", self.result, encoding)

        with io.open(file_path, 'r', encoding=encoding,
                     errors='replace') as fh:
            try:
                sample = fh.read(4096 * 10)
                # Rewind so the reader starts at the first row again.
                fh.seek(0)

                dialect = csv.Sniffer().sniff(sample)
                has_header = csv.Sniffer().has_header(sample)

                reader = csv.reader(fh, dialect=dialect)
                rows = self.generate_rows(reader, has_header=has_header)
                self.result.flag(self.result.FLAG_TABULAR)
                self.result.emit_rows(rows)
            except UnicodeDecodeError as ude:
                log.warning("Encoding error: %s", self.result)
                message = "Could not decode CSV (%s)" % encoding
                raise ProcessingException(message) from ude
            except Exception as err:
                log.exception("CSV error: %s", err)
                raise ProcessingException("Invalid CSV: %s" % err) from err
Example #6
0
    def unoconv_to_pdf(self, file_path, temp_dir):
        """Convert an office document to PDF using the unoconv service.

        The file is posted to the URL from ``self.get_unoconv_url()`` and
        the streamed response is written to ``<temp_dir>/<file name>.pdf``.
        Up to three attempts are made, sleeping ``3 ** attempt`` seconds
        after each request failure.

        Returns:
            The path of the written PDF file.

        Raises:
            ConfigurationException: if the unoconv service is unavailable.
            ProcessingException: if the response is empty or every attempt
                failed.
        """
        if not self.is_unoconv_available():
            raise ConfigurationException("UNOSERVICE_URL is missing.")

        log.info('Converting [%s] to PDF...', self.result)
        file_name = os.path.basename(file_path)
        out_path = join_path(temp_dir, '%s.pdf' % file_name)
        for try_num in range(3):
            try:
                with open(file_path, 'rb') as src:
                    data = {'format': 'pdf', 'doctype': 'document'}
                    files = {'file': (file_name, src, self.UNO_MIME)}
                    # http://docs.python-requests.org/en/latest/user/advanced/#chunk-encoded-requests
                    res = self.unoconv_client.post(self.get_unoconv_url(),
                                                   data=data,
                                                   files=files,
                                                   timeout=300.0,
                                                   stream=True)
                length = 0
                # iter_content() yields bytes, so the target has to be
                # opened in binary mode; text mode raises TypeError.
                with open(out_path, 'wb') as dst:
                    for chunk in res.iter_content(chunk_size=None):
                        length += len(chunk)
                        dst.write(chunk)

                if length == 0:
                    raise ProcessingException("Could not convert to PDF.")
                return out_path
            except RequestException as exc:
                log.exception(exc)
                time.sleep(3**try_num)
        raise ProcessingException("Could not convert to PDF.")
Example #7
0
 def _document_to_pdf(self, file_path, result, work_path):
     """Convert an office document to PDF via the conversion service.

     The document is uploaded to ``self.SERVICE_URL`` and the streamed
     response is written to ``<work_path>/<file name>.pdf``. The upload is
     retried once per value yielded by ``service_retries()``.

     Returns:
         The path of the written PDF file.

     Raises:
         ProcessingException: if the service answers HTTP 400, or no
             attempt succeeded.
     """
     log.info('Converting [%s] to PDF...', result.file_name)
     out_path = os.path.basename(file_path)
     out_path = join_path(work_path, '%s.pdf' % out_path)
     file_name = result.file_name or 'data'
     mime_type = result.mime_type or DEFAULT
     for attempt in service_retries():
         try:
             # Separate names for the upload and download handles: reusing
             # one name made the cleanup close the wrong file and leak the
             # source handle on every attempt.
             with open(file_path, 'rb') as src:
                 files = {'file': (file_name, src, mime_type)}
                 res = requests.post(self.SERVICE_URL,
                                     files=files,
                                     timeout=(5, 305),
                                     stream=True)
                 res.raise_for_status()
                 with open(out_path, 'wb') as dst:
                     for chunk in res.iter_content(chunk_size=None):
                         dst.write(chunk)
             return out_path
         except RequestException as exc:
             if isinstance(exc, HTTPError):
                 # A 400 is raised immediately instead of being retried.
                 if exc.response.status_code == 400:
                     raise ProcessingException(exc.response.text) from exc
             log.error("Conversion failed: %s", exc)
             backoff(failures=attempt)
     raise ProcessingException("Document could not be converted to PDF.")
Example #8
0
 def make_work_file(self, file_name, prefix=None):
     """Return a path for ``file_name`` inside the manager's work directory.

     Args:
         file_name: name (or relative path) of the file to place.
         prefix: optional sub-directory; it must lie strictly inside
             ``self.manager.work_path``. Defaults to the work path itself.

     Returns:
         ``prefix`` joined with ``file_name``. The parent directory is
         created if it does not exist yet.

     Raises:
         ProcessingException: if ``prefix`` or the resulting file would
             lie outside the allowed directory, including via ``..``.
     """
     import os  # local import: only needed for lexical normalisation

     def _normalized(path):
         # Collapse "." and ".." segments without touching the filesystem.
         # pathlib keeps ".." as-is, so an un-normalised containment check
         # would accept names such as "../outside".
         return type(path)(os.path.normpath(path))

     work_path = self.manager.work_path
     if prefix is not None:
         prefix = ensure_path(prefix)
         if _normalized(work_path) not in _normalized(prefix).parents:
             raise ProcessingException("Path escalation: %r" % prefix)
     prefix = prefix or work_path
     work_file = prefix.joinpath(file_name)
     if _normalized(prefix) not in _normalized(work_file).parents:
         raise ProcessingException("Path escalation: %r" % file_name)
     if not work_file.parent.exists():
         work_file.parent.mkdir(parents=True, exist_ok=True)
     return work_file
Example #9
0
    def ingest(self, file_path):
        """Ingest a plain text file into ``self.result``.

        Raises:
            ProcessingException: if the file exceeds ``self.MAX_SIZE`` or
                its content cannot be decoded.
        """
        size = self.result.size
        if not size:
            # No recorded size: fall back to the size on disk.
            size = os.path.getsize(file_path)
        if size > self.MAX_SIZE:
            raise ProcessingException("Text file is too large.")

        decoded = self.read_file_decoded(file_path)
        if decoded is None:
            raise ProcessingException("Document could not be decoded.")

        self.result.flag(self.result.FLAG_PLAINTEXT)
        self.extract_plain_text_content(decoded)
Example #10
0
    def ingest(self, file_path):
        """Ingest an OPF XML file holding exactly one ``email`` element.

        Child elements of the email are collected into a tag -> text map,
        from which mail headers, document metadata and the body (HTML when
        flagged and present, plain text otherwise) are extracted.

        Raises:
            ProcessingException: if the XML cannot be parsed or the file
                does not contain exactly one email.
        """
        self.result.flag(self.result.FLAG_EMAIL)
        try:
            doc = self.parse_xml(file_path)
        except TypeError as te:
            # Keep the original error attached for debugging.
            raise ProcessingException("Cannot parse OPF XML file.") from te

        if len(doc.findall('//email')) != 1:
            # NOTE: this also triggers when no email element is present.
            raise ProcessingException("More than one email in file.")

        email = doc.find('//email')
        props = email.getchildren()
        props = {c.tag: safe_string(c.text) for c in props if c.text}
        headers = {
            'Subject': props.get('OPFMessageCopySubject'),
            'Message-ID': props.pop('OPFMessageCopyMessageID', None),
            'From': self.get_contacts(email, 'OPFMessageCopyFromAddresses'),
            'Sender': self.get_contacts(email, 'OPFMessageCopySenderAddress'),
            'To': self.get_contacts(email, 'OPFMessageCopyToAddresses'),
            'CC': self.get_contacts(email, 'OPFMessageCopyCCAddresses'),
            'BCC': self.get_contacts(email, 'OPFMessageCopyBCCAddresses'),
        }
        # The sent time is only read here; it is popped further down when
        # it is stored as ``created_at``.
        date = props.get('OPFMessageCopySentTime')
        if date is not None:
            date = datetime.strptime(date, '%Y-%m-%dT%H:%M:%S')
            date = time.mktime(date.timetuple())
            headers['Date'] = utils.formatdate(date)

        self.result.headers = safe_dict(headers)

        self.update('title', props.pop('OPFMessageCopySubject', None))
        self.update('title', props.pop('OPFMessageCopyThreadTopic', None))
        for tag in ('OPFMessageCopyFromAddresses',
                    'OPFMessageCopySenderAddress'):
            self.update('author', self.get_contact_name(email, tag))

        self.update('summary', props.pop('OPFMessageCopyPreview', None))
        self.update('created_at', props.pop('OPFMessageCopySentTime', None))
        self.update('modified_at', props.pop('OPFMessageCopyModDate', None))

        body = props.pop('OPFMessageCopyBody', None)
        html = props.pop('OPFMessageCopyHTMLBody', None)

        has_html = '1E0' == props.pop('OPFMessageGetHasHTML', None)
        if has_html and safe_string(html):
            self.extract_html_content(html)
            self.result.flag(self.result.FLAG_HTML)
        else:
            self.extract_plain_text_content(body)
            self.result.flag(self.result.FLAG_PLAINTEXT)
Example #11
0
    def exec_command(self, command, *args):
        """Run an external command, discarding its standard output.

        Args:
            command: name passed to ``self.find_command`` to resolve the
                program to run.
            *args: arguments appended to the command line.

        Raises:
            ProcessingException: if the process cannot be started, exceeds
                ``self.COMMAND_TIMEOUT``, or exits with a non-zero status.
        """
        cmd = [self.find_command(command)]
        cmd.extend(args)

        try:
            # subprocess.DEVNULL avoids opening (and never closing) a
            # handle on os.devnull for every invocation.
            retcode = subprocess.call(cmd,
                                      timeout=self.COMMAND_TIMEOUT,
                                      stdout=subprocess.DEVNULL)
        except (IOError, OSError) as ose:
            raise ProcessingException('Error: %s' % ose) from ose
        except subprocess.TimeoutExpired as timeout:
            raise ProcessingException('Processing timed out.') from timeout

        if retcode != 0:
            raise ProcessingException('Failed: %s' % ' '.join(cmd))
Example #12
0
    def ingest(self, file_path, entity):
        """Ingest an Outlook message file into ``entity`` as an ``Email``.

        Copies headers, subject, bodies, message ids, the date and the
        sender/recipient identities from the parsed message, then hands
        every attachment of type ``"data"`` to ``ingest_attachment``.
        Header and date parsing failures are logged and otherwise ignored.

        Raises:
            ProcessingException: if the message file cannot be opened.
        """
        entity.schema = model.get("Email")
        try:
            msg = Message(file_path.as_posix())
        except Exception as exc:
            reason = "Cannot open message file: %s" % exc
            raise ProcessingException(reason) from exc

        self.extract_olefileio_metadata(msg, entity)

        try:
            self.extract_msg_headers(entity, msg.header)
        except Exception:
            log.exception("Cannot parse Outlook-stored headers")

        entity.add("subject", msg.subject)
        entity.add("threadTopic", msg.getStringField("0070"))
        entity.add("encoding", msg.encoding)
        entity.add("bodyText", msg.body)
        entity.add("bodyHtml", msg.htmlBody)
        entity.add("messageId", self.parse_message_ids(msg.message_id))

        if not entity.has("inReplyTo"):
            references = self.parse_references(msg.references, [])
            entity.add("inReplyTo", references)

        try:
            sent_at = parsedate_to_datetime(msg.date)
            entity.add("date", sent_at.isoformat())
        except Exception:
            log.warning("Could not parse date: %s", msg.date)

        # sender name and email
        senders = self.get_identities(msg.sender)
        self.apply_identities(entity, senders, "emitters", "sender")

        # received by
        received_by = self.get_identity(msg.getStringField("0040"),
                                        msg.getStringField("0076"))
        self.apply_identities(entity, received_by, "emitters")

        froms = self.get_identities(msg.getStringField("1046"))
        self.apply_identities(entity, froms, "emitters", "from")

        # The attribute name on the message doubles as the identity kind.
        for field in ("to", "cc", "bcc"):
            identities = self.get_identities(getattr(msg, field))
            self.apply_identities(entity, identities, "recipients", field)

        self.resolve_message_ids(entity)
        for attachment in msg.attachments:
            if attachment.type == "data":
                name = stringify(attachment.longFilename)
                if not name:
                    name = stringify(attachment.shortFilename)
                self.ingest_attachment(entity, name, attachment.type,
                                       attachment.data)
Example #13
0
    def ingest(self, file_path, result=None, ingestor_class=None):
        """Run the full ingest pipeline for one file and return its result.

        A result object is created when none is given. The file is
        checksummed, ``before``/``after`` hooks are invoked around the
        work, and the ingestor is chosen by ``auction`` unless
        ``ingestor_class`` is supplied. A ``ProcessingException`` marks the
        result as failed; any other exit that leaves the status pending
        marks it as stopped.
        """
        if result is None:
            result = self.RESULT_CLASS(file_path=file_path)

        self.checksum_file(result, file_path)
        self.before(result)
        result.status = Result.STATUS_PENDING
        try:
            if result.size is not None and result.size == 0:
                raise ProcessingException("Document is empty.")

            chosen = ingestor_class
            if chosen is None:
                chosen = self.auction(file_path, result)
                log.debug("Ingestor [%s, %s]: %s", result,
                          result.mime_type, chosen.__name__)

            self.delegate(chosen, result, file_path)
        except ProcessingException as pexc:
            result.error_message = stringify(pexc)
            result.status = Result.STATUS_FAILURE
            log.warning("Failed [%s]: %s", result, result.error_message)
        else:
            result.status = Result.STATUS_SUCCESS
        finally:
            # Still pending here means an unexpected exception is in flight.
            if result.status == Result.STATUS_PENDING:
                result.status = Result.STATUS_STOPPED
            self.after(result)

        return result
Example #14
0
    def document_to_pdf(self, file_path, temp_dir):
        """Convert an office document to PDF and return the output path.

        Delegates to ``unoconv_to_pdf`` when unoconv is available.
        Otherwise ``soffice`` is run headless with a user installation
        directory and an output directory placed under ``temp_dir``.

        Raises:
            ProcessingException: if no output file was produced.
        """
        if self.is_unoconv_available():
            return self.unoconv_to_pdf(file_path, temp_dir)

        profile_dir = join_path(temp_dir, 'soffice_instance')
        out_dir = join_path(temp_dir, 'soffice_output')
        make_directory(out_dir)
        log.info('Converting [%s] to PDF...', self.result)
        arguments = [
            '-env:UserInstallation=file://{}'.format(profile_dir),
            '--nofirststartwizard',
            '--norestore',
            '--nologo',
            '--nodefault',
            '--nolockcheck',
            '--invisible',
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', out_dir,
            file_path,
        ]
        self.exec_command('soffice', *arguments)

        # The first entry found in the output directory is the result.
        produced = os.listdir(out_dir)
        if produced:
            return join_path(out_dir, produced[0])

        raise ProcessingException(
            "Failed to convert to PDF: {}".format(file_path))
Example #15
0
    def ingest(self, file_path):
        """Ingest an Excel workbook, emitting each sheet as CSV child rows.

        Raises:
            ProcessingException: if the workbook cannot be opened or a
                sheet cannot be read. The original error is attached as
                the cause.
        """
        self.ooxml_extract_metadata(file_path)
        try:
            book = load_workbook(file_path, read_only=True)
        except Exception as err:
            raise ProcessingException('Invalid Excel file: %s' % err) from err

        self.result.flag(self.result.FLAG_WORKBOOK)
        try:
            for name in book.sheetnames:
                rows = self.generate_csv(book[name])
                self.csv_child_iter(rows, name)
        except Exception as err:
            message = 'Cannot read Excel file: %s' % err
            raise ProcessingException(message) from err
        finally:
            book.close()
Example #16
0
 def unpack_file(self, file_path, temp_file):
     """Decompress the bzip2 file at ``file_path`` into ``temp_file``.

     Raises:
         ProcessingException: if the archive cannot be read, is corrupt,
             or ends before the end-of-stream marker.
     """
     try:
         with bz2.BZ2File(file_path) as src:
             with open(temp_file, "wb") as dst:
                 shutil.copyfileobj(src, dst)
     except (IOError, EOFError) as err:
         # A truncated stream surfaces as EOFError, which is not an IOError.
         raise ProcessingException("Error: %s" % err) from err
Example #17
0
    def ingest(self, file_path, entity):
        """Turn an Outlook message file into an ``Email`` entity.

        Headers, subject, bodies, message ids, the date and all
        sender/recipient identities are copied from the parsed message;
        attachments of type ``'data'`` are passed to ``ingest_attachment``.
        Failures while parsing headers or the date are only logged.

        Raises:
            ProcessingException: if the message file cannot be opened.
        """
        entity.schema = model.get('Email')
        try:
            msg = Message(file_path.as_posix())
        except Exception as exc:
            raise ProcessingException(
                "Cannot open message file: %s" % exc) from exc

        self.extract_olefileio_metadata(msg, entity)

        try:
            self.extract_msg_headers(entity, msg.header)
        except Exception:
            log.exception("Cannot parse Outlook-stored headers")

        entity.add('subject', msg.subject)
        entity.add('threadTopic', msg.getStringField('0070'))
        entity.add('encoding', msg.encoding)
        entity.add('bodyText', msg.body)
        entity.add('bodyHtml', msg.htmlBody)
        message_ids = self.parse_message_ids(msg.message_id)
        entity.add('messageId', message_ids)

        if not entity.has('inReplyTo'):
            in_reply_to = self.parse_references(msg.references, [])
            entity.add('inReplyTo', in_reply_to)

        try:
            entity.add('date', parsedate_to_datetime(msg.date).isoformat())
        except Exception:
            log.warning("Could not parse date: %s", msg.date)

        # sender name and email
        emitter = self.get_identities(msg.sender)
        self.apply_identities(entity, emitter, 'emitters', 'sender')

        # received by
        emitter = self.get_identity(msg.getStringField('0040'),
                                    msg.getStringField('0076'))
        self.apply_identities(entity, emitter, 'emitters')

        emitter = self.get_identities(msg.getStringField('1046'))
        self.apply_identities(entity, emitter, 'emitters', 'from')

        # Message attribute and identity kind share the same name.
        for kind in ('to', 'cc', 'bcc'):
            recipients = self.get_identities(getattr(msg, kind))
            self.apply_identities(entity, recipients, 'recipients', kind)

        self.resolve_message_ids(entity)
        for item in msg.attachments:
            if item.type != 'data':
                continue
            label = stringify(item.longFilename) or \
                stringify(item.shortFilename)
            self.ingest_attachment(entity, label, item.type, item.data)
Example #18
0
    def ingest_message(self, data):
        """Parse a raw email message and record its content on the result.

        Headers, subject and message id are extracted first. Every MIME
        part is then walked: attachments are handed to
        ``ingest_attachment`` and body parts are grouped by MIME type.
        Finally the collected HTML and plain text bodies are extracted.

        Raises:
            ProcessingException: if the message itself cannot be parsed.
        """
        try:
            msg = mime.from_string(data)
            if msg.headers is not None:
                self.extract_headers_metadata(msg.headers.items())
        except DecodingError as derr:
            raise ProcessingException('Cannot parse email: %s' % derr)

        # Subject and message id are best-effort: a decoding failure is
        # logged and the field is skipped.
        try:
            if msg.subject:
                self.update('title', str(msg.subject))
        except DecodingError as derr:
            log.warning("Decoding subject: %s", derr)

        try:
            if msg.message_id:
                self.update('message_id', str(msg.message_id))
        except DecodingError as derr:
            log.warning("Decoding message ID: %s", derr)

        # NOTE(review): presumably resets/initialises the text content
        # before the bodies are collected below - confirm against the helper.
        self.extract_plain_text_content(None)
        self.result.flag(self.result.FLAG_EMAIL)
        # MIME type -> list of body payloads with that type.
        bodies = defaultdict(list)

        for part in msg.walk(with_self=True):
            # Accessing ``part.body`` can itself raise, so the emptiness
            # check is wrapped and undecodable parts are skipped.
            try:
                if part.body is None:
                    continue
            except (DecodingError, ValueError) as de:
                log.warning("Cannot decode part [%s]: %s", self.result, de)
                continue

            file_name = part.detected_file_name

            # HACK HACK HACK - WTF flanker?
            # Disposition headers can have multiple filename declarations,
            # flanker decides to concatenate.
            if file_name is not None and len(file_name) > 4:
                half = len(file_name)//2
                # A name whose two halves are identical is cut to one half.
                if file_name[:half] == file_name[half:]:
                    file_name = file_name[:half]

            mime_type = str(part.detected_content_type)
            mime_type = normalize_mimetype(mime_type)

            if part.is_attachment():
                self.ingest_attachment(file_name,
                                       mime_type,
                                       part.body)

            if part.is_body():
                bodies[mime_type].append(part.body)

        # Multiple bodies of one type are joined with blank lines.
        if 'text/html' in bodies:
            self.extract_html_content('\n\n'.join(bodies['text/html']))
            self.result.flag(self.result.FLAG_HTML)

        if 'text/plain' in bodies:
            self.extract_plain_text_content('\n\n'.join(bodies['text/plain']))
            self.result.flag(self.result.FLAG_PLAINTEXT)
Example #19
0
    def ingest(self, file_path):
        """Ingest a legacy Excel workbook, emitting each sheet as CSV rows.

        Raises:
            ProcessingException: if the workbook cannot be opened, or
                reading a sheet raises ``XLRDError``. The original error is
                attached as the cause.
        """
        self.extract_ole_metadata(file_path)
        try:
            book = xlrd.open_workbook(file_path, formatting_info=False)
        except Exception as err:
            raise ProcessingException('Invalid Excel file: %s' % err) from err

        self.result.flag(self.result.FLAG_WORKBOOK)
        try:
            for sheet in book.sheets():
                rows = self.generate_csv(sheet)
                self.csv_child_iter(rows, sheet.name)
        except XLRDError as err:
            raise ProcessingException('Invalid Excel file: %s' % err) from err
        finally:
            book.release_resources()
Example #20
0
 def ingest(self, file_path, entity):
     """Open a DBF file and emit its rows for ``entity`` as a table.

     Raises:
         ProcessingException: if opening or reading raises ``DbfError``.
     """
     entity.schema = model.get('Table')
     try:
         dbf_path = file_path.as_posix()
         table = Table(dbf_path).open()
         rows = self.generate_rows(table)
         self.emit_row_dicts(entity, rows)
     except DbfError as err:
         message = 'Cannot open DBF file: %s' % err
         raise ProcessingException(message) from err
Example #21
0
 def ingest(self, file_path, entity):
     """Extract audio metadata from a media file onto ``entity``.

     Every track reported by ``MediaInfo.parse`` contributes its title,
     generator fields, authoring/modification timestamps, sampling rate
     and duration.

     Raises:
         ProcessingException: if anything goes wrong while reading the
             metadata. The original error is attached as the cause.
     """
     try:
         entity.schema = model.get("Audio")
         metadata = MediaInfo.parse(file_path)
         for track in metadata.tracks:
             entity.add("title", track.title)
             entity.add("generator", track.writing_application)
             entity.add("generator", track.writing_library)
             entity.add("generator", track.publisher)
             entity.add(
                 "authoredAt", self.parse_timestamp(track.recorded_date)
             )
             entity.add(
                 "authoredAt", self.parse_timestamp(track.tagged_date)
             )
             entity.add(
                 "authoredAt", self.parse_timestamp(track.encoded_date)
             )
             modified_at = self.parse_timestamp(
                 track.file_last_modification_date
             )
             entity.add("modifiedAt", modified_at)
             if track.sampling_rate:
                 entity.add("samplingRate", track.sampling_rate)
             entity.add("duration", track.duration)
     except Exception as ex:
         # Exceptions do not interpolate logging-style arguments, so the
         # message has to be formatted explicitly.
         raise ProcessingException("Could not read audio: %r" % ex) from ex
Example #22
0
 def unpack_file(self, file_path, temp_file):
     """Decompress the gzip file at ``file_path`` into ``temp_file``.

     Raises:
         ProcessingException: if the archive cannot be read, is corrupt,
             or is truncated.
     """
     import zlib  # local import: only needed for the error type below

     try:
         with gzip.GzipFile(file_path) as src:
             with open(temp_file, 'wb') as dst:
                 shutil.copyfileobj(src, dst)
     except (IOError, EOFError, zlib.error) as err:
         # Truncated input raises EOFError and corrupt deflate data raises
         # zlib.error; neither is an IOError.
         raise ProcessingException('Error: %s' % err) from err
Example #23
0
 def unpack(self, file_path, temp_dir):
     """Unpack the members of a RAR archive into ``temp_dir``.

     Raises:
         ProcessingException: if ``rarfile`` reports an error. The
             original error is attached as the cause.
     """
     # FIXME: need to figure out how to unpack multi-part files.
     try:
         with rarfile.RarFile(file_path) as rf:
             self.unpack_members(rf, temp_dir)
     except rarfile.Error as err:
         raise ProcessingException('Invalid RAR file: %s' % err) from err
Example #24
0
 def ingest(self, file_path):
     """Open a DBF file and emit its rows on ``self.result``.

     Raises:
         ProcessingException: if opening or reading raises ``DbfError``.
             The original error is attached as the cause.
     """
     self.result.flag(self.result.FLAG_TABULAR)
     try:
         table = Table(file_path).open()
         self.result.emit_rows(self.generate_rows(table))
     except DbfError as err:
         raise ProcessingException('Cannot open DBF file: %s' % err) from err
Example #25
0
    def ingest(self, file_path, entity):
        """Parse a JSON file and emit its collected text as fragments.

        Raises:
            ProcessingException: if a recorded ``fileSize`` exceeds
                ``self.MAX_SIZE`` or the JSON cannot be parsed.
        """
        sizes = entity.get("fileSize")
        if any(int(size) > self.MAX_SIZE for size in sizes):
            raise ProcessingException("JSON file is too large.")

        with open(file_path, "rb") as raw:
            encoding = self.detect_stream_encoding(raw)

        with open(file_path, "r", encoding=encoding) as text:
            try:
                fragments = self._collect_text(json.load(text))
                for index, fragment in enumerate(fragments):
                    self.manager.emit_text_fragment(entity, [fragment], index)
            except Exception as exc:
                message = "Cannot parse JSON file: %s" % exc
                raise ProcessingException(message) from exc
Example #26
0
 def parse_xml_path(self, file_path, **kwargs):
     """Parse the XML file at ``file_path`` and return the parsed tree.

     ``Path`` objects are converted to POSIX strings first. Keyword
     arguments are forwarded to ``self.get_xml_parser``.

     Raises:
         ProcessingException: if the XML cannot be parsed.
     """
     source = file_path
     if isinstance(source, Path):
         source = source.as_posix()
     try:
         xml_parser = self.get_xml_parser(**kwargs)
         tree = etree.parse(source, xml_parser)
     except (ParserError, ParseError, XMLSyntaxError) as exc:
         raise ProcessingException("Failed to parse XML: %s" % exc) from exc
     return tree
Example #27
0
    def ingest(self, file_path):
        """Ingest an XML file and emit an HTML rendering plus its text.

        The document text is extracted from the root element, and the
        tree is transformed with ``self.XSLT`` to produce the HTML body.

        Raises:
            ProcessingException: if the file exceeds ``self.MAX_SIZE`` or
                cannot be parsed. The parse error is attached as the cause.
        """
        file_size = self.result.size or os.path.getsize(file_path)
        if file_size > self.MAX_SIZE:
            raise ProcessingException("XML file is too large.")

        try:
            doc = etree.parse(file_path)
        except (ParserError, ParseError) as err:
            raise ProcessingException("XML could not be parsed.") from err

        text = self.extract_html_text(doc.getroot())
        transform = etree.XSLT(self.XSLT)
        html_doc = transform(doc)
        html_body = html.tostring(html_doc, encoding=str, pretty_print=True)
        self.result.flag(self.result.FLAG_HTML)
        self.result.emit_html_body(html_body, text)
Example #28
0
    def ingest(self, file_path, entity):
        """Ingest an OPF XML file holding one email into ``entity``.

        Child elements of the email are collected into a tag -> text map,
        from which subject, summary, ids, dates, identities and bodies are
        copied onto the entity.

        Raises:
            ProcessingException: if the XML cannot be parsed or the file
                does not contain exactly one ``email`` element.
        """
        entity.schema = model.get("Email")
        try:
            doc = self.parse_xml_path(file_path)
        except TypeError as te:
            raise ProcessingException("Cannot parse OPF XML file.") from te

        matches = doc.findall("//email")
        if len(matches) != 1:
            raise ProcessingException("More than one email in file.")

        email = doc.find("//email")
        props = {
            child.tag: stringify(child.text)
            for child in email.getchildren()
            if child.text
        }

        # message IDs are already parsed, no need to clean prior.
        simple_fields = (
            ("subject", "OPFMessageCopySubject"),
            ("threadTopic", "OPFMessageCopyThreadTopic"),
            ("summary", "OPFMessageCopyPreview"),
            ("messageId", "OPFMessageCopyMessageID"),
        )
        for prop, tag in simple_fields:
            entity.add(prop, props.pop(tag, None))
        entity.add("date", self.get_date(props, "OPFMessageCopySentTime"))
        entity.add("modifiedAt", self.get_date(props, "OPFMessageCopyModDate"))

        identity_fields = (
            ("OPFMessageCopySenderAddress", "emitters", "sender"),
            ("OPFMessageCopyFromAddresses", "emitters", "from"),
            ("OPFMessageCopyToAddresses", "recipients", "to"),
            ("OPFMessageCopyCCAddresses", "recipients", "cc"),
            ("OPFMessageCopyBCCAddresses", "recipients", "bcc"),
        )
        for tag, prop, kind in identity_fields:
            contacts = self.get_contacts(email, tag)
            self.apply_identities(entity, contacts, prop, kind)

        entity.add("bodyText", props.pop("OPFMessageCopyBody", None))
        html = props.pop("OPFMessageCopyHTMLBody", None)
        has_html = props.pop("OPFMessageGetHasHTML", None) == "1E0"
        if has_html and stringify(html):
            self.extract_html_content(entity, html, extract_metadata=False)

        self.resolve_message_ids(entity)
Example #29
0
    def exec_command(self, command, *args):
        """Run an external program, discarding its standard output.

        Args:
            command: name passed to ``self.find_command`` to resolve the
                binary to run.
            *args: arguments; each one is converted with ``path_string``.

        Raises:
            RuntimeError: if the program cannot be found.
            ProcessingException: if the process cannot be started, exceeds
                ``self.COMMAND_TIMEOUT``, or exits with a non-zero status.
        """
        binary = self.find_command(command)
        if binary is None:
            raise RuntimeError("Program not found: %s" % command)
        cmd = [binary]
        cmd.extend([path_string(a) for a in args])
        try:
            # subprocess.DEVNULL avoids opening (and never closing) a
            # handle on os.devnull for every invocation.
            code = subprocess.call(cmd,
                                   timeout=self.COMMAND_TIMEOUT,
                                   stdout=subprocess.DEVNULL)
        except (IOError, OSError) as ose:
            raise ProcessingException('Error: %s' % ose) from ose
        except subprocess.TimeoutExpired as timeout:
            raise ProcessingException('Processing timed out.') from timeout

        if code != 0:
            raise ProcessingException('Failed: %s' % ' '.join(cmd))
Example #30
0
    def ingest(self, file_path, entity):
        """Ingest an XML file as a ``HyperText`` entity.

        The text of the document root is stored as ``bodyText``; the tree
        transformed with ``self.XSLT`` is stored as ``bodyHtml``.

        Raises:
            ProcessingException: if a recorded ``fileSize`` exceeds
                ``self.MAX_SIZE`` or the transformation raises
                ``ValueError``.
        """
        entity.schema = model.get("HyperText")
        sizes = entity.get("fileSize")
        if any(int(size) > self.MAX_SIZE for size in sizes):
            raise ProcessingException("XML file is too large.")

        doc = self.parse_xml_path(file_path)
        entity.set("bodyText", self.extract_html_text(doc.getroot()))
        try:
            stylesheet = etree.XSLT(self.XSLT)
            rendered = stylesheet(doc)
            markup = html.tostring(rendered, encoding=str, pretty_print=True)
            entity.set("bodyHtml", markup)
        except ValueError as ve:
            message = "Error converting XML file: %s" % ve
            raise ProcessingException(message) from ve