def ingest(self, file_path, entity):
    """Ingest a CSV file as a Table entity and emit its rows.

    Detects the character encoding from a binary sample, then re-reads
    the file as text, sniffs the CSV dialect, and emits row dicts.

    Raises ProcessingException when the file cannot be decoded or parsed.
    """
    entity.schema = model.get('Table')
    with io.open(file_path, 'rb') as fh:
        encoding = self.detect_stream_encoding(fh)
        log.debug("Detected encoding [%r]: %s", entity, encoding)
    # Re-open in text mode with the detected encoding; undecodable bytes
    # are substituted rather than aborting the whole file.
    # IMPROVED: `with` guarantees the handle is closed on every path,
    # replacing the original try/finally close.
    with io.open(file_path, 'r', encoding=encoding, errors='replace') as fh:
        try:
            # Sniff dialect and header presence from a leading sample.
            sample = fh.read(4096 * 10)
            fh.seek(0)
            dialect = csv.Sniffer().sniff(sample)
            has_header = csv.Sniffer().has_header(sample)
            reader = csv.reader(fh, dialect=dialect)
            rows = self.generate_rows(reader, has_header=has_header)
            self.emit_row_dicts(entity, rows)
        except UnicodeDecodeError as ude:
            log.warning("Encoding error: %r", entity)
            raise ProcessingException("Could not decode CSV (%s)" % encoding) from ude  # noqa
        except Exception as err:
            log.exception("CSV error: %s", err)
            raise ProcessingException("Invalid CSV: %s" % err) from err
def ingest(self, file_path, entity):
    """Read an OOXML workbook and emit every data sheet as a Table child."""
    entity.schema = model.get("Workbook")
    self.ooxml_extract_metadata(file_path, entity)
    try:
        book = load_workbook(file_path, read_only=True)
    except Exception as err:
        raise ProcessingException("Invalid Excel file: %s" % err) from err
    try:
        for sheet_name in book.sheetnames:
            sheet = book[sheet_name]
            # Chart sheets expose no ``rows`` attribute and cannot be
            # turned into tabular data.
            if not hasattr(sheet, "rows"):
                log.warning("Cannot parse chart sheet: %s", sheet_name)
                continue
            table = self.manager.make_entity("Table", parent=entity)
            table.make_id(entity.id, sheet_name)
            table.set("title", sheet_name)
            log.debug("Sheet: %s", sheet_name)
            self.emit_row_tuples(table, self.generate_rows(sheet))
            # Only emit sheets that actually produced a CSV artefact.
            if table.has("csvHash"):
                self.manager.emit_entity(table)
    except Exception as err:
        raise ProcessingException("Cannot read Excel file: %s" % err) from err
    finally:
        book.close()
def _document_to_pdf(self, file_path, entity):
    """Converts an office document to PDF.

    Posts the file to the unoservice converter, retrying with backoff on
    transport errors. Returns the path of the written PDF work file.

    Raises ProcessingException when the service rejects the document or
    all retries are exhausted; RuntimeError when unconfigured.
    """
    if UNOSERVICE_URL is None:
        raise RuntimeError("No UNOSERVICE_URL for document conversion.")
    log.info('Converting [%s] to PDF...', entity.first('fileName'))
    file_name = entity.first('fileName') or 'data'
    mime_type = entity.first('mimeType') or DEFAULT
    attempt = 1
    for attempt in service_retries():
        try:
            # BUG FIX: the original re-used the name ``fh`` for the output
            # file, so ``finally: fh.close()`` closed the (already-closed)
            # output handle and leaked the input handle. Distinct names and
            # ``with`` blocks close both handles on every path.
            with open(file_path, 'rb') as fh:
                files = {'file': (file_name, fh, mime_type)}
                res = requests.post(UNOSERVICE_URL, files=files,
                                    timeout=(5, 305), stream=True)
            if res.status_code > 399:
                raise ProcessingException(res.text)
            out_path = self.make_work_file('out.pdf')
            bytes_written = 0
            with open(out_path, 'wb') as out_fh:
                for chunk in res.iter_content(chunk_size=None):
                    bytes_written += len(chunk)
                    out_fh.write(chunk)
            # Treat tiny responses as failed conversions and retry.
            if bytes_written > 50:
                return out_path
        except RequestException as exc:
            log.error("Conversion failed: %s", exc)
            backoff(failures=attempt)
    raise ProcessingException("Document could not be converted to PDF.")
def _document_to_pdf(self, file_path, entity):
    """Converts an office document to PDF.

    Retries indefinitely with square-root backoff on transport errors;
    a HTTP 400 from the converter is treated as a permanent failure.

    Raises ProcessingException on permanent conversion failure.
    """
    file_name = entity_filename(entity)
    mime_type = entity.first('mimeType')
    log.info('Converting [%s] to PDF...', file_name)
    for attempt in count(1):
        try:
            with open(file_path, 'rb') as fh:
                files = {'file': (file_name, fh, mime_type)}
                res = requests.post(CONVERT_URL,
                                    params={'timeout': CONVERT_TIMEOUT},
                                    files=files,
                                    timeout=CONVERT_TIMEOUT + 10,
                                    stream=True)
                res.raise_for_status()
                out_path = self.make_work_file('out.pdf')
                bytes_written = 0
                with open(out_path, 'wb') as out_fh:
                    for chunk in res.iter_content(chunk_size=None):
                        bytes_written += len(chunk)
                        out_fh.write(chunk)
                # Treat tiny responses as failed conversions.
                if bytes_written > 50:
                    return out_path
                raise ProcessingException("Could not be converted to PDF.")
        except HTTPError as exc:
            # BUG FIX: the original raised ``ProcessingException(res.text)``,
            # reading the leaked loop variable instead of the response
            # attached to the exception; also chain the cause.
            if exc.response.status_code == 400:
                raise ProcessingException(exc.response.text) from exc
            msg = "Converter not available: %s (attempt: %s)"
            log.info(msg, exc, attempt)
            backoff(failures=math.sqrt(attempt))
        except RequestException as exc:
            msg = "Converter not available: %s (attempt: %s)"
            log.error(msg, exc, attempt)
            backoff(failures=math.sqrt(attempt))
def ingest(self, file_path):
    """Ingest a CSV file: sniff encoding and dialect, then emit rows.

    Raises ProcessingException when the file cannot be decoded or parsed.
    """
    with io.open(file_path, 'rb') as fh:
        encoding = self.detect_stream_encoding(fh)
        log.debug("Detected encoding [%s]: %s", self.result, encoding)
    # Re-open in text mode; undecodable bytes are substituted.
    # IMPROVED: `with` replaces the manual try/finally close, and the
    # re-raises now chain their cause for easier debugging.
    with io.open(file_path, 'r', encoding=encoding, errors='replace') as fh:
        try:
            # Sniff dialect and header presence from a leading sample.
            sample = fh.read(4096 * 10)
            fh.seek(0)
            dialect = csv.Sniffer().sniff(sample)
            has_header = csv.Sniffer().has_header(sample)
            reader = csv.reader(fh, dialect=dialect)
            rows = self.generate_rows(reader, has_header=has_header)
            self.result.flag(self.result.FLAG_TABULAR)
            self.result.emit_rows(rows)
        except UnicodeDecodeError as ude:
            log.warning("Encoding error: %s", self.result)
            raise ProcessingException("Could not decode CSV (%s)" % encoding) from ude
        except Exception as err:
            log.exception("CSV error: %s", err)
            raise ProcessingException("Invalid CSV: %s" % err) from err
def unoconv_to_pdf(self, file_path, temp_dir):
    """Converts an office document to PDF via the unoconv service.

    Retries up to three times with exponential sleep on transport errors.
    Returns the output PDF path; raises ProcessingException on failure.
    """
    if not self.is_unoconv_available():
        raise ConfigurationException("UNOSERVICE_URL is missing.")
    log.info('Converting [%s] to PDF...', self.result)
    file_name = os.path.basename(file_path)
    out_path = join_path(temp_dir, '%s.pdf' % file_name)
    for try_num in range(3):
        try:
            with open(file_path, 'rb') as fh:
                data = {'format': 'pdf', 'doctype': 'document'}
                files = {'file': (file_name, fh, self.UNO_MIME)}
                # http://docs.python-requests.org/en/latest/user/advanced/#chunk-encoded-requests
                res = self.unoconv_client.post(self.get_unoconv_url(),
                                               data=data, files=files,
                                               timeout=300.0, stream=True)
            length = 0
            # BUG FIX: the output file was opened in text mode ('w'),
            # but iter_content() yields bytes — writing would raise a
            # TypeError. Open in binary mode instead.
            with open(out_path, 'wb') as out_fh:
                for chunk in res.iter_content(chunk_size=None):
                    length += len(chunk)
                    out_fh.write(chunk)
            # An empty response means the conversion failed outright.
            if length == 0:
                raise ProcessingException("Could not convert to PDF.")
            return out_path
        except RequestException as re:
            log.exception(re)
            time.sleep(3 ** try_num)
    raise ProcessingException("Could not convert to PDF.")
def _document_to_pdf(self, file_path, result, work_path):
    """Converts an office document to PDF.

    Posts the document to the conversion service, retrying with backoff
    on transport errors; HTTP 400 is treated as a permanent rejection.
    Returns the path of the written PDF.
    """
    log.info('Converting [%s] to PDF...', result.file_name)
    out_path = os.path.basename(file_path)
    out_path = join_path(work_path, '%s.pdf' % out_path)
    file_name = result.file_name or 'data'
    mime_type = result.mime_type or DEFAULT
    attempt = 1
    for attempt in service_retries():
        try:
            # BUG FIX: the original reused ``fh`` for the output file, so
            # ``finally: fh.close()`` closed the wrong handle and leaked
            # the input handle on success. Both handles now use ``with``.
            with open(file_path, 'rb') as fh:
                files = {'file': (file_name, fh, mime_type)}
                res = requests.post(self.SERVICE_URL, files=files,
                                    timeout=(5, 305), stream=True)
            res.raise_for_status()
            with open(out_path, 'wb') as out_fh:
                for chunk in res.iter_content(chunk_size=None):
                    out_fh.write(chunk)
            return out_path
        except RequestException as exc:
            # A 400 is a permanent client error; do not retry.
            if isinstance(exc, HTTPError):
                if exc.response.status_code == 400:
                    raise ProcessingException(exc.response.text) from exc
            log.error("Conversion failed: %s", exc)
            backoff(failures=attempt)
    raise ProcessingException("Document could not be converted to PDF.")
def make_work_file(self, file_name, prefix=None):
    """Return a path for a scratch file inside the manager's work area.

    ``prefix`` optionally names a sub-directory to place the file in; it
    must itself live under ``self.manager.work_path``. Parent directories
    are created as needed. Raises ProcessingException when the resolved
    path would escape the work area.
    """
    if prefix is not None:
        prefix = ensure_path(prefix)
        # Reject prefixes outside the managed work area.
        # NOTE(review): ``parents`` does not include the path itself, so a
        # prefix exactly equal to work_path would be rejected here —
        # presumably callers always pass a sub-directory; verify.
        if self.manager.work_path not in prefix.parents:
            raise ProcessingException("Path escalation: %r" % prefix)
    prefix = prefix or self.manager.work_path
    work_file = prefix.joinpath(file_name)
    # A second check: ``file_name`` containing separators or ``..`` must
    # not resolve outside the chosen prefix.
    if prefix not in work_file.parents:
        raise ProcessingException("Path escalation: %r" % file_name)
    if not work_file.parent.exists():
        work_file.parent.mkdir(parents=True, exist_ok=True)
    return work_file
def ingest(self, file_path):
    """Ingestor implementation: treat the file as plain text."""
    # Prefer the recorded size; fall back to stat'ing the file.
    size = self.result.size or os.path.getsize(file_path)
    if size > self.MAX_SIZE:
        raise ProcessingException("Text file is too large.")
    decoded = self.read_file_decoded(file_path)
    if decoded is None:
        raise ProcessingException("Document could not be decoded.")
    self.result.flag(self.result.FLAG_PLAINTEXT)
    self.extract_plain_text_content(decoded)
def ingest(self, file_path):
    """Ingest an Outlook-for-Mac OPF XML export of a single email.

    Parses the XML, maps OPF properties onto email headers and result
    metadata, then extracts the HTML or plain-text body.
    """
    self.result.flag(self.result.FLAG_EMAIL)
    try:
        doc = self.parse_xml(file_path)
    except TypeError:
        raise ProcessingException("Cannot parse OPF XML file.")
    # An OPF file is expected to describe exactly one message.
    if len(doc.findall('//email')) != 1:
        raise ProcessingException("More than one email in file.")
    email = doc.find('//email')
    props = email.getchildren()
    props = {c.tag: safe_string(c.text) for c in props if c.text}
    # NOTE: Subject is read with ``get`` here (not ``pop``) because it is
    # popped again below for the result title.
    headers = {
        'Subject': props.get('OPFMessageCopySubject'),
        'Message-ID': props.pop('OPFMessageCopyMessageID', None),
        'From': self.get_contacts(email, 'OPFMessageCopyFromAddresses'),
        'Sender': self.get_contacts(email, 'OPFMessageCopySenderAddress'),
        'To': self.get_contacts(email, 'OPFMessageCopyToAddresses'),
        'CC': self.get_contacts(email, 'OPFMessageCopyCCAddresses'),
        'BCC': self.get_contacts(email, 'OPFMessageCopyBCCAddresses'),
    }
    date = props.get('OPFMessageCopySentTime')
    if date is not None:
        # Convert the OPF timestamp to an RFC 2822 date header.
        date = datetime.strptime(date, '%Y-%m-%dT%H:%M:%S')
        date = time.mktime(date.timetuple())
        headers['Date'] = utils.formatdate(date)
    self.result.headers = safe_dict(headers)
    self.update('title', props.pop('OPFMessageCopySubject', None))
    self.update('title', props.pop('OPFMessageCopyThreadTopic', None))
    for tag in ('OPFMessageCopyFromAddresses', 'OPFMessageCopySenderAddress'):
        self.update('author', self.get_contact_name(email, tag))
    self.update('summary', props.pop('OPFMessageCopyPreview', None))
    self.update('created_at', props.pop('OPFMessageCopySentTime', None))
    self.update('modified_at', props.pop('OPFMessageCopyModDate', None))
    body = props.pop('OPFMessageCopyBody', None)
    html = props.pop('OPFMessageCopyHTMLBody', None)
    # '1E0' is the OPF marker for "message has an HTML body".
    has_html = '1E0' == props.pop('OPFMessageGetHasHTML', None)
    if has_html and safe_string(html):
        self.extract_html_content(html)
        self.result.flag(self.result.FLAG_HTML)
    else:
        self.extract_plain_text_content(body)
        self.result.flag(self.result.FLAG_PLAINTEXT)
def exec_command(self, command, *args):
    """Run an external command with a timeout.

    Raises ProcessingException on launch failure, timeout, or a
    non-zero exit status.
    """
    cmd = [self.find_command(command)]
    cmd.extend(args)
    try:
        # BUG FIX: ``open(os.devnull, 'wb')`` leaked a file handle on
        # every call; subprocess.DEVNULL is the managed equivalent.
        retcode = subprocess.call(cmd, timeout=self.COMMAND_TIMEOUT,
                                  stdout=subprocess.DEVNULL)
    except (IOError, OSError) as ose:
        raise ProcessingException('Error: %s' % ose) from ose
    except subprocess.TimeoutExpired as te:
        raise ProcessingException('Processing timed out.') from te
    if retcode != 0:
        raise ProcessingException('Failed: %s' % ' '.join(cmd))
def ingest(self, file_path, entity):
    """Ingest an Outlook ``.msg`` file into an Email entity.

    Extracts headers, subject/body properties, sender and recipient
    identities, threading references, and data attachments.
    """
    entity.schema = model.get("Email")
    try:
        msg = Message(file_path.as_posix())
    except Exception as exc:
        msg = "Cannot open message file: %s" % exc
        raise ProcessingException(msg) from exc
    self.extract_olefileio_metadata(msg, entity)
    # Header parsing is best-effort: failures are logged, not fatal.
    try:
        self.extract_msg_headers(entity, msg.header)
    except Exception:
        log.exception("Cannot parse Outlook-stored headers")
    entity.add("subject", msg.subject)
    # MAPI property 0x0070 is the conversation topic.
    entity.add("threadTopic", msg.getStringField("0070"))
    entity.add("encoding", msg.encoding)
    entity.add("bodyText", msg.body)
    entity.add("bodyHtml", msg.htmlBody)
    entity.add("messageId", self.parse_message_ids(msg.message_id))
    # Fall back to the References header for threading if the parsed
    # headers did not yield an In-Reply-To value.
    if not entity.has("inReplyTo"):
        entity.add("inReplyTo", self.parse_references(msg.references, []))
    try:
        date = parsedate_to_datetime(msg.date).isoformat()
        entity.add("date", date)
    except Exception:
        log.warning("Could not parse date: %s", msg.date)
    # sender name and email
    sender = self.get_identities(msg.sender)
    self.apply_identities(entity, sender, "emitters", "sender")
    # received by
    sender = self.get_identity(msg.getStringField("0040"),
                               msg.getStringField("0076"))
    self.apply_identities(entity, sender, "emitters")
    # MAPI property 0x1046 holds the originating address.
    froms = self.get_identities(msg.getStringField("1046"))
    self.apply_identities(entity, froms, "emitters", "from")
    tos = self.get_identities(msg.to)
    self.apply_identities(entity, tos, "recipients", "to")
    ccs = self.get_identities(msg.cc)
    self.apply_identities(entity, ccs, "recipients", "cc")
    bccs = self.get_identities(msg.bcc)
    self.apply_identities(entity, bccs, "recipients", "bcc")
    self.resolve_message_ids(entity)
    for attachment in msg.attachments:
        # Only plain data attachments are ingested here.
        if attachment.type != "data":
            continue
        name = stringify(attachment.longFilename)
        name = name or stringify(attachment.shortFilename)
        self.ingest_attachment(entity, name, attachment.type, attachment.data)
def ingest(self, file_path, result=None, ingestor_class=None):
    """Main execution step of an ingestor.

    Selects (or accepts) an ingestor class for the file, delegates
    processing, and records the final status on the result, which is
    returned. ProcessingException marks the result failed; any other
    escape path leaves it STOPPED.
    """
    if result is None:
        result = self.RESULT_CLASS(file_path=file_path)
    self.checksum_file(result, file_path)
    self.before(result)
    result.status = Result.STATUS_PENDING
    try:
        if result.size is not None and result.size == 0:
            raise ProcessingException("Document is empty.")
        # Auction: pick the best-scoring ingestor for this file type.
        if ingestor_class is None:
            ingestor_class = self.auction(file_path, result)
        log.debug("Ingestor [%s, %s]: %s", result, result.mime_type,
                  ingestor_class.__name__)
        self.delegate(ingestor_class, result, file_path)
        result.status = Result.STATUS_SUCCESS
    except ProcessingException as pexc:
        result.error_message = stringify(pexc)
        result.status = Result.STATUS_FAILURE
        log.warning("Failed [%s]: %s", result, result.error_message)
    finally:
        # Anything that escaped without setting a terminal status (e.g. an
        # unexpected exception) is recorded as stopped, not failed.
        if result.status == Result.STATUS_PENDING:
            result.status = Result.STATUS_STOPPED
        self.after(result)
    return result
def document_to_pdf(self, file_path, temp_dir):
    """Converts an office document to PDF.

    Prefers the unoconv service when available, otherwise shells out to
    a headless LibreOffice with an isolated user-installation directory.
    """
    if self.is_unoconv_available():
        return self.unoconv_to_pdf(file_path, temp_dir)
    instance_dir = join_path(temp_dir, 'soffice_instance')
    out_dir = join_path(temp_dir, 'soffice_output')
    make_directory(out_dir)
    log.info('Converting [%s] to PDF...', self.result)
    # A private user-installation dir avoids clashing with concurrent
    # soffice instances.
    instance_arg = '-env:UserInstallation=file://{}'.format(instance_dir)
    self.exec_command('soffice', instance_arg,
                      '--nofirststartwizard', '--norestore', '--nologo',
                      '--nodefault', '--nolockcheck', '--invisible',
                      '--headless', '--convert-to', 'pdf',
                      '--outdir', out_dir, file_path)
    produced = os.listdir(out_dir)
    if produced:
        # soffice writes a single output file; return the first entry.
        return join_path(out_dir, produced[0])
    raise ProcessingException("Failed to convert to PDF: {}".format(file_path))
def ingest(self, file_path):
    """Read an OOXML workbook and emit each sheet as a CSV child."""
    self.ooxml_extract_metadata(file_path)
    try:
        book = load_workbook(file_path, read_only=True)
    except Exception as err:
        raise ProcessingException('Invalid Excel file: %s' % err)
    self.result.flag(self.result.FLAG_WORKBOOK)
    try:
        for sheet_name in book.sheetnames:
            sheet_rows = self.generate_csv(book[sheet_name])
            self.csv_child_iter(sheet_rows, sheet_name)
    except Exception as err:
        raise ProcessingException('Cannot read Excel file: %s' % err)
    finally:
        book.close()
def unpack_file(self, file_path, temp_file):
    """Decompress a bzip2 file at ``file_path`` into ``temp_file``.

    Raises ProcessingException (chained to the I/O error) on failure.
    """
    try:
        with bz2.BZ2File(file_path) as src:
            with open(temp_file, "wb") as dst:
                shutil.copyfileobj(src, dst)
    except IOError as ioe:
        # IMPROVED: chain the original error so tracebacks show the cause.
        raise ProcessingException("Error: %s" % ioe) from ioe
def ingest(self, file_path, entity):
    """Ingest an Outlook ``.msg`` file into an Email entity.

    Extracts headers, subject/body properties, sender and recipient
    identities, threading references, and data attachments.
    """
    entity.schema = model.get('Email')
    try:
        msg = Message(file_path.as_posix())
    except Exception as exc:
        msg = "Cannot open message file: %s" % exc
        raise ProcessingException(msg) from exc
    self.extract_olefileio_metadata(msg, entity)
    # Header parsing is best-effort: failures are logged, not fatal.
    try:
        self.extract_msg_headers(entity, msg.header)
    except Exception:
        log.exception("Cannot parse Outlook-stored headers")
    entity.add('subject', msg.subject)
    # MAPI property 0x0070 is the conversation topic.
    entity.add('threadTopic', msg.getStringField('0070'))
    entity.add('encoding', msg.encoding)
    entity.add('bodyText', msg.body)
    entity.add('bodyHtml', msg.htmlBody)
    entity.add('messageId', self.parse_message_ids(msg.message_id))
    # Fall back to the References header for threading if the parsed
    # headers did not yield an In-Reply-To value.
    if not entity.has('inReplyTo'):
        entity.add('inReplyTo', self.parse_references(msg.references, []))
    try:
        date = parsedate_to_datetime(msg.date).isoformat()
        entity.add('date', date)
    except Exception:
        log.warning("Could not parse date: %s", msg.date)
    # sender name and email
    sender = self.get_identities(msg.sender)
    self.apply_identities(entity, sender, 'emitters', 'sender')
    # received by
    sender = self.get_identity(msg.getStringField('0040'),
                               msg.getStringField('0076'))
    self.apply_identities(entity, sender, 'emitters')
    # MAPI property 0x1046 holds the originating address.
    froms = self.get_identities(msg.getStringField('1046'))
    self.apply_identities(entity, froms, 'emitters', 'from')
    tos = self.get_identities(msg.to)
    self.apply_identities(entity, tos, 'recipients', 'to')
    ccs = self.get_identities(msg.cc)
    self.apply_identities(entity, ccs, 'recipients', 'cc')
    bccs = self.get_identities(msg.bcc)
    self.apply_identities(entity, bccs, 'recipients', 'bcc')
    self.resolve_message_ids(entity)
    for attachment in msg.attachments:
        # Only plain data attachments are ingested here.
        if attachment.type != 'data':
            continue
        name = stringify(attachment.longFilename)
        name = name or stringify(attachment.shortFilename)
        self.ingest_attachment(entity, name, attachment.type, attachment.data)
def ingest_message(self, data):
    """Parse a raw RFC 2822 message (via flanker) and ingest its parts.

    Extracts header metadata, title and message ID, then walks all MIME
    parts, ingesting attachments and collecting HTML/plain-text bodies.
    """
    try:
        msg = mime.from_string(data)
        if msg.headers is not None:
            self.extract_headers_metadata(msg.headers.items())
    except DecodingError as derr:
        raise ProcessingException('Cannot parse email: %s' % derr)
    # Subject and Message-ID decoding are best-effort; log and continue.
    try:
        if msg.subject:
            self.update('title', str(msg.subject))
    except DecodingError as derr:
        log.warning("Decoding subject: %s", derr)
    try:
        if msg.message_id:
            self.update('message_id', str(msg.message_id))
    except DecodingError as derr:
        log.warning("Decoding message ID: %s", derr)
    # Reset text content before collecting bodies below.
    self.extract_plain_text_content(None)
    self.result.flag(self.result.FLAG_EMAIL)
    bodies = defaultdict(list)
    for part in msg.walk(with_self=True):
        # Accessing part.body may itself raise while decoding.
        try:
            if part.body is None:
                continue
        except (DecodingError, ValueError) as de:
            log.warning("Cannot decode part [%s]: %s", self.result, de)
            continue
        file_name = part.detected_file_name
        # HACK HACK HACK - WTF flanker?
        # Disposition headers can have multiple filename declarations,
        # flanker decides to concatenate.
        if file_name is not None and len(file_name) > 4:
            half = len(file_name)//2
            if file_name[:half] == file_name[half:]:
                file_name = file_name[:half]
        mime_type = str(part.detected_content_type)
        mime_type = normalize_mimetype(mime_type)
        if part.is_attachment():
            self.ingest_attachment(file_name, mime_type, part.body)
        if part.is_body():
            bodies[mime_type].append(part.body)
    # Prefer nothing: emit both HTML and plain-text bodies when present.
    if 'text/html' in bodies:
        self.extract_html_content('\n\n'.join(bodies['text/html']))
        self.result.flag(self.result.FLAG_HTML)
    if 'text/plain' in bodies:
        self.extract_plain_text_content('\n\n'.join(bodies['text/plain']))
        self.result.flag(self.result.FLAG_PLAINTEXT)
def ingest(self, file_path):
    """Read a legacy Excel (xls) workbook and emit per-sheet CSV children."""
    self.extract_ole_metadata(file_path)
    try:
        book = xlrd.open_workbook(file_path, formatting_info=False)
    except Exception as err:
        raise ProcessingException('Invalid Excel file: %s' % err)
    self.result.flag(self.result.FLAG_WORKBOOK)
    try:
        for sheet in book.sheets():
            sheet_rows = self.generate_csv(sheet)
            self.csv_child_iter(sheet_rows, sheet.name)
    except XLRDError as err:
        raise ProcessingException('Invalid Excel file: %s' % err)
    finally:
        # Read-only workbooks hold memory-mapped resources; always release.
        book.release_resources()
def ingest(self, file_path, entity):
    """Ingest a DBF database file as a Table entity."""
    entity.schema = model.get('Table')
    try:
        table = Table(file_path.as_posix()).open()
        rows = self.generate_rows(table)
        self.emit_row_dicts(entity, rows)
    except DbfError as err:
        raise ProcessingException('Cannot open DBF file: %s' % err) from err  # noqa
def ingest(self, file_path, entity):
    """Extract audio track metadata via MediaInfo onto an Audio entity."""
    try:
        entity.schema = model.get("Audio")
        metadata = MediaInfo.parse(file_path)
        for track in metadata.tracks:
            entity.add("title", track.title)
            entity.add("generator", track.writing_application)
            entity.add("generator", track.writing_library)
            entity.add("generator", track.publisher)
            entity.add("authoredAt", self.parse_timestamp(track.recorded_date))
            entity.add("authoredAt", self.parse_timestamp(track.tagged_date))
            entity.add("authoredAt", self.parse_timestamp(track.encoded_date))
            modified_at = self.parse_timestamp(track.file_last_modification_date)
            entity.add("modifiedAt", modified_at)
            if track.sampling_rate:
                entity.add("samplingRate", track.sampling_rate)
            entity.add("duration", track.duration)
    except Exception as ex:
        # BUG FIX: the original passed the format string and the exception
        # as two separate constructor arguments
        # (`ProcessingException("...%r", ex)`), so the message was never
        # formatted. Format it with the % operator instead.
        raise ProcessingException("Could not read audio: %r" % ex) from ex
def unpack_file(self, file_path, temp_file):
    """Decompress a gzip file at ``file_path`` into ``temp_file``.

    Raises ProcessingException (chained to the I/O error) on failure.
    """
    try:
        with gzip.GzipFile(file_path) as src:
            with open(temp_file, 'wb') as dst:
                shutil.copyfileobj(src, dst)
    except IOError as ioe:
        # IMPROVED: chain the original error so tracebacks show the cause.
        raise ProcessingException('Error: %s' % ioe) from ioe
def unpack(self, file_path, temp_dir):
    """Unpack a RAR archive into ``temp_dir``.

    Raises ProcessingException (chained to the rarfile error) when the
    archive is invalid.
    """
    # FIXME: need to figure out how to unpack multi-part files.
    try:
        with rarfile.RarFile(file_path) as rf:
            self.unpack_members(rf, temp_dir)
    except rarfile.Error as err:
        # IMPROVED: chain the original error so tracebacks show the cause.
        raise ProcessingException('Invalid RAR file: %s' % err) from err
def ingest(self, file_path):
    """Ingest a DBF database file, emitting its rows as tabular data."""
    self.result.flag(self.result.FLAG_TABULAR)
    try:
        table = Table(file_path).open()
        rows = self.generate_rows(table)
        self.result.emit_rows(rows)
    except DbfError as err:
        raise ProcessingException('Cannot open DBF file: %s' % err)
def ingest(self, file_path, entity):
    """Parse a JSON file and emit its collected text fragments."""
    # Enforce the size cap against every recorded fileSize value.
    for file_size in entity.get("fileSize"):
        if int(file_size) > self.MAX_SIZE:
            raise ProcessingException("JSON file is too large.")
    # First pass: detect the character encoding from the raw bytes.
    with open(file_path, "rb") as fh:
        encoding = self.detect_stream_encoding(fh)
    with open(file_path, "r", encoding=encoding) as fh:
        try:
            data = json.load(fh)
            for idx, fragment in enumerate(self._collect_text(data)):
                self.manager.emit_text_fragment(entity, [fragment], idx)
        except Exception as exc:
            raise ProcessingException("Cannot parse JSON file: %s" % exc) from exc
def parse_xml_path(self, file_path, **kwargs):
    """Parse the XML document at ``file_path`` into an element tree.

    Raises ProcessingException when the document cannot be parsed.
    """
    # etree wants a plain string path, not a pathlib.Path.
    if isinstance(file_path, Path):
        file_path = file_path.as_posix()
    try:
        return etree.parse(file_path, self.get_xml_parser(**kwargs))
    except (ParserError, ParseError, XMLSyntaxError) as exc:
        raise ProcessingException("Failed to parse XML: %s" % exc) from exc
def ingest(self, file_path):
    """Ingestor implementation.

    Parses an XML file, extracts its text, converts it to HTML via the
    class's XSLT stylesheet, and emits the HTML body.
    """
    file_size = self.result.size or os.path.getsize(file_path)
    if file_size > self.MAX_SIZE:
        raise ProcessingException("XML file is too large.")
    try:
        doc = etree.parse(file_path)
    except (ParserError, ParseError):
        raise ProcessingException("XML could not be parsed.")
    text = self.extract_html_text(doc.getroot())
    # Transform the XML into a browsable HTML rendering.
    transform = etree.XSLT(self.XSLT)
    html_doc = transform(doc)
    html_body = html.tostring(html_doc, encoding=str, pretty_print=True)
    self.result.flag(self.result.FLAG_HTML)
    self.result.emit_html_body(html_body, text)
def ingest(self, file_path, entity):
    """Ingest an Outlook-for-Mac OPF XML export as an Email entity.

    Maps OPF message properties onto email fields, applies sender and
    recipient identities, and extracts the HTML body when present.
    """
    entity.schema = model.get("Email")
    try:
        doc = self.parse_xml_path(file_path)
    except TypeError as te:
        raise ProcessingException("Cannot parse OPF XML file.") from te
    # An OPF file is expected to describe exactly one message.
    if len(doc.findall("//email")) != 1:
        raise ProcessingException("More than one email in file.")
    email = doc.find("//email")
    props = email.getchildren()
    props = {c.tag: stringify(c.text) for c in props if c.text}
    # from pprint import pformat
    # log.info(pformat(props))
    entity.add("subject", props.pop("OPFMessageCopySubject", None))
    entity.add("threadTopic", props.pop("OPFMessageCopyThreadTopic", None))
    entity.add("summary", props.pop("OPFMessageCopyPreview", None))
    # message IDs are already parsed, no need to clean prior:
    entity.add("messageId", props.pop("OPFMessageCopyMessageID", None))
    entity.add("date", self.get_date(props, "OPFMessageCopySentTime"))
    entity.add("modifiedAt", self.get_date(props, "OPFMessageCopyModDate"))
    senders = self.get_contacts(email, "OPFMessageCopySenderAddress")
    self.apply_identities(entity, senders, "emitters", "sender")
    froms = self.get_contacts(email, "OPFMessageCopyFromAddresses")
    self.apply_identities(entity, froms, "emitters", "from")
    tos = self.get_contacts(email, "OPFMessageCopyToAddresses")
    self.apply_identities(entity, tos, "recipients", "to")
    ccs = self.get_contacts(email, "OPFMessageCopyCCAddresses")
    self.apply_identities(entity, ccs, "recipients", "cc")
    bccs = self.get_contacts(email, "OPFMessageCopyBCCAddresses")
    self.apply_identities(entity, bccs, "recipients", "bcc")
    entity.add("bodyText", props.pop("OPFMessageCopyBody", None))
    html = props.pop("OPFMessageCopyHTMLBody", None)
    # "1E0" is the OPF marker for "message has an HTML body".
    has_html = "1E0" == props.pop("OPFMessageGetHasHTML", None)
    if has_html and stringify(html):
        self.extract_html_content(entity, html, extract_metadata=False)
    self.resolve_message_ids(entity)
def exec_command(self, command, *args):
    """Resolve and run an external command with a timeout.

    Raises RuntimeError when the program cannot be found, and
    ProcessingException on launch failure, timeout, or non-zero exit.
    """
    binary = self.find_command(command)
    if binary is None:
        raise RuntimeError("Program not found: %s" % command)
    cmd = [binary]
    cmd.extend([path_string(a) for a in args])
    try:
        # BUG FIX: ``open(os.devnull, 'wb')`` leaked a file handle on
        # every call; subprocess.DEVNULL is the managed equivalent.
        code = subprocess.call(cmd, timeout=self.COMMAND_TIMEOUT,
                               stdout=subprocess.DEVNULL)
    except (IOError, OSError) as ose:
        raise ProcessingException('Error: %s' % ose) from ose
    except subprocess.TimeoutExpired as timeout:
        raise ProcessingException('Processing timed out.') from timeout
    if code != 0:
        raise ProcessingException('Failed: %s' % ' '.join(cmd))
def ingest(self, file_path, entity):
    """Ingestor implementation.

    Parses an XML file onto a HyperText entity: stores the extracted
    text as bodyText and an XSLT-generated HTML rendering as bodyHtml.
    """
    entity.schema = model.get("HyperText")
    # Enforce the size cap against every recorded fileSize value.
    for file_size in entity.get("fileSize"):
        if int(file_size) > self.MAX_SIZE:
            raise ProcessingException("XML file is too large.")
    doc = self.parse_xml_path(file_path)
    text = self.extract_html_text(doc.getroot())
    entity.set("bodyText", text)
    try:
        # Transform the XML into a browsable HTML rendering.
        transform = etree.XSLT(self.XSLT)
        html_doc = transform(doc)
        html_body = html.tostring(html_doc, encoding=str, pretty_print=True)
        entity.set("bodyHtml", html_body)
    except ValueError as ve:
        raise ProcessingException("Error converting XML file: %s" % ve) from ve