def directory(context, data):
    """Store the collected files to a given directory.

    Copies the rehashed file into the directory returned by
    ``_get_directory_path(context)`` under the name
    ``<content_hash>.<safe file name>`` and writes ``data`` as JSON to
    ``<content_hash>.json`` next to it. The stored name is recorded in
    ``data['_file_name']``.

    Nothing is stored when the rehashed result is not ``ok`` or when ``data``
    has no ``content_hash`` (a warning is emitted in the latter case).
    """
    with context.http.rehash(data) as result:
        if not result.ok:
            return

        content_hash = data.get('content_hash')
        if content_hash is None:
            context.emit_warning("No content hash in data.")
            return

        path = _get_directory_path(context)
        file_name = data.get('file_name', result.file_name)

        # Header names are case-insensitive, so do not rely on the exact
        # 'Content-Type' spelling: try it first (original behaviour), then
        # fall back to a case-insensitive scan of the header keys.
        headers = data.get('headers') or {}
        content_type = headers.get('Content-Type')
        if content_type is None:
            for key, value in headers.items():
                if str(key).lower() == 'content-type':
                    content_type = value
                    break
        mime_type = normalize_mimetype(content_type)

        extension = _get_file_extension(file_name, mime_type)
        file_name = file_name or 'data'
        file_name = safe_filename(file_name, extension=extension)
        file_name = '%s.%s' % (content_hash, file_name)
        data['_file_name'] = file_name

        file_path = os.path.join(path, file_name)
        # Skip the copy when a file with the same name is already present.
        if not os.path.exists(file_path):
            shutil.copyfile(result.file_path, file_path)
            context.log.info("Store [directory]: %s", file_name)

        # The metadata file is rewritten on every call.
        meta_path = os.path.join(path, '%s.json' % content_hash)
        with open(meta_path, 'w') as fh:
            json.dump(data, fh)
def convert():
    """Convert the uploaded ``file`` to PDF and send the result back.

    Returns ``("BUSY", 503)`` when the lock cannot be taken,
    ``("No file uploaded", 400)`` when the request has no ``file`` part,
    the failure message with status 400 on ``ConversionFailure``, and the
    error message with status 500 on any other exception (the converter is
    killed in both failure cases). The lock is always released.
    """
    if not lock.lock():
        return ("BUSY", 503)
    try:
        converter.prepare()
        timeout = int(request.args.get("timeout", MAX_TIMEOUT))
        upload = request.files.get("file")
        if upload is None:
            # Without this guard the attribute access below fails and the
            # client error is reported as a server error (and the converter
            # is killed needlessly).
            return ("No file uploaded", 400)
        file_name = FileName(upload.filename)
        mime_type = normalize_mimetype(upload.mimetype)
        # Derive a missing extension from the MIME type: first via the
        # ``extensions`` mapping, then via ``mimetype_extension``.
        if not file_name.has_extension:
            file_name.extension = extensions.get(mime_type)
        if not file_name.has_extension:
            file_name.extension = mimetype_extension(mime_type)
        upload_file = os.path.join(CONVERT_DIR, file_name.safe())
        log.info("PDF convert: %s [%s]", upload_file, mime_type)
        upload.save(upload_file)
        out_file = converter.convert_file(upload_file, timeout)
        return send_file(out_file, mimetype=PDF)
    except ConversionFailure as ex:
        converter.kill()
        return (str(ex), 400)
    except Exception as ex:
        converter.kill()
        log.warning("Error: %s", ex)
        return (str(ex), 500)
    finally:
        lock.unlock()
def load_mime_extensions():
    """Return a dict mapping normalized media types to a file extension.

    Every file in ``FILES`` is parsed and each type-detection node's
    ``MediaType`` and ``Extensions`` properties are read. Only the first
    space-separated entry of ``Extensions`` is used; nodes lacking either
    property, or whose extension does not normalize, are skipped. Later
    nodes overwrite earlier ones for the same media type.
    """
    type_path = './*[@oor:package="org.openoffice.TypeDetection"]/node/node'
    mapping = {}
    for xcd_file in FILES:
        tree = etree.parse(xcd_file)
        for type_node in tree.xpath(type_path, namespaces=NS):
            # Flatten the <prop>/<value> children into a plain dict; when a
            # property has several values the last one wins.
            props = {
                prop.get(NAME): value.text
                for prop in type_node.findall('./prop')
                for value in prop.findall('./value')
            }
            media_type = normalize_mimetype(props.get('MediaType'),
                                            default=None)
            if media_type is None:
                continue
            raw_extensions = props.get('Extensions')
            if raw_extensions is None:
                continue
            first = raw_extensions.split(' ')[0]
            extension = normalize_extension(first)
            if extension is None:
                continue
            mapping[media_type] = extension
    return mapping
def match(cls, file_path, result=None):
    """Score how well this class matches the given result.

    Returns ``cls.SCORE`` when the result's normalized MIME type is one of
    ``cls.MIME_TYPES`` or the extension of its file name is one of
    ``cls.EXTENSIONS``; returns ``-1`` otherwise, including when no
    ``result`` is supplied. ``file_path`` is not used here.
    """
    # ``result`` defaults to None but is dereferenced below; without any
    # metadata there is nothing to match against.
    if result is None:
        return -1

    mime_types = [normalize_mimetype(m, default=None) for m in cls.MIME_TYPES]
    mime_types = [m for m in mime_types if m is not None]
    mime_type = normalize_mimetype(result.mime_type, default=None)
    if mime_type in mime_types:
        return cls.SCORE

    extensions = [normalize_extension(e) for e in cls.EXTENSIONS]
    extensions = [e for e in extensions if e is not None]
    extension = normalize_extension(result.file_name)
    if extension in extensions:
        return cls.SCORE

    return -1
def convert():
    """Convert the first uploaded file to PDF and send it as ``output.pdf``.

    Returns ``('BUSY', 503)`` when the lock is not acquired within one
    second, ``('No file uploaded', 400)`` when the request carries no files,
    the failure message with status 400 on ``ConversionFailure`` and
    ``('CRASH', 503)`` on any other error. The converter is disposed on
    failure and the lock is always released.
    """
    acquired = lock.acquire(timeout=1)
    if not acquired:
        return ('BUSY', 503)
    try:
        converter.prepare()
        timeout = int(request.args.get('timeout', 7200))
        # Only the first upload is processed: the loop body always returns.
        for upload in request.files.values():
            file_name = FileName(upload.filename)
            mime_type = normalize_mimetype(upload.mimetype)
            # Derive a missing extension from the MIME type: first via the
            # ``extensions`` mapping, then via ``mimetype_extension``.
            if not file_name.has_extension:
                file_name.extension = extensions.get(mime_type)
            if not file_name.has_extension:
                file_name.extension = mimetype_extension(mime_type)
            upload_file = os.path.join(CONVERT_DIR, file_name.safe())
            log.info('PDF convert: %s [%s]', upload_file, mime_type)
            upload.save(upload_file)
            out_file = converter.convert_file(upload_file, timeout)
            return send_file(out_file,
                             mimetype=PDF,
                             attachment_filename='output.pdf')
        return ('No file uploaded', 400)
    except ConversionFailure as ex:
        converter.dispose()
        return (str(ex), 400)
    except (SystemFailure, Exception) as ex:
        converter.dispose()
        # ``Logger.warn`` is a deprecated alias; use ``warning``.
        log.warning('Error: %s', ex)
        return ('CRASH', 503)
    finally:
        lock.release()
def directory(context, data):
    """Store the collected files to a given directory.

    The rehashed file is copied to ``<content_hash>.<safe file name>`` inside
    the directory returned by ``_get_directory_path(context)``, and ``data``
    is dumped as JSON to ``<content_hash>.json`` beside it. The stored name
    is written back to ``data["_file_name"]``. Nothing happens when the
    result is not ``ok``; a warning is emitted when the content hash is
    missing.
    """
    with context.http.rehash(data) as result:
        if not result.ok:
            return

        content_hash = data.get("content_hash")
        if content_hash is None:
            context.emit_warning("No content hash in data.")
            return

        target_dir = _get_directory_path(context)
        original_name = data.get("file_name", result.file_name)

        # Case-insensitive header access for the content type.
        headers = CaseInsensitiveDict(data.get("headers", {}))
        mime_type = normalize_mimetype(headers.get("content-type"))

        extension = _get_file_extension(original_name, mime_type)
        cleaned_name = safe_filename(original_name or "data",
                                     extension=extension)
        stored_name = "%s.%s" % (content_hash, cleaned_name)
        data["_file_name"] = stored_name

        destination = os.path.join(target_dir, stored_name)
        already_stored = os.path.exists(destination)
        if not already_stored:
            shutil.copyfile(result.file_path, destination)
            context.log.info("Store [directory]: %s", stored_name)

        # The metadata file is rewritten on every call.
        meta_path = os.path.join(target_dir, "%s.json" % content_hash)
        with open(meta_path, "w") as meta_fh:
            json.dump(data, meta_fh)
def ingest_message(self, data):
    """Parse a raw email message and extract metadata, bodies, attachments.

    Raises ``ProcessingException`` when the message cannot be parsed.
    Decoding problems in the subject, message ID or individual parts are
    logged and skipped.
    """
    try:
        msg = mime.from_string(data)
        if msg.headers is not None:
            self.extract_headers_metadata(msg.headers.items())
    except DecodingError as derr:
        raise ProcessingException('Cannot parse email: %s' % derr)

    # Subject and message ID are decoded independently, so a failure in
    # one does not prevent the other from being recorded.
    simple_fields = (
        ('subject', 'title', "Decoding subject: %s"),
        ('message_id', 'message_id', "Decoding message ID: %s"),
    )
    for attribute, field, failure_message in simple_fields:
        try:
            value = getattr(msg, attribute)
            if value:
                self.update(field, str(value))
        except DecodingError as derr:
            log.warning(failure_message, derr)

    self.extract_plain_text_content(None)
    self.result.flag(self.result.FLAG_EMAIL)

    bodies = defaultdict(list)
    for part in msg.walk(with_self=True):
        try:
            has_body = part.body is not None
        except (DecodingError, ValueError) as de:
            log.warning("Cannot decode part [%s]: %s", self.result, de)
            continue
        if not has_body:
            continue

        file_name = part.detected_file_name
        # HACK: a disposition header may declare the file name more than
        # once, and flanker concatenates the declarations. If the name is
        # the same string repeated twice, keep only one copy.
        if file_name is not None and len(file_name) > 4:
            middle = len(file_name) // 2
            front, back = file_name[:middle], file_name[middle:]
            if front == back:
                file_name = front

        mime_type = normalize_mimetype(str(part.detected_content_type))

        if part.is_attachment():
            self.ingest_attachment(file_name, mime_type, part.body)

        if part.is_body():
            bodies[mime_type].append(part.body)

    if 'text/html' in bodies:
        html = '\n\n'.join(bodies['text/html'])
        self.extract_html_content(html)
        self.result.flag(self.result.FLAG_HTML)

    if 'text/plain' in bodies:
        plain = '\n\n'.join(bodies['text/plain'])
        self.extract_plain_text_content(plain)
        self.result.flag(self.result.FLAG_PLAINTEXT)
def clean_text(
    self,
    text: str,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    """Return the normalized MIME type, or ``None`` if it equals ``DEFAULT``.

    ``fuzzy``, ``format`` and ``proxy`` are accepted but not used here.
    """
    normalized = normalize_mimetype(text)
    if normalized == DEFAULT:
        return None
    return normalized
def get_safe_file_obj(uploaded_file):
    """Save an upload to a temporary file and describe it.

    Returns a dict with ``file_obj`` (path of the temporary file the upload
    was saved to) and ``file_name`` (the safe form of the upload's name).
    """
    name = FileName(uploaded_file.filename)
    mime_type = normalize_mimetype(uploaded_file.mimetype)

    # Fill in a missing extension from the MIME type, trying the
    # ``extensions`` mapping before ``mimetype_extension``.
    for lookup in (extensions.get, mimetype_extension):
        if name.has_extension:
            break
        name.extension = lookup(mime_type)

    handle, temp_path = mkstemp(suffix=name.safe())
    # Only the path is needed; the upload object does the writing itself.
    os.close(handle)
    log.info('PDF convert: %s [%s]', temp_path, mime_type)
    uploaded_file.save(temp_path)
    return {'file_obj': temp_path, 'file_name': name.safe()}
def extract_msg_body(self, entity, part):
    """Add the textual body of a message part to ``entity``.

    Attachments and multipart containers are ignored. HTML content is passed
    to ``extract_html_content``; plain text is added as ``bodyText``.
    """
    if not (part.is_attachment() or part.is_multipart()):
        mime_type = normalize_mimetype(part.get_content_type())
        raw = part.get_payload(decode=True)
        text = self.decode_string(raw, part.get_content_charset())

        # Substring checks on the normalized type, not equality.
        if 'text/html' in mime_type:
            self.extract_html_content(entity, text, extract_metadata=False)
        if 'text/plain' in mime_type:
            entity.add('bodyText', text)
def mime_type(self):
    """Return the best-known MIME type, or ``None`` if only the default.

    Sources are tried in order: the ``mime_type`` entry of ``self.meta``, a
    guess from ``self.file_name``, and finally the ``content_type`` entry of
    ``self.headers``.
    """
    candidate = self.meta.get('mime_type')

    if candidate is None and self.file_name:
        candidate = mimetypes.guess_type(self.file_name)[0]

    if candidate is None:
        # derive mime type from headers
        # NOTE(review): the key is 'content_type' (underscore) - confirm it
        # matches how the headers mapping is populated.
        candidate = self.headers.get('content_type')

    normalized = normalize_mimetype(candidate)
    return None if normalized == DEFAULT else normalized
def extract_msg_body(self, entity, part):
    """Add the textual body of a message part to ``entity``.

    Attachments and multipart containers are ignored. When the part declares
    a charset the payload is decoded with it (undecodable bytes are
    replaced); an unknown charset name falls back to UTF-8 instead of
    raising. HTML content is passed to ``extract_html_content``; plain text
    is added as ``bodyText``.
    """
    if part.is_attachment() or part.is_multipart():
        return
    mime_type = normalize_mimetype(part.get_content_type())
    payload = part.get_payload(decode=True)
    charset = part.get_content_charset()
    if charset is not None:
        # TODO: do we want to do chardet after decoding fails?
        try:
            payload = payload.decode(charset, 'replace')
        except LookupError:
            # The declared charset is not a codec Python knows; without
            # this fallback a bogus charset label aborts the whole message.
            payload = payload.decode('utf-8', 'replace')
    # NOTE(review): without a declared charset the payload is forwarded
    # undecoded - confirm the consumers below accept that.
    if 'text/html' in mime_type:
        self.extract_html_content(entity, payload, extract_metadata=False)
    if 'text/plain' in mime_type:
        entity.add('bodyText', payload)
def match(cls, file_path, entity):
    """Score how well this class matches the given entity.

    Returns ``cls.SCORE`` when any ``mimeType`` value of the entity is one of
    the normalized ``cls.MIME_TYPES``, or when the extension of any
    ``fileName`` value is one of the normalized ``cls.EXTENSIONS``.
    Returns ``-1`` otherwise. ``file_path`` is not used here.
    """
    normalized_types = (
        normalize_mimetype(m, default=None) for m in cls.MIME_TYPES
    )
    supported_types = [m for m in normalized_types if m is not None]
    if any(m in supported_types for m in entity.get("mimeType")):
        return cls.SCORE

    # Extensions are only normalized once the MIME type check has failed.
    supported_exts = [normalize_extension(e) for e in cls.EXTENSIONS]
    for name in entity.get("fileName"):
        ext = normalize_extension(name)
        if ext is None:
            continue
        if ext in supported_exts:
            return cls.SCORE

    return -1
async def convert(request):
    """Convert the uploaded ``file`` to PDF and stream the result back.

    The upload is copied to a temporary file, converted into a second
    temporary file, and that file is streamed in ``BUFFER_SIZE`` chunks.
    A ``ConversionFailure`` (including an empty output file) yields a 400
    response; any other exception is logged, the converter is terminated
    and a 500 response is returned. Both temporary files are removed in
    every case.
    """
    data = await request.post()
    upload = data['file']
    extension = normalize_extension(upload.filename)
    mime_type = normalize_mimetype(upload.content_type, default=None)
    log.info('PDF convert: %s [%s]', upload.filename, mime_type)

    # Only the paths are needed; close the descriptors mkstemp opened.
    fd, upload_file = mkstemp()
    os.close(fd)
    fd, out_file = mkstemp(suffix='.pdf')
    os.close(fd)
    log.info('Source: %s, target: %s', upload_file, out_file)

    try:
        with open(upload_file, 'wb') as fh:
            shutil.copyfileobj(upload.file, fh, BUFFER_SIZE)

        filters = list(FORMATS.get_filters(extension, mime_type))
        timeout = int(request.query.get('timeout', 300))

        # Yield to the event loop around the conversion call.
        await asyncio.sleep(0)
        converter.convert_file(upload_file, out_file, filters,
                               timeout=timeout)
        out_size = os.path.getsize(out_file)
        if out_size == 0:
            raise ConversionFailure("No PDF version was generated.")
        await asyncio.sleep(0)

        response = web.StreamResponse()
        response.content_length = out_size
        response.content_type = 'application/pdf'
        await response.prepare(request)
        with open(out_file, 'rb') as f:
            while True:
                chunk = f.read(BUFFER_SIZE)
                if not chunk:
                    break
                await response.write(chunk)
        return response
    except ConversionFailure as fail:
        log.info("Failed to convert: %s", fail)
        return web.Response(text=str(fail), status=400)
    except Exception as exc:
        log.exception('System error: %s.', exc)
        converter.terminate()
        # Always hand a response back: falling off the end of the handler
        # would return None, which is not a valid handler result.
        return web.Response(text='System error.', status=500)
    finally:
        # Remove each temporary file independently so that a failure on the
        # first does not leak the second or mask the response.
        for temp_path in (upload_file, out_file):
            try:
                os.remove(temp_path)
            except OSError as err:
                log.warning('Cannot remove temporary file %s: %s',
                            temp_path, err)
def convert():
    """Convert the first uploaded file to the requested output format.

    The target format is read from the ``format`` form field and must be in
    ``LIBREOFFICE_EXPORT_TYPES``. Returns ``("BUSY", 503)`` when the lock is
    not acquired within one second or the app is marked dead, a 400 response
    for an unsupported format, a missing upload or a ``ConversionFailure``,
    and ``('FAIL', 503)`` for any other error. Conversion errors mark the app
    as dead. The lock is released on every path once it has been acquired.
    """
    acquired = lock.acquire(timeout=1)
    if not acquired:
        return ("BUSY", 503)

    # Everything after a successful acquire runs under try/finally: the
    # early exits below used to return (or raise) with the lock still held.
    try:
        if app.is_dead:
            return ("BUSY", 503)

        timeout = int(request.args.get('timeout', 1000))
        output_format = request.form.get('format')
        if output_format not in LIBREOFFICE_EXPORT_TYPES:
            return ("%s format is not supported" % (output_format), 400)

        upload_file = None
        try:
            # Only the first upload is processed: the loop body returns.
            for upload in request.files.values():
                file_name = FileName(upload.filename)
                mime_type = normalize_mimetype(upload.mimetype)
                # Derive a missing extension from the MIME type.
                if not file_name.has_extension:
                    file_name.extension = extensions.get(mime_type)
                if not file_name.has_extension:
                    file_name.extension = mimetype_extension(mime_type)
                fd, upload_file = mkstemp(suffix=file_name.safe())
                os.close(fd)
                log.info('Convert to %s: %s [%s]',
                         output_format, upload_file, mime_type)
                upload.save(upload_file)
                converter.convert_file(upload_file, output_format, timeout)
                output_filename = "%s.%s" % (converter.OUT, output_format)
                log.info("Send file %s [Mime-type: %s]" %
                         (output_filename, OUTPUT_MIME_TYPES[output_format]))
                return send_file(output_filename,
                                 mimetype=OUTPUT_MIME_TYPES[output_format],
                                 attachment_filename=output_filename)
            return ('No file uploaded', 400)
        except HTTPException:
            raise
        except ConversionFailure as ex:
            app.is_dead = True
            return (str(ex), 400)
        except Exception as ex:
            app.is_dead = True
            log.error('Error: %s', ex)
            return ('FAIL', 503)
        finally:
            if upload_file is not None and os.path.exists(upload_file):
                os.unlink(upload_file)
            if os.path.exists(converter.OUT):
                os.unlink(converter.OUT)
    finally:
        lock.release()
def parse_part(self, entity, part):
    """Route a single message part to attachment or body handling.

    Multipart containers are skipped. A part counts as an attachment when it
    says so itself, when it carries a file name, or when its MIME type is not
    in ``self.BODY_TYPES``. Remaining parts are handled as HTML or plain-text
    bodies; anything else is logged as a dangling fragment.
    """
    if part.is_multipart():
        return

    mime_type = normalize_mimetype(part.get_content_type())
    file_name = part.get_filename()
    treat_as_attachment = (
        part.is_attachment()
        or file_name is not None
        or mime_type not in self.BODY_TYPES
    )

    if treat_as_attachment:
        raw = part.get_payload(decode=True)
        self.ingest_attachment(entity, file_name, mime_type, raw)
        return

    if self.BODY_HTML in mime_type:
        html = self.decode_part(part)
        self.extract_html_content(entity, html, extract_metadata=False)
        return

    if self.BODY_PLAIN in mime_type:
        entity.add('bodyText', self.decode_part(part))
        return

    log.error("Dangling MIME fragment: %s", part)
def ingest(self, file_path, entity):
    """Read an email file and populate ``entity`` from it.

    The entity's schema is set to ``Email``, headers and body are extracted
    from the top-level message, and every part yielded by ``msg.walk()`` is
    checked for body content and attachments.

    Raises ``ProcessingException`` when the file cannot be parsed.
    """
    entity.schema = model.get('Email')

    try:
        with open(file_path, 'rb') as handle:
            message = email.message_from_binary_file(handle, policy=default)
    except MessageError as err:
        raise ProcessingException('Cannot parse email: %s' % err) from err

    self.extract_msg_headers(entity, message)
    self.extract_msg_body(entity, message)
    self.resolve_message_ids(entity)

    for part in message.walk():
        self.extract_msg_body(entity, part)
        if not part.is_attachment():
            continue
        mime_type = normalize_mimetype(part.get_content_type())
        content = part.get_payload(decode=True)
        attachment_name = part.get_filename()
        self.ingest_attachment(entity, attachment_name, mime_type, content)
def convert():
    """Convert the first uploaded file to PDF with LibreOffice.

    Returns ``("DEAD", 500)`` when the app is marked dead, ``("BUSY", 503)``
    when the lock is not acquired within one second, ``('No file uploaded',
    400)`` when the request carries no files, a 400 response on
    ``ConversionFailure`` and ``('FAIL', 503)`` on any other error (both
    mark the app as dead). The temporary upload and ``/tmp/output.pdf`` are
    removed and the lock is released in every case.
    """
    if app.is_dead:
        return ("DEAD", 500)
    upload_file = None
    acquired = lock.acquire(timeout=1)
    if not acquired:
        return ("BUSY", 503)
    try:
        timeout = int(request.args.get('timeout', 100))
        # Only the first upload is processed: the loop body returns.
        for upload in request.files.values():
            file_name = FileName(upload.filename)
            mime_type = normalize_mimetype(upload.mimetype)
            # Derive a missing extension from the MIME type.
            if not file_name.has_extension:
                file_name.extension = extensions.get(mime_type)
            if not file_name.has_extension:
                file_name.extension = mimetype_extension(mime_type)
            fd, upload_file = mkstemp(suffix=file_name.safe())
            os.close(fd)
            log.info('PDF convert: %s [%s]', upload_file, mime_type)
            upload.save(upload_file)
            log.info('About to begin conversion.')
            # Pass an argument list instead of a shell string: the path is
            # derived from the client-supplied file name and must never be
            # interpreted by a shell.
            call(['libreoffice', '--headless', '--convert-to', 'pdf',
                  '--outdir', '/tmp/', upload_file])
            # NOTE(review): the result is read from /tmp/output.pdf while
            # the input has a generated temp name - confirm the converter
            # really writes to that path.
            return send_file('/tmp/output.pdf',
                             mimetype='application/pdf',
                             attachment_filename='output.pdf')
        return ('No file uploaded', 400)
    except HTTPException:
        raise
    except ConversionFailure as ex:
        app.is_dead = True
        return (str(ex), 400)
    except Exception as ex:
        app.is_dead = True
        log.error('Error: %s', ex)
        return ('FAIL', 503)
    finally:
        if upload_file is not None and os.path.exists(upload_file):
            os.unlink(upload_file)
        if os.path.exists('/tmp/output.pdf'):
            os.unlink('/tmp/output.pdf')
        lock.release()
def auction(self, file_path, result):
    """Pick the ingestor class with the highest match score for a file.

    Paths that are not files go straight to ``DirectoryIngestor``. When the
    result's MIME type is not useful it is re-detected from the file via
    ``self.MAGIC``. Only scores greater than zero can win.

    Raises ``ProcessingException`` when no ingestor scores above zero.
    """
    if not is_file(file_path):
        result.mime_type = DirectoryIngestor.MIME_TYPE
        return DirectoryIngestor

    if not useful_mimetype(result.mime_type):
        detected = self.MAGIC.from_file(file_path)
        result.mime_type = normalize_mimetype(detected)

    winner = None
    top_score = 0
    for candidate in self.ingestors:
        result.manager = self
        candidate_score = candidate.match(file_path, result=result)
        # Strict comparison: on a tie the earlier candidate stays.
        if candidate_score > top_score:
            top_score, winner = candidate_score, candidate

    if winner is not None:
        return winner
    raise ProcessingException("Format not supported: %s" % result.mime_type)
def __init__(self):
    """Index filter names by media type and by extension.

    Every file in ``self.FILES`` is parsed; for each type-detection node the
    filter name (its ``PreferredFilter`` property, else the node's own name
    attribute) is appended to ``self.media_types`` under the normalized media
    type and to ``self.extensions`` under each parsed extension.
    """
    self.media_types = defaultdict(list)
    self.extensions = defaultdict(list)
    type_path = './*[@oor:package="org.openoffice.TypeDetection"]/node/node'

    for xcd_file in self.FILES:
        tree = etree.parse(xcd_file)
        for type_node in tree.xpath(type_path, namespaces=NS):
            # Flatten the <prop>/<value> children into a plain dict; when a
            # property has several values the last one wins.
            props = {
                prop.get(NAME): value.text
                for prop in type_node.findall('./prop')
                for value in prop.findall('./value')
            }

            filter_name = props.get('PreferredFilter', type_node.get(NAME))

            media_type = normalize_mimetype(props.get('MediaType'),
                                            default=None)
            if media_type is not None:
                self.media_types[media_type].append(filter_name)

            for ext in self.parse_extensions(props.get('Extensions')):
                self.extensions[ext].append(filter_name)
def convert():
    """Convert the uploaded ``file`` to PDF via ``unoconv`` and send it.

    Returns ``("BUSY", 503)`` when the lock is not acquired within two
    seconds, ``("DEAD", 503)`` when the listener process has terminated,
    and a 400 response when the conversion fails or times out (a timeout
    also marks the app as dead). The temporary upload is removed and the
    lock released in every case.
    """
    acquired = lock.acquire(timeout=2)
    if not acquired:
        return ("BUSY", 503)
    upload_file = None
    try:
        # Drop any stale output from a previous run.
        if os.path.exists(OUT_PATH):
            os.unlink(OUT_PATH)
        upload = request.files['file']
        extension = normalize_extension(upload.filename)
        mime_type = normalize_mimetype(upload.mimetype, default=None)
        if extension is None:
            extension = extensions.get(mime_type)
        log.info('PDF convert: %s [%s]', upload.filename, mime_type)
        # Avoid a literal ".None" suffix when no extension is known.
        suffix = '.%s' % extension if extension is not None else ''
        fd, upload_file = mkstemp(suffix=suffix)
        # The context manager closes the handle even if saving fails.
        with os.fdopen(fd, mode='wb') as fh:
            upload.save(fh)
        if listener.poll() is not None:
            log.error("Listener has terminated.")
            app.is_dead = True
            return ("DEAD", 503)

        args = [
            'unoconv', '-f', 'pdf',
            '-o', OUT_PATH,
            '-i', 'MacroExecutionMode=0',
            '-i', 'ReadOnly=1',
            '-e', 'SelectPdfVersion=1',
            '-e', 'MaxImageResolution=300',
            '--no-launch',
            upload_file
        ]
        err = subprocess.call(args, timeout=TIMEOUT)
        if err != 0 or not os.path.exists(OUT_PATH):
            return ('The document could not be converted to PDF.', 400)
        return send_file(OUT_PATH)
    except subprocess.TimeoutExpired:
        log.error("Timeout exceeded: %s", upload.filename)
        app.is_dead = True
        return ('Processing the document timed out.', 400)
    finally:
        # The temporary upload used to be left behind on every request.
        try:
            if upload_file is not None and os.path.exists(upload_file):
                os.unlink(upload_file)
        finally:
            lock.release()
def clean_text(self, text, **kwargs):
    """Return the normalized MIME type, or ``None`` if it equals ``DEFAULT``.

    Extra keyword arguments are accepted but not used here.
    """
    normalized = normalize_mimetype(text)
    return None if normalized == DEFAULT else normalized
def content_type(self):
    """Return the normalized value of the ``content-type`` header."""
    return normalize_mimetype(self.headers.get('content-type'))
def store(self, file_path, mime_type=None):
    """Archive a file and return the archive's result.

    Returns ``None`` without archiving when ``ensure_path`` yields ``None``
    or the path is not a regular file.
    """
    path = ensure_path(file_path)
    normalized_type = normalize_mimetype(mime_type)
    if path is None or not path.is_file():
        return None
    return self.archive.archive_file(path, mime_type=normalized_type)
def configure(self):
    """Store the normalized form of ``self.value`` in ``self.clean``."""
    normalized = normalize_mimetype(self.value)
    self.clean = normalized