def match(cls, file_path, result=None):
    """Score how well this ingestor class fits the given result.

    The result's MIME type is compared against ``cls.MIME_TYPES`` first;
    only if that misses is the extension of its file name compared against
    ``cls.EXTENSIONS``.

    :param file_path: not consulted by this check.
    :param result: object exposing ``mime_type`` and ``file_name``. Despite
        the ``None`` default it is dereferenced unconditionally.
    :returns: ``cls.SCORE`` on the first hit, ``-1`` when nothing matches.
    """
    # Entries that fail to normalize come back as None and are discarded.
    known_types = {
        normalized
        for normalized in (
            normalize_mimetype(declared, default=None)
            for declared in cls.MIME_TYPES
        )
        if normalized is not None
    }
    if normalize_mimetype(result.mime_type, default=None) in known_types:
        return cls.SCORE

    known_extensions = {
        normalized
        for normalized in (
            normalize_extension(declared) for declared in cls.EXTENSIONS
        )
        if normalized is not None
    }
    if normalize_extension(result.file_name) in known_extensions:
        return cls.SCORE

    return -1
def ingest_message(self, data):
    """Parse a raw e-mail and feed its metadata, bodies and attachments on.

    :param data: the serialized message handed to ``mime.from_string``.
    :raises ProcessingException: if parsing the message or reading its
        headers raises ``DecodingError``.
    """
    try:
        msg = mime.from_string(data)
        headers = msg.headers
        if headers is not None:
            self.extract_headers_metadata(headers.items())
    except DecodingError as derr:
        raise ProcessingException('Cannot parse email: %s' % derr)

    # Subject and message ID are best-effort: a decoding problem is logged
    # and the field is simply left unset.
    try:
        subject = msg.subject
        if subject:
            self.update('title', str(subject))
    except DecodingError as derr:
        log.warning("Decoding subject: %s", derr)

    try:
        message_id = msg.message_id
        if message_id:
            self.update('message_id', str(message_id))
    except DecodingError as derr:
        log.warning("Decoding message ID: %s", derr)

    self.extract_plain_text_content(None)
    self.result.flag(self.result.FLAG_EMAIL)

    # Body payloads grouped by their normalized MIME type.
    bodies = defaultdict(list)
    for part in msg.walk(with_self=True):
        try:
            has_body = part.body is not None
        except (DecodingError, ValueError) as de:
            log.warning("Cannot decode part [%s]: %s", self.result, de)
            continue
        if not has_body:
            continue

        file_name = part.detected_file_name
        # HACK: disposition headers can carry several filename
        # declarations, and flanker concatenates them. If the name is the
        # same string twice in a row, keep just one copy.
        if file_name is not None and len(file_name) > 4:
            middle = len(file_name) // 2
            head, tail = file_name[:middle], file_name[middle:]
            if head == tail:
                file_name = head

        mime_type = normalize_mimetype(str(part.detected_content_type))

        # The two checks are independent; a part may satisfy both.
        if part.is_attachment():
            self.ingest_attachment(file_name, mime_type, part.body)
        if part.is_body():
            bodies[mime_type].append(part.body)

    if 'text/html' in bodies:
        html = '\n\n'.join(bodies['text/html'])
        self.extract_html_content(html)
        self.result.flag(self.result.FLAG_HTML)

    if 'text/plain' in bodies:
        text = '\n\n'.join(bodies['text/plain'])
        self.extract_plain_text_content(text)
        self.result.flag(self.result.FLAG_PLAINTEXT)
async def convert(request):
    """Convert the uploaded ``file`` field to PDF and stream the result.

    Only one conversion runs at a time: the module-level ``lock`` is taken
    without blocking, and a request that cannot get it is answered with
    503 straight away. The lock is released as soon as the converter has
    produced its output, before the PDF is streamed back.

    Query parameters:
        timeout: seconds allowed for the request (default 300); the
            converter is given 5 seconds less, but never under 10.

    :returns: a streamed ``application/pdf`` response on success, a 400
        response for ``ConversionFailure``, or a 503 response for any other
        ``Exception`` raised after the upload was received.
    :raises: whatever reading the form or creating the temporary upload
        file raises; the lock is released before such errors propagate.
    """
    if not lock.acquire(blocking=False):
        return web.Response(status=503)
    # Tracks whether this request still owns the lock, so that it is
    # released exactly once on every path.
    holds_lock = True
    out_file = None

    try:
        data = await request.post()
        upload = data['file']
        fd, upload_file = mkstemp()
    except BaseException:
        # Nothing has reached the converter yet. Give the lock back so the
        # service does not stay stuck on 503, then propagate as before.
        lock.release()
        raise

    try:
        os.close(fd)
        with open(upload_file, 'wb') as fh:
            shutil.copyfileobj(upload.file, fh, BUFFER_SIZE)

        extension = normalize_extension(upload.filename)
        mime_type = normalize_mimetype(upload.content_type, default=None)
        filters = list(FORMATS.get_filters(extension, mime_type))
        timeout = int(request.query.get('timeout', 300))
        timeout = max(10, timeout - 5)

        await converter.prepare()
        await asyncio.sleep(0)
        out_file = converter.convert_file(upload_file, filters,
                                          timeout=timeout)
        out_size = 0
        if os.path.exists(out_file):
            out_size = os.path.getsize(out_file)

        # The converter is no longer needed; let the next request in
        # while this one streams its output.
        lock.release()
        holds_lock = False
        await asyncio.sleep(0)

        response = web.StreamResponse()
        response.content_length = out_size
        response.content_type = 'application/pdf'
        await response.prepare(request)
        with open(out_file, 'rb') as f:
            while True:
                chunk = f.read(BUFFER_SIZE)
                if not chunk:
                    break
                await response.write(chunk)
        return response
    except Exception as exc:
        log.exception('Conversion failed.')
        # Only reset the converter while this request still owns it. Once
        # the lock has been handed back, another request may be using it.
        if holds_lock:
            converter.terminate()
        status = 400 if isinstance(exc, ConversionFailure) else 503
        return web.Response(text=str(exc), status=status)
    finally:
        # Covers the error path above as well as exceptions that
        # ``except Exception`` does not see (e.g. task cancellation).
        if holds_lock:
            lock.release()
        if os.path.exists(upload_file):
            os.remove(upload_file)
        if out_file is not None and os.path.exists(out_file):
            os.remove(out_file)
def mime_type(self):
    """Return the normalized MIME type, or ``None`` if it is the default.

    Sources are tried in order: the ``mime_type`` entry of ``self.meta``,
    a guess from ``self.file_name``, then the ``content_type`` entry of
    ``self.headers``.
    """
    candidate = self.meta.get('mime_type')

    if candidate is None and self.file_name:
        candidate = mimetypes.guess_type(self.file_name)[0]

    # derive mime type from headers
    if candidate is None:
        candidate = self.headers.get('content_type')

    normalized = normalize_mimetype(candidate)
    if normalized == DEFAULT:
        return None
    return normalized
async def convert(request):
    """Convert the uploaded ``file`` field to PDF and stream the result.

    The upload is written to one temporary file and the converter writes
    its output to a second one; both are removed before returning.

    Query parameters:
        timeout: seconds passed to the converter (default 300).

    :returns: a streamed ``application/pdf`` response on success, a 400
        response when conversion fails (including an empty output file),
        or a 503 response after any other error.
    """
    data = await request.post()
    upload = data['file']
    extension = normalize_extension(upload.filename)
    mime_type = normalize_mimetype(upload.content_type, default=None)
    log.info('PDF convert: %s [%s]', upload.filename, mime_type)

    fd, upload_file = mkstemp()
    os.close(fd)
    fd, out_file = mkstemp(suffix='.pdf')
    os.close(fd)

    try:
        with open(upload_file, 'wb') as fh:
            shutil.copyfileobj(upload.file, fh, BUFFER_SIZE)

        filters = list(FORMATS.get_filters(extension, mime_type))
        timeout = int(request.query.get('timeout', 300))

        await asyncio.sleep(0)
        converter.convert_file(upload_file, out_file, filters,
                               timeout=timeout)
        out_size = os.path.getsize(out_file)
        # The output file is pre-created empty, so a size of zero means the
        # converter wrote nothing into it.
        if out_size == 0:
            raise ConversionFailure("Could not convert.")
        await asyncio.sleep(0)

        response = web.StreamResponse()
        response.content_length = out_size
        response.content_type = 'application/pdf'
        await response.prepare(request)
        with open(out_file, 'rb') as f:
            while True:
                chunk = f.read(BUFFER_SIZE)
                if not chunk:
                    break
                await response.write(chunk)
        return response
    except ConversionFailure as fail:
        log.info("Failed to convert: %s", fail)
        return web.Response(text=str(fail), status=400)
    except Exception as exc:
        log.exception('System error: %s.', exc)
        converter.terminate()
        # A handler must always hand back a response; previously this
        # branch fell through and returned None. Only reached if
        # terminate() returns normally.
        return web.Response(text=str(exc), status=503)
    finally:
        # Tolerate files that are already gone so that cleanup cannot
        # replace the response (or the original error) with its own
        # FileNotFoundError.
        for path in (upload_file, out_file):
            if os.path.exists(path):
                os.remove(path)
def __init__(self):
    """Index filter names by media type and by extension.

    Every file in ``self.FILES`` is parsed, and each node found under the
    ``org.openoffice.TypeDetection`` package contributes one filter name to
    ``self.media_types`` and ``self.extensions``.
    """
    self.media_types = defaultdict(list)
    self.extensions = defaultdict(list)

    type_nodes = './*[@oor:package="org.openoffice.TypeDetection"]/node/node'
    for xcd_file in self.FILES:
        tree = etree.parse(xcd_file)
        for tnode in tree.xpath(type_nodes, namespaces=NS):
            # Flatten the node's <prop>/<value> children into a mapping;
            # when a property has several values, the last one wins.
            props = {
                prop.get(NAME): value.text
                for prop in tnode.findall('./prop')
                for value in prop.findall('./value')
            }

            # Fall back to the node's own name if no preferred filter is
            # declared.
            filter_name = props.get('PreferredFilter', tnode.get(NAME))

            media_type = normalize_mimetype(props.get('MediaType'),
                                            default=None)
            if media_type is not None:
                self.media_types[media_type].append(filter_name)

            for ext in parse_extensions(props.get('Extensions')):
                self.extensions[ext].append(filter_name)
def auction(self, file_path, result):
    """Pick the ingestor class that scores highest for the given file.

    :param file_path: path of the item to ingest.
    :param result: result object; its ``mime_type`` and ``manager``
        attributes may be written here.
    :returns: ``DirectoryIngestor`` when ``file_path`` is not a file,
        otherwise the class in ``self.ingestors`` with the highest score
        above zero (the earliest one wins ties).
    :raises ProcessingException: if no ingestor scores above zero.
    """
    if not is_file(file_path):
        result.mime_type = DirectoryIngestor.MIME_TYPE
        return DirectoryIngestor

    # Sniff the file content only when the existing type is not useful.
    if not useful_mimetype(result.mime_type):
        detected = self.MAGIC.from_file(file_path)
        result.mime_type = normalize_mimetype(detected)

    winner, top_score = None, 0
    for candidate in self.ingestors:
        result.manager = self
        score = candidate.match(file_path, result=result)
        if score <= top_score:
            continue
        winner, top_score = candidate, score

    if winner is not None:
        return winner
    raise ProcessingException("Format not supported: %s" % result.mime_type)
def content_type(self):
    """Return the normalized value of the ``content-type`` header."""
    return normalize_mimetype(self.headers.get('content-type'))
def configure(self):
    """Store the normalized form of ``self.value`` in ``self.clean``."""
    normalized = normalize_mimetype(self.value)
    self.clean = normalized