コード例 #1
0
def directory(context, data):
    """Store the collected files to a given directory.

    The file at ``result.file_path`` is copied into the directory returned
    by ``_get_directory_path(context)`` under the name
    ``<content_hash>.<safe file name>`` (unless a file with that name is
    already there), and ``data`` is dumped next to it as
    ``<content_hash>.json``. The generated name is recorded in
    ``data['_file_name']`` before the metadata is written.

    Nothing is stored when the rehashed result is not ok, or when ``data``
    has no ``content_hash`` (a warning is emitted in that case).
    """
    with context.http.rehash(data) as result:
        if not result.ok:
            return

        content_hash = data.get('content_hash')
        if content_hash is None:
            context.emit_warning("No content hash in data.")
            return

        path = _get_directory_path(context)
        file_name = data.get('file_name', result.file_name)
        # HTTP header names are case-insensitive, so look the content type
        # up by its lowercased name instead of one exact spelling. A missing
        # or empty ``headers`` entry is treated as "no headers".
        headers = {
            str(name).lower(): value
            for name, value in (data.get('headers') or {}).items()
        }
        mime_type = normalize_mimetype(headers.get('content-type'))
        extension = _get_file_extension(file_name, mime_type)
        file_name = file_name or 'data'
        file_name = safe_filename(file_name, extension=extension)
        file_name = '%s.%s' % (content_hash, file_name)
        data['_file_name'] = file_name
        file_path = os.path.join(path, file_name)
        if not os.path.exists(file_path):
            shutil.copyfile(result.file_path, file_path)

        context.log.info("Store [directory]: %s", file_name)
        meta_path = os.path.join(path, '%s.json' % content_hash)
        with open(meta_path, 'w') as fh:
            json.dump(data, fh)
コード例 #2
0
ファイル: app.py プロジェクト: vishalbelsare/ingestors
def convert():
    """Convert the uploaded ``file`` to PDF and send the result back.

    Returns ``("BUSY", 503)`` when the conversion lock cannot be taken,
    ``("No file uploaded", 400)`` when the request has no ``file`` part,
    the failure message with status 400 on ``ConversionFailure`` and the
    error message with status 500 on any other exception. The converter is
    killed after either kind of failure and the lock is always released.
    """
    upload_file = None
    if not lock.lock():
        return ("BUSY", 503)
    try:
        converter.prepare()
        timeout = int(request.args.get("timeout", MAX_TIMEOUT))
        upload = request.files.get("file")
        if upload is None:
            # A missing upload is a client error. Without this guard it
            # surfaced as an AttributeError below, which killed the
            # converter and was reported as a 500.
            return ("No file uploaded", 400)
        file_name = FileName(upload.filename)
        mime_type = normalize_mimetype(upload.mimetype)
        # Fill in a missing extension, first from the ``extensions``
        # mapping and then from ``mimetype_extension``.
        if not file_name.has_extension:
            file_name.extension = extensions.get(mime_type)
        if not file_name.has_extension:
            file_name.extension = mimetype_extension(mime_type)
        upload_file = os.path.join(CONVERT_DIR, file_name.safe())
        log.info("PDF convert: %s [%s]", upload_file, mime_type)
        upload.save(upload_file)
        out_file = converter.convert_file(upload_file, timeout)
        return send_file(out_file, mimetype=PDF)
    except ConversionFailure as ex:
        converter.kill()
        return (str(ex), 400)
    except Exception as ex:
        converter.kill()
        log.warning("Error: %s", ex)
        return (str(ex), 500)
    finally:
        lock.unlock()
コード例 #3
0
def load_mime_extensions():
    """Build a media type -> file extension mapping from the XCD files.

    Every type node under the ``org.openoffice.TypeDetection`` package of
    each file in ``FILES`` is inspected. Nodes without a usable
    ``MediaType`` or without an ``Extensions`` property are skipped. Only
    the first space-separated extension is used, and a later node
    overwrites an earlier one that has the same media type.
    """
    xpath = './*[@oor:package="org.openoffice.TypeDetection"]/node/node'
    mapping = {}
    for xcd_file in FILES:
        tree = etree.parse(xcd_file)
        for type_node in tree.xpath(xpath, namespaces=NS):
            # Flatten the <prop>/<value> children into a plain dict; the
            # last <value> of a property wins.
            props = {}
            for prop in type_node.findall('./prop'):
                key = prop.get(NAME)
                for value in prop.findall('./value'):
                    props[key] = value.text

            media_type = normalize_mimetype(props.get('MediaType'),
                                            default=None)
            raw_extensions = props.get('Extensions')
            if media_type is None or raw_extensions is None:
                continue

            first_extension = normalize_extension(raw_extensions.split(' ')[0])
            if first_extension is not None:
                mapping[media_type] = first_extension
    return mapping
コード例 #4
0
ファイル: base.py プロジェクト: resero-labs/ingestors
    def match(cls, file_path, result=None):
        """Score how well this ingestor fits the file described by ``result``.

        Returns ``cls.SCORE`` when the normalised MIME type of ``result`` is
        one of ``cls.MIME_TYPES``, or when the normalised extension of
        ``result.file_name`` is one of ``cls.EXTENSIONS``. Returns ``-1``
        otherwise, including when no ``result`` is supplied.
        """
        if result is None:
            # ``result`` is optional in the signature, so a missing one
            # means "nothing to match on" rather than an AttributeError.
            return -1

        # Entries that do not normalise to a value are dropped so that an
        # unknown (None) type or extension on ``result`` never matches.
        mime_types = {
            normalize_mimetype(m, default=None) for m in cls.MIME_TYPES
        }
        mime_types.discard(None)
        mime_type = normalize_mimetype(result.mime_type, default=None)
        if mime_type in mime_types:
            return cls.SCORE

        extensions = {normalize_extension(e) for e in cls.EXTENSIONS}
        extensions.discard(None)
        extension = normalize_extension(result.file_name)
        if extension in extensions:
            return cls.SCORE

        return -1
コード例 #5
0
def convert():
    """Convert the first uploaded file to PDF and send it as ``output.pdf``.

    Returns ``('BUSY', 503)`` when the lock is not acquired within one
    second, ``('No file uploaded', 400)`` when the request carries no
    files, the failure message with status 400 on ``ConversionFailure``
    and ``('CRASH', 503)`` on any other error. The converter is disposed
    of after either kind of failure and the lock is always released.
    """
    upload_file = None
    acquired = lock.acquire(timeout=1)
    if not acquired:
        return ('BUSY', 503)
    try:
        converter.prepare()
        timeout = int(request.args.get('timeout', 7200))
        # Only the first upload is handled: the loop body always returns.
        for upload in request.files.values():
            file_name = FileName(upload.filename)
            mime_type = normalize_mimetype(upload.mimetype)
            # Fill in a missing extension, first from the ``extensions``
            # mapping and then from ``mimetype_extension``.
            if not file_name.has_extension:
                file_name.extension = extensions.get(mime_type)
            if not file_name.has_extension:
                file_name.extension = mimetype_extension(mime_type)
            upload_file = os.path.join(CONVERT_DIR, file_name.safe())
            log.info('PDF convert: %s [%s]', upload_file, mime_type)
            upload.save(upload_file)
            out_file = converter.convert_file(upload_file, timeout)
            return send_file(out_file,
                             mimetype=PDF,
                             attachment_filename='output.pdf')
        return ('No file uploaded', 400)
    except ConversionFailure as ex:
        converter.dispose()
        return (str(ex), 400)
    except (SystemFailure, Exception) as ex:
        converter.dispose()
        # ``Logger.warn`` is a deprecated alias that newer Python versions
        # no longer provide; ``warning`` is the supported spelling.
        log.warning('Error: %s', ex)
        return ('CRASH', 503)
    finally:
        lock.release()
コード例 #6
0
def directory(context, data):
    """Persist a collected file and its metadata in the target directory.

    The file is stored as ``<content_hash>.<safe file name>`` and ``data``
    is dumped alongside it as ``<content_hash>.json``. Returns early when
    the rehashed result is not ok or ``data`` has no ``content_hash``.
    """
    with context.http.rehash(data) as result:
        if not result.ok:
            return

        digest = data.get("content_hash")
        if digest is None:
            context.emit_warning("No content hash in data.")
            return

        target_dir = _get_directory_path(context)
        original_name = data.get("file_name", result.file_name)
        # Header lookup ignores the case of the header name.
        headers = CaseInsensitiveDict(data.get("headers", {}))
        mime_type = normalize_mimetype(headers.get("content-type"))
        extension = _get_file_extension(original_name, mime_type)
        safe_name = safe_filename(original_name or "data", extension=extension)
        stored_name = "%s.%s" % (digest, safe_name)
        data["_file_name"] = stored_name

        # An already stored copy is left in place, never overwritten.
        target = os.path.join(target_dir, stored_name)
        if not os.path.exists(target):
            shutil.copyfile(result.file_path, target)

        context.log.info("Store [directory]: %s", stored_name)
        with open(os.path.join(target_dir, "%s.json" % digest), "w") as fh:
            json.dump(data, fh)
コード例 #7
0
ファイル: msg.py プロジェクト: bobquest33/ingestors
    def ingest_message(self, data):
        """Parse an email from ``data`` and ingest its metadata and parts.

        Headers, subject and message ID are recorded first. Every part of
        the message (including the message itself) is then visited:
        attachments are passed to ``ingest_attachment`` and body parts are
        collected by normalised MIME type. Collected ``text/html`` and
        ``text/plain`` bodies are joined with blank lines and extracted.

        Raises:
            ProcessingException: if parsing the message or reading its
                headers raises ``DecodingError``.
        """
        try:
            msg = mime.from_string(data)
            if msg.headers is not None:
                self.extract_headers_metadata(msg.headers.items())
        except DecodingError as derr:
            raise ProcessingException('Cannot parse email: %s' % derr)

        # Subject and message ID are handled separately so that a decoding
        # failure in one is only logged and does not abort the ingest.
        try:
            if msg.subject:
                self.update('title', str(msg.subject))
        except DecodingError as derr:
            log.warning("Decoding subject: %s", derr)

        try:
            if msg.message_id:
                self.update('message_id', str(msg.message_id))
        except DecodingError as derr:
            log.warning("Decoding message ID: %s", derr)

        self.extract_plain_text_content(None)
        self.result.flag(self.result.FLAG_EMAIL)
        # Body payloads grouped by normalised MIME type.
        bodies = defaultdict(list)

        for part in msg.walk(with_self=True):
            # Accessing ``part.body`` can itself raise; parts that cannot
            # be decoded, or that have no body, are skipped.
            try:
                if part.body is None:
                    continue
            except (DecodingError, ValueError) as de:
                log.warning("Cannot decode part [%s]: %s", self.result, de)
                continue

            file_name = part.detected_file_name

            # HACK: disposition headers can carry several filename
            # declarations, and flanker concatenates them. If the detected
            # name is the same string twice in a row, keep only one copy.
            if file_name is not None and len(file_name) > 4:
                half = len(file_name)//2
                if file_name[:half] == file_name[half:]:
                    file_name = file_name[:half]

            mime_type = str(part.detected_content_type)
            mime_type = normalize_mimetype(mime_type)

            # The two checks are independent: a part is handled by every
            # branch whose predicate is true.
            if part.is_attachment():
                self.ingest_attachment(file_name,
                                       mime_type,
                                       part.body)

            if part.is_body():
                bodies[mime_type].append(part.body)

        if 'text/html' in bodies:
            self.extract_html_content('\n\n'.join(bodies['text/html']))
            self.result.flag(self.result.FLAG_HTML)

        if 'text/plain' in bodies:
            self.extract_plain_text_content('\n\n'.join(bodies['text/plain']))
            self.result.flag(self.result.FLAG_PLAINTEXT)
コード例 #8
0
 def clean_text(
     self,
     text: str,
     fuzzy: bool = False,
     format: Optional[str] = None,
     proxy: Optional["EntityProxy"] = None,
 ) -> Optional[str]:
     """Return ``text`` as a normalised MIME type.

     ``None`` is returned when normalisation yields ``DEFAULT``. The
     ``fuzzy``, ``format`` and ``proxy`` arguments are accepted but not
     used here.
     """
     normalized = normalize_mimetype(text)
     return None if normalized == DEFAULT else normalized
コード例 #9
0
ファイル: app.py プロジェクト: unic0rn-team/safe-mail
def get_safe_file_obj(uploaded_file):
    """Save an upload to a temporary file and describe where it went.

    Returns a dict with ``file_obj`` (path of the temporary file) and
    ``file_name`` (the sanitised file name, which is also used as the
    temporary file's suffix).
    """
    name = FileName(uploaded_file.filename)
    mime_type = normalize_mimetype(uploaded_file.mimetype)
    # Fill in a missing extension: try the ``extensions`` mapping first and
    # ``mimetype_extension`` only if that did not provide one.
    for guess_extension in (extensions.get, mimetype_extension):
        if not name.has_extension:
            name.extension = guess_extension(mime_type)

    handle, temp_path = mkstemp(suffix=name.safe())
    # Only the path is needed; the upload is written by ``save`` below.
    os.close(handle)
    log.info('PDF convert: %s [%s]', temp_path, mime_type)
    uploaded_file.save(temp_path)
    return dict(file_obj=temp_path, file_name=name.safe())
コード例 #10
0
ファイル: msg.py プロジェクト: stofstar/aleph
    def extract_msg_body(self, entity, part):
        """Add the text of a non-attachment, non-multipart part to ``entity``.

        HTML parts go through ``extract_html_content`` (without metadata
        extraction); plain-text parts are added as ``bodyText``.
        """
        if part.is_attachment():
            return
        if part.is_multipart():
            return

        mime_type = normalize_mimetype(part.get_content_type())
        text = self.decode_string(
            part.get_payload(decode=True),
            part.get_content_charset(),
        )

        # The two checks are deliberately independent of each other.
        if 'text/html' in mime_type:
            self.extract_html_content(entity, text, extract_metadata=False)
        if 'text/plain' in mime_type:
            entity.add('bodyText', text)
コード例 #11
0
    def mime_type(self):
        """Return the normalised MIME type, or None if it is the default.

        The type is taken from ``self.meta['mime_type']``; failing that it
        is guessed from ``self.file_name``, and failing that it is read
        from the ``content_type`` entry of ``self.headers``.
        """
        candidate = self.meta.get('mime_type')

        if candidate is None and self.file_name:
            candidate = mimetypes.guess_type(self.file_name)[0]

        if candidate is None:
            # last resort: whatever the headers recorded
            candidate = self.headers.get('content_type')

        normalized = normalize_mimetype(candidate)
        return normalized if normalized != DEFAULT else None
コード例 #12
0
    def extract_msg_body(self, entity, part):
        """Add the text of a non-attachment, non-multipart part to ``entity``.

        The payload is decoded with the part's declared charset when there
        is one. HTML parts go through ``extract_html_content`` (without
        metadata extraction); plain-text parts are added as ``bodyText``.
        """
        if part.is_attachment() or part.is_multipart():
            return
        mime_type = normalize_mimetype(part.get_content_type())
        payload = part.get_payload(decode=True)
        charset = part.get_content_charset()
        if charset is not None:
            # TODO: do we want to do chardet after decoding fails?
            try:
                payload = payload.decode(charset, 'replace')
            except LookupError:
                # The charset label comes from the message itself and may
                # name a codec Python does not know. Fall back to UTF-8
                # with replacement instead of aborting the whole message.
                payload = payload.decode('utf-8', 'replace')

        if 'text/html' in mime_type:
            self.extract_html_content(entity, payload, extract_metadata=False)
        if 'text/plain' in mime_type:
            entity.add('bodyText', payload)
コード例 #13
0
ファイル: ingestor.py プロジェクト: vishalbelsare/ingestors
    def match(cls, file_path, entity):
        """Score how well this ingestor fits ``entity``.

        Returns ``cls.SCORE`` if any ``mimeType`` value of the entity is
        among the normalised ``cls.MIME_TYPES``, or if the normalised
        extension of any ``fileName`` value is among the normalised
        ``cls.EXTENSIONS``. Returns ``-1`` otherwise.
        """
        normalized_types = (
            normalize_mimetype(m, default=None) for m in cls.MIME_TYPES
        )
        known_types = [t for t in normalized_types if t is not None]
        if any(t in known_types for t in entity.get("mimeType")):
            return cls.SCORE

        known_extensions = [normalize_extension(e) for e in cls.EXTENSIONS]
        found = (normalize_extension(n) for n in entity.get("fileName"))
        # A file name without a recognisable extension never matches.
        if any(e is not None and e in known_extensions for e in found):
            return cls.SCORE

        return -1
コード例 #14
0
async def convert(request):
    """Convert the uploaded ``file`` to PDF and stream the result back.

    The upload is copied to a temporary file, converted into a second
    temporary file and streamed to the client in ``BUFFER_SIZE`` chunks.
    Both temporary files are removed afterwards.

    Returns a 400 response with the failure message on
    ``ConversionFailure`` (including an empty output file). On any other
    exception the converter is terminated and a 500 response is returned.
    """
    data = await request.post()
    upload = data['file']
    extension = normalize_extension(upload.filename)
    mime_type = normalize_mimetype(upload.content_type, default=None)
    log.info('PDF convert: %s [%s]', upload.filename, mime_type)
    # mkstemp is used for the paths only; the descriptors are closed at once.
    fd, upload_file = mkstemp()
    os.close(fd)
    fd, out_file = mkstemp(suffix='.pdf')
    os.close(fd)
    log.info('Source: %s, target: %s', upload_file, out_file)

    try:
        with open(upload_file, 'wb') as fh:
            shutil.copyfileobj(upload.file, fh, BUFFER_SIZE)

        filters = list(FORMATS.get_filters(extension, mime_type))
        timeout = int(request.query.get('timeout', 300))

        # Yield to the event loop before and after the conversion call.
        await asyncio.sleep(0)
        converter.convert_file(upload_file, out_file, filters,
                               timeout=timeout)
        out_size = os.path.getsize(out_file)
        if out_size == 0:
            raise ConversionFailure("No PDF version was generated.")
        await asyncio.sleep(0)

        response = web.StreamResponse()
        response.content_length = out_size
        response.content_type = 'application/pdf'
        await response.prepare(request)
        with open(out_file, 'rb') as f:
            while True:
                chunk = f.read(BUFFER_SIZE)
                if not chunk:
                    break
                await response.write(chunk)
        return response
    except ConversionFailure as fail:
        log.info("Failed to convert: %s", fail)
        return web.Response(text=str(fail), status=400)
    except Exception as exc:
        log.exception('System error: %s.', exc)
        converter.terminate()
        # Return an explicit error response; previously this branch fell
        # through and the handler returned None.
        return web.Response(text='Conversion failed: system error.',
                            status=500)
    finally:
        os.remove(upload_file)
        os.remove(out_file)
コード例 #15
0
def convert():
    """Convert the first uploaded file to the requested ``format``.

    Returns ``("BUSY", 503)`` when the lock is not acquired within one
    second or the app is marked dead, a 400 response when the requested
    format is not in ``LIBREOFFICE_EXPORT_TYPES`` or no file was uploaded,
    the failure message with status 400 on ``ConversionFailure`` and
    ``('FAIL', 503)`` on any other error. Both failure kinds mark the app
    as dead. ``HTTPException`` is re-raised untouched.

    The lock is released on every path once it has been acquired.
    """
    if not lock.acquire(timeout=1):
        return ("BUSY", 503)
    upload_file = None
    try:
        # Everything after a successful acquire runs inside this
        # try/finally. The early returns below (and a malformed
        # ``timeout``) used to leave the lock held, after which every
        # later request was answered with BUSY.
        if app.is_dead:
            return ("BUSY", 503)
        timeout = int(request.args.get('timeout', 1000))
        output_format = request.form.get('format')
        if output_format not in LIBREOFFICE_EXPORT_TYPES:
            return ("%s format is not supported" % (output_format), 400)
        try:
            # Only the first upload is handled: the loop body always returns.
            for upload in request.files.values():
                file_name = FileName(upload.filename)
                mime_type = normalize_mimetype(upload.mimetype)
                if not file_name.has_extension:
                    file_name.extension = extensions.get(mime_type)
                if not file_name.has_extension:
                    file_name.extension = mimetype_extension(mime_type)
                fd, upload_file = mkstemp(suffix=file_name.safe())
                os.close(fd)
                log.info('Convert to %s: %s [%s]',
                         output_format, upload_file, mime_type)
                upload.save(upload_file)
                converter.convert_file(upload_file, output_format, timeout)
                output_filename = "%s.%s" % (converter.OUT, output_format)
                log.info("Send file %s [Mime-type: %s]" %
                         (output_filename, OUTPUT_MIME_TYPES[output_format]))
                return send_file(output_filename,
                                 mimetype=OUTPUT_MIME_TYPES[output_format],
                                 attachment_filename=output_filename)
            return ('No file uploaded', 400)
        except HTTPException:
            raise
        except ConversionFailure as ex:
            app.is_dead = True
            return (str(ex), 400)
        except Exception as ex:
            app.is_dead = True
            log.error('Error: %s', ex)
            return ('FAIL', 503)
    finally:
        try:
            if upload_file is not None and os.path.exists(upload_file):
                os.unlink(upload_file)
            if os.path.exists(converter.OUT):
                os.unlink(converter.OUT)
        finally:
            # Release the lock even if the cleanup above fails.
            lock.release()
コード例 #16
0
 def parse_part(self, entity, part):
     """Route one non-multipart MIME part to the matching handler.

     A part is treated as an attachment when it says so itself, when it
     has a file name, or when its MIME type is not in ``self.BODY_TYPES``.
     Otherwise HTML and plain-text bodies are added to ``entity``, and
     anything else is logged as a dangling fragment.
     """
     if part.is_multipart():
         return

     mime_type = normalize_mimetype(part.get_content_type())
     file_name = part.get_filename()
     treat_as_attachment = (
         part.is_attachment()
         or file_name is not None
         or mime_type not in self.BODY_TYPES
     )
     if treat_as_attachment:
         raw = part.get_payload(decode=True)
         self.ingest_attachment(entity, file_name, mime_type, raw)
         return

     if self.BODY_HTML in mime_type:
         html = self.decode_part(part)
         self.extract_html_content(entity, html, extract_metadata=False)
     elif self.BODY_PLAIN in mime_type:
         entity.add('bodyText', self.decode_part(part))
     else:
         log.error("Dangling MIME fragment: %s", part)
コード例 #17
0
    def ingest(self, file_path, entity):
        """Parse the email at ``file_path`` into ``entity``.

        Sets the entity schema to ``Email``, extracts headers and the body,
        resolves message IDs, then walks all parts: each part's body is
        extracted and each attachment is handed to ``ingest_attachment``.

        Raises:
            ProcessingException: if the message cannot be parsed.
        """
        entity.schema = model.get('Email')
        try:
            with open(file_path, 'rb') as handle:
                message = email.message_from_binary_file(handle,
                                                         policy=default)
        except MessageError as exc:
            raise ProcessingException('Cannot parse email: %s' % exc) from exc

        self.extract_msg_headers(entity, message)
        self.extract_msg_body(entity, message)
        self.resolve_message_ids(entity)

        for part in message.walk():
            self.extract_msg_body(entity, part)
            if not part.is_attachment():
                continue
            mime_type = normalize_mimetype(part.get_content_type())
            payload = part.get_payload(decode=True)
            file_name = part.get_filename()
            self.ingest_attachment(entity, file_name, mime_type, payload)
コード例 #18
0
def convert():
    """Convert the first uploaded file to PDF with headless LibreOffice.

    Returns ``("DEAD", 500)`` when the app is marked dead,
    ``("BUSY", 503)`` when the lock is not acquired within one second,
    ``('No file uploaded', 400)`` when the request carries no files, the
    failure message with status 400 on ``ConversionFailure`` and
    ``('FAIL', 503)`` on any other error. Both failure kinds mark the app
    as dead. ``HTTPException`` is re-raised untouched.

    The uploaded temp file and ``/tmp/output.pdf`` are removed and the
    lock is released on every path after the lock was acquired.
    """
    if app.is_dead:
        return ("DEAD", 500)
    upload_file = None
    acquired = lock.acquire(timeout=1)
    if not acquired:
        return ("BUSY", 503)
    try:
        timeout = int(request.args.get('timeout', 100))
        # Only the first upload is handled: the loop body always returns.
        for upload in request.files.values():
            file_name = FileName(upload.filename)
            mime_type = normalize_mimetype(upload.mimetype)
            if not file_name.has_extension:
                file_name.extension = extensions.get(mime_type)
            if not file_name.has_extension:
                file_name.extension = mimetype_extension(mime_type)
            fd, upload_file = mkstemp(suffix=file_name.safe())
            os.close(fd)
            log.info('PDF convert: %s [%s]', upload_file, mime_type)
            upload.save(upload_file)
            log.info('About to begin conversion.')
            # Pass an argument list and no shell: the path is derived from
            # the client-supplied file name and must never be interpreted
            # (or word-split) by a shell.
            # NOTE(review): the exit status is ignored and ``timeout`` is
            # not applied to this call -- confirm whether that is intended.
            call(['libreoffice', '--headless', '--convert-to', 'pdf',
                  '--outdir', '/tmp/', upload_file])
            # NOTE(review): confirm that the command above really writes
            # '/tmp/output.pdf' for an arbitrarily named input file.
            return send_file('/tmp/output.pdf',
                             mimetype='application/pdf',
                             attachment_filename='output.pdf')
        return ('No file uploaded', 400)
    except HTTPException:
        raise
    except ConversionFailure as ex:
        app.is_dead = True
        return (str(ex), 400)
    except Exception as ex:
        app.is_dead = True
        log.error('Error: %s', ex)
        return ('FAIL', 503)
    finally:
        try:
            if upload_file is not None and os.path.exists(upload_file):
                os.unlink(upload_file)
            if os.path.exists('/tmp/output.pdf'):
                os.unlink('/tmp/output.pdf')
        finally:
            # Release the lock even if the cleanup above fails.
            lock.release()
コード例 #19
0
    def auction(self, file_path, result):
        """Pick the ingestor class with the highest match score.

        Paths that are not files are given to ``DirectoryIngestor``. If
        ``result.mime_type`` is not useful it is replaced with the
        normalised type detected by ``self.MAGIC``. Among
        ``self.ingestors`` the first class with the strictly highest
        positive score wins.

        Raises:
            ProcessingException: if no ingestor scores above zero.
        """
        if not is_file(file_path):
            result.mime_type = DirectoryIngestor.MIME_TYPE
            return DirectoryIngestor

        if not useful_mimetype(result.mime_type):
            detected = self.MAGIC.from_file(file_path)
            result.mime_type = normalize_mimetype(detected)

        winner, top_score = None, 0
        for candidate in self.ingestors:
            result.manager = self
            score = candidate.match(file_path, result=result)
            if score > top_score:
                winner, top_score = candidate, score

        if winner is None:
            raise ProcessingException("Format not supported: %s" %
                                      result.mime_type)
        return winner
コード例 #20
0
ファイル: formats.py プロジェクト: stofstar/aleph
    def __init__(self):
        """Index type-detection node names by media type and by extension.

        For every type node under the ``org.openoffice.TypeDetection``
        package of each file in ``self.FILES``, the node's
        ``PreferredFilter`` property (or, if absent, its ``NAME``
        attribute) is appended to ``self.media_types`` under the node's
        normalised media type and to ``self.extensions`` under each of its
        parsed extensions.
        """
        self.media_types = defaultdict(list)
        self.extensions = defaultdict(list)
        xpath = './*[@oor:package="org.openoffice.TypeDetection"]/node/node'
        for xcd_file in self.FILES:
            tree = etree.parse(xcd_file)
            for type_node in tree.xpath(xpath, namespaces=NS):
                # Flatten the <prop>/<value> children into a plain dict;
                # the last <value> of a property wins.
                props = {}
                for prop in type_node.findall('./prop'):
                    key = prop.get(NAME)
                    for value in prop.findall('./value'):
                        props[key] = value.text

                filter_name = props.get('PreferredFilter',
                                        type_node.get(NAME))
                media_type = normalize_mimetype(props.get('MediaType'),
                                                default=None)
                if media_type is not None:
                    self.media_types[media_type].append(filter_name)

                for ext in self.parse_extensions(props.get('Extensions')):
                    self.extensions[ext].append(filter_name)
コード例 #21
0
def convert():
    """Convert the uploaded ``file`` to PDF with ``unoconv`` and send it.

    Returns ``("BUSY", 503)`` when the lock is not acquired within two
    seconds, ``("DEAD", 503)`` when the listener process has terminated,
    and a 400 response when the conversion fails, produces no output or
    times out. A dead listener and a timeout both mark the app as dead.

    The temporary copy of the upload is removed and the lock released on
    every path after the lock was acquired.
    """
    acquired = lock.acquire(timeout=2)
    if not acquired:
        return ("BUSY", 503)
    upload_file = None
    try:
        # Drop the output of a previous request before producing a new one.
        if os.path.exists(OUT_PATH):
            os.unlink(OUT_PATH)

        upload = request.files['file']
        extension = normalize_extension(upload.filename)
        mime_type = normalize_mimetype(upload.mimetype, default=None)
        if extension is None:
            extension = extensions.get(mime_type)
        log.info('PDF convert: %s [%s]', upload.filename, mime_type)
        # With no extension at all, use no suffix rather than '.None'.
        suffix = '.%s' % extension if extension is not None else ''
        fd, upload_file = mkstemp(suffix=suffix)
        # The context manager closes the handle even if saving fails.
        with os.fdopen(fd, mode='wb') as fh:
            upload.save(fh)
        if listener.poll() is not None:
            log.error("Listener has terminated.")
            app.is_dead = True
            return ("DEAD", 503)

        args = [
            'unoconv', '-f', 'pdf', '-o', OUT_PATH, '-i',
            'MacroExecutionMode=0', '-i', 'ReadOnly=1', '-e',
            'SelectPdfVersion=1', '-e', 'MaxImageResolution=300',
            '--no-launch', upload_file
        ]
        err = subprocess.call(args, timeout=TIMEOUT)
        if err != 0 or not os.path.exists(OUT_PATH):
            return ('The document could not be converted to PDF.', 400)
        return send_file(OUT_PATH)
    except subprocess.TimeoutExpired:
        log.error("Timeout exceeded: %s", upload.filename)
        app.is_dead = True
        return ('Processing the document timed out.', 400)
    finally:
        try:
            # The response is served from OUT_PATH, so the uploaded copy
            # is no longer needed; previously it was never removed.
            if upload_file is not None and os.path.exists(upload_file):
                os.unlink(upload_file)
        finally:
            # Release the lock even if the cleanup above fails.
            lock.release()
コード例 #22
0
 def clean_text(self, text, **kwargs):
     """Return ``text`` as a normalised MIME type, or None if it is DEFAULT.

     Extra keyword arguments are accepted and ignored.
     """
     normalized = normalize_mimetype(text)
     return None if normalized == DEFAULT else normalized
コード例 #23
0
 def content_type(self):
     """Return the normalised value of the ``content-type`` header."""
     return normalize_mimetype(self.headers.get('content-type'))
コード例 #24
0
 def store(self, file_path, mime_type=None):
     """Archive the file at ``file_path`` and return the archive's result.

     Returns None when ``file_path`` does not resolve to an existing file.
     """
     path = ensure_path(file_path)
     normalized_type = normalize_mimetype(mime_type)
     if path is None or not path.is_file():
         return None
     return self.archive.archive_file(path, mime_type=normalized_type)
コード例 #25
0
 def configure(self):
     """Store the normalised form of ``self.value`` on ``self.clean``."""
     normalized = normalize_mimetype(self.value)
     self.clean = normalized