コード例 #1
0
ファイル: ingestor.py プロジェクト: vishalbelsare/ingestors
    def match(cls, file_path, entity):
        """Score how well this ingestor fits the given entity.

        The entity's ``mimeType`` values are compared against the normalised
        ``cls.MIME_TYPES`` first; if none match, the extensions of its
        ``fileName`` values are compared against the normalised
        ``cls.EXTENSIONS``.

        Returns ``cls.SCORE`` on the first hit, otherwise ``-1``.
        ``file_path`` is accepted but not consulted here.
        """
        normalized_types = (
            normalize_mimetype(raw, default=None) for raw in cls.MIME_TYPES
        )
        known_types = [mt for mt in normalized_types if mt is not None]
        if any(candidate in known_types
               for candidate in entity.get("mimeType")):
            return cls.SCORE

        # Extensions are only normalised once the MIME type check has failed.
        known_extensions = [normalize_extension(raw) for raw in cls.EXTENSIONS]
        for name in entity.get("fileName"):
            ext = normalize_extension(name)
            if ext is None:
                continue
            if ext in known_extensions:
                return cls.SCORE

        return -1
コード例 #2
0
def load_mime_extensions():
    """Build a media type -> file extension mapping from the files in FILES.

    Each file is parsed with ``etree`` and every node under the
    ``org.openoffice.TypeDetection`` package is inspected. Its ``prop``
    children are flattened into a dict (the last ``value`` of a property
    wins). Nodes without a usable ``MediaType`` or without ``Extensions``
    are skipped; otherwise the first space-separated extension is
    normalised and recorded for the media type.

    Returns:
        dict mapping normalised media type to normalised extension. A later
        node with the same media type overwrites an earlier one.
    """
    type_path = './*[@oor:package="org.openoffice.TypeDetection"]/node/node'
    mapping = {}
    for xcd_file in FILES:
        tree = etree.parse(xcd_file)
        for type_node in tree.xpath(type_path, namespaces=NS):
            props = {
                prop.get(NAME): value.text
                for prop in type_node.findall('./prop')
                for value in prop.findall('./value')
            }

            media_type = normalize_mimetype(props.get('MediaType'),
                                            default=None)
            raw_extensions = props.get('Extensions')
            if media_type is None or raw_extensions is None:
                continue

            # Only the first listed extension is used.
            first_extension, _, _ = raw_extensions.partition(' ')
            normalized = normalize_extension(first_extension)
            if normalized is None:
                continue
            mapping[media_type] = normalized
    return mapping
コード例 #3
0
ファイル: base.py プロジェクト: resero-labs/ingestors
    def match(cls, file_path, result=None):
        """Score how well this ingestor fits the given result.

        The result's ``mime_type`` is normalised and compared against the
        normalised ``cls.MIME_TYPES``; failing that, the extension of its
        ``file_name`` is compared against the normalised ``cls.EXTENSIONS``.

        Args:
            file_path: accepted for interface compatibility; not consulted.
            result: object exposing ``mime_type`` and ``file_name``. May be
                omitted, in which case nothing can match.

        Returns:
            ``cls.SCORE`` on a match, otherwise ``-1``.
        """
        # The signature advertises ``result=None``; without this guard the
        # attribute reads below raise AttributeError for the default value.
        if result is None:
            return -1

        mime_types = [
            normalize_mimetype(m, default=None) for m in cls.MIME_TYPES
        ]  # noqa
        mime_types = [m for m in mime_types if m is not None]
        mime_type = normalize_mimetype(result.mime_type, default=None)
        # ``None`` was filtered out above, so an unknown MIME type never hits.
        if mime_type in mime_types:
            return cls.SCORE

        extensions = [normalize_extension(e) for e in cls.EXTENSIONS]
        extensions = [e for e in extensions if e is not None]
        extension = normalize_extension(result.file_name)
        if extension in extensions:
            return cls.SCORE

        return -1
コード例 #4
0
ファイル: formats.py プロジェクト: stofstar/aleph
 def parse_extensions(self, extensions):
     """Yield the normalised form of each space-separated extension.

     Nothing is yielded when ``extensions`` is ``None``. The ``*`` entry
     and entries that ``normalize_extension`` maps to ``None`` are skipped.
     """
     if extensions is None:
         return
     for token in extensions.split(' '):
         if token == '*':
             continue
         normalized = normalize_extension(token)
         if normalized is None:
             continue
         yield normalized
コード例 #5
0
async def convert(request):
    """Convert an uploaded document to PDF and stream it back.

    The ``file`` field of the POSTed form is copied to a temporary file and
    handed to ``converter.convert_file`` together with the filters that
    ``FORMATS.get_filters`` yields for the upload's extension and MIME type.
    The optional ``timeout`` query parameter (default 300) is forwarded to
    the converter.

    Returns:
        A streamed ``application/pdf`` response on success. A 400 response
        if ``timeout`` is not an integer or if ``ConversionFailure`` is
        raised (including when the output file is empty). A 500 response on
        any other error, after the converter has been terminated.
    """
    data = await request.post()
    upload = data['file']
    extension = normalize_extension(upload.filename)
    mime_type = normalize_mimetype(upload.content_type, default=None)
    log.info('PDF convert: %s [%s]', upload.filename, mime_type)

    # Validate client input up front: a malformed value must not fall into
    # the generic error handler below, which terminates the converter.
    try:
        timeout = int(request.query.get('timeout', 300))
    except ValueError:
        return web.Response(text="Invalid timeout.", status=400)

    fd, upload_file = mkstemp()
    os.close(fd)
    fd, out_file = mkstemp(suffix='.pdf')
    os.close(fd)
    log.info('Source: %s, target: %s', upload_file, out_file)

    try:
        with open(upload_file, 'wb') as fh:
            shutil.copyfileobj(upload.file, fh, BUFFER_SIZE)

        filters = list(FORMATS.get_filters(extension, mime_type))

        await asyncio.sleep(0)
        converter.convert_file(upload_file, out_file, filters,
                               timeout=timeout)
        out_size = os.path.getsize(out_file)
        if out_size == 0:
            raise ConversionFailure("No PDF version was generated.")
        await asyncio.sleep(0)

        response = web.StreamResponse()
        response.content_length = out_size
        response.content_type = 'application/pdf'
        await response.prepare(request)
        with open(out_file, 'rb') as f:
            while True:
                chunk = f.read(BUFFER_SIZE)
                if not chunk:
                    break
                await response.write(chunk)
        return response
    except ConversionFailure as fail:
        log.info("Failed to convert: %s", fail)
        return web.Response(text=str(fail), status=400)
    except Exception as exc:
        log.exception('System error: %s.', exc)
        converter.terminate()
        # Return an explicit error instead of falling through with None,
        # which is not a valid handler result.
        return web.Response(text='Internal conversion error.', status=500)
    finally:
        # Attempt both removals even if one of them fails.
        for temp_path in (upload_file, out_file):
            try:
                os.remove(temp_path)
            except OSError:
                log.warning('Could not remove temporary file: %s', temp_path)
コード例 #6
0
def convert():
    """Convert the uploaded ``file`` to PDF with ``unoconv`` and return it.

    Only one conversion runs at a time: the module-level ``lock`` is
    acquired with a 2 second timeout and released when the request ends.
    The upload is saved to a temporary file, ``unoconv`` is invoked with
    output at ``OUT_PATH``, and that file is sent back on success.

    Returns:
        The result of ``send_file(OUT_PATH)`` on success, otherwise a
        ``(message, status)`` tuple:
        ``("BUSY", 503)`` if the lock could not be acquired,
        ``("DEAD", 503)`` if the listener process has exited,
        a 400 if the conversion fails or exceeds ``TIMEOUT``.
    """
    acquired = lock.acquire(timeout=2)
    if not acquired:
        return ("BUSY", 503)
    upload_file = None
    try:
        # Drop any stale output so its existence afterwards proves success.
        if os.path.exists(OUT_PATH):
            os.unlink(OUT_PATH)

        upload = request.files['file']
        extension = normalize_extension(upload.filename)
        mime_type = normalize_mimetype(upload.mimetype, default=None)
        if extension is None:
            extension = extensions.get(mime_type)
        log.info('PDF convert: %s [%s]', upload.filename, mime_type)
        # Avoid a literal ".None" suffix when no extension is known.
        suffix = '.%s' % extension if extension is not None else ''
        fd, upload_file = mkstemp(suffix=suffix)
        # The context manager closes the handle even if saving fails.
        with os.fdopen(fd, mode='wb') as fh:
            upload.save(fh)
        if listener.poll() is not None:
            log.error("Listener has terminated.")
            app.is_dead = True
            return ("DEAD", 503)

        args = [
            'unoconv', '-f', 'pdf', '-o', OUT_PATH, '-i',
            'MacroExecutionMode=0', '-i', 'ReadOnly=1', '-e',
            'SelectPdfVersion=1', '-e', 'MaxImageResolution=300',
            '--no-launch', upload_file
        ]
        err = subprocess.call(args, timeout=TIMEOUT)
        if err != 0 or not os.path.exists(OUT_PATH):
            return ('The document could not be converted to PDF.', 400)
        return send_file(OUT_PATH)
    except subprocess.TimeoutExpired:
        log.error("Timeout exceeded: %s", upload.filename)
        app.is_dead = True
        return ('Processing the document timed out.', 400)
    finally:
        # Remove the uploaded temp file on every exit path; previously it
        # was never deleted. The lock is released regardless of cleanup.
        try:
            if upload_file is not None:
                os.unlink(upload_file)
        except OSError:
            log.warning("Could not remove temporary file: %s", upload_file)
        finally:
            lock.release()