Ejemplo n.º 1
0
def handle_update(
    db,
    id,
    version,
    ):
    """Re-detect a stored document's MIME type, then re-parse and reindex it.

    Args:
        db: Object exposing ``find_one(id)``; the returned document must
            provide ``raw_data``, ``update_plugin_and_canonical_attr``,
            ``finish_parsing`` and ``reindex``.
        id: Identifier handed to ``db.find_one``. The name shadows the
            builtin but is part of the public signature, so it is kept.
        version: Value forwarded unchanged to ``doc.finish_parsing``.
    """
    # The parenthesized single-argument form prints the same text on
    # Python 2 and is also valid syntax on Python 3.
    print("doing it")
    doc = db.find_one(id)
    data = doc.raw_data
    # mime=True is passed straight through to magic_buffer; presumably it
    # requests a MIME type instead of a textual description -- TODO confirm.
    magic = magic_buffer(data, mime=True)
    doc.update_plugin_and_canonical_attr('mimetype', magic)
    doc.finish_parsing(version)
    doc.reindex()
Ejemplo n.º 2
0
def handle_update(
    db,
    id,
    version,
):
    """Refresh the ``mimetype`` attribute of one document and reindex it.

    The document is fetched with ``db.find_one(id)``, its ``raw_data`` is
    passed to ``magic_buffer`` and the result is stored through
    ``update_plugin_and_canonical_attr('mimetype', ...)``. Afterwards
    ``finish_parsing(version)`` and ``reindex()`` are called on it.

    Args:
        db: Object exposing ``find_one(id)``.
        id: Identifier handed to ``db.find_one`` (shadows the builtin; kept
            because it is part of the public signature).
        version: Value forwarded unchanged to ``doc.finish_parsing``.
    """
    # print(...) with a single argument behaves identically on Python 2 and,
    # unlike the print statement, is not a SyntaxError on Python 3.
    print("doing it")
    doc = db.find_one(id)
    data = doc.raw_data
    # NOTE(review): mime=True looks like it selects MIME output -- confirm
    # against the magic_buffer helper this module imports.
    magic = magic_buffer(data, mime=True)
    doc.update_plugin_and_canonical_attr('mimetype', magic)
    doc.finish_parsing(version)
    doc.reindex()
Ejemplo n.º 3
0
def calc_magic(stream):
    """Return the libmagic result for *stream*, decoded via ``maybe_decode``.

    A dedicated libmagic cookie is opened with ``MAGIC_SYMLINK`` and always
    closed again before returning. If ``get_fd_path(stream)`` yields a
    truthy path the file at that path is inspected; otherwise the stream is
    rewound and its whole content is inspected as an in-memory buffer.

    Args:
        stream: A file-like object. When no fd path is available it must
            support ``seek`` and ``read``.

    Returns:
        Whatever ``magic.maybe_decode`` returns for the libmagic result.
    """
    # Missing python-magic features:
    # - magic_descriptor (https://github.com/ahupp/python-magic/pull/227)
    # - direct support for symlink flag
    magic_cookie = magic.magic_open(magic.MAGIC_SYMLINK)
    try:
        # Load inside the try block so the cookie is released even when
        # loading the magic database raises.
        magic.magic_load(magic_cookie, None)
        fd_path = get_fd_path(stream)
        if fd_path:
            result = magic.magic_file(magic_cookie, fd_path)
        else:
            # Handle BytesIO in-memory streams
            stream.seek(0, os.SEEK_SET)
            result = magic.magic_buffer(magic_cookie, stream.read())
        return magic.maybe_decode(result)
    finally:
        magic.magic_close(magic_cookie)
Ejemplo n.º 4
0
def get_images_from_stream(stream):
    """Return the images found in *stream* as a list of raw file contents.

    The whole stream is read and classified with ``magic_buffer``:

    * ``JPEG`` / ``Netpbm`` data is returned unchanged as a one-element list.
    * ``PDF`` data is written to a temporary file and handed to the external
      ``pdfimages -j`` command; every file it writes into a scratch
      directory is read back (sorted by file name) and returned.

    Args:
        stream: Object with a ``read()`` method returning the full payload.

    Returns:
        list: One entry per image.

    Raises:
        ValueError: If ``magic_buffer`` reports any other format.
    """
    with NamedTemporaryFile() as inputfile:
        data = stream.read()
        magic = magic_buffer(data)
        if magic.startswith(('JPEG', 'Netpbm')):
            return [data]
        elif magic.startswith('PDF'):
            tmpdir = mkdtemp()
            # Everything after mkdtemp() is inside the try block so the
            # scratch directory is removed even if writing the input fails.
            try:
                inputfile.write(data)
                inputfile.file.flush()
                # NOTE(review): the exit status of pdfimages is ignored, as
                # in the original; a failing run simply yields fewer files.
                call(["pdfimages", "-j", inputfile.name, tmpdir + '/'])
                images = []
                # listdir() order is arbitrary; sort for a stable result.
                for name in sorted(listdir(tmpdir)):
                    # Binary mode plus a context manager: no newline
                    # translation and no handle left for the GC to close.
                    with open(path.join(tmpdir, name), 'rb') as image_file:
                        images.append(image_file.read())
                return images
            finally:
                rmtree(tmpdir)
        else:
            raise ValueError("Cannot handle file format:", magic)
Ejemplo n.º 5
0
def get_images_from_stream(stream):
    """Extract image payloads from a JPEG, Netpbm or PDF stream.

    Args:
        stream: Object with a ``read()`` method returning the full payload.

    Returns:
        list: ``[data]`` for JPEG/Netpbm input. For PDF input, the contents
        of every file written by ``pdfimages -j``, ordered by file name.

    Raises:
        ValueError: If ``magic_buffer`` describes the data as anything else.
    """
    with NamedTemporaryFile() as inputfile:
        data = stream.read()
        magic = magic_buffer(data)
        if magic.startswith(('JPEG', 'Netpbm')):
            return [data]
        elif magic.startswith('PDF'):
            tmpdir = mkdtemp()
            # The try block starts right after mkdtemp() so that rmtree()
            # also runs when writing or flushing the input file raises.
            try:
                inputfile.write(data)
                inputfile.file.flush()
                # NOTE(review): pdfimages' return code is not checked
                # (unchanged from the original behaviour).
                call(["pdfimages", "-j", inputfile.name, tmpdir + '/'])
                extracted = []
                # Sort because listdir() gives no ordering guarantee.
                for entry in sorted(listdir(tmpdir)):
                    # open(..., 'rb') replaces the Python-2-only file()
                    # builtin and closes each handle deterministically.
                    with open(path.join(tmpdir, entry), 'rb') as handle:
                        extracted.append(handle.read())
                return extracted
            finally:
                rmtree(tmpdir)
        else:
            raise ValueError("Cannot handle file format:", magic)
Ejemplo n.º 6
0
def ident(buf, length):
    """Identify a buffer and return a dict describing it.

    The returned dict always has the keys ``ascii``, ``hex``, ``magic``,
    ``mime`` and ``tag``. ``ascii``/``hex`` are dumps of the first
    ``min(64, length)`` bytes; ``magic``/``mime`` are the libmagic label and
    MIME string that decided the tag; ``tag`` is the resulting type tag, or
    ``'unknown'`` when nothing recognized was found.

    The function relies on module-level state defined elsewhere:
    ``file_type``, ``mime_type``, ``magic_lock``, ``custom``,
    ``trusted_mimes``, ``tl_patterns``, ``sl_to_tl``, ``recognized`` and
    ``OLE_CLSID_GUIDs``, plus the helpers ``dotdump`` and ``subtype``.

    Args:
        buf: The data to identify.
        length: Number of bytes considered for the header dump. A value
            ``<= 0`` short-circuits and returns the default dict.

    Returns:
        dict: See above. Errors during identification are swallowed on
        purpose (best effort); the dict then keeps whatever was set so far.
    """
    minimum = 1000
    sl_tag = 'unknown'

    data = {'ascii': '', 'hex': '', 'magic': '', 'mime': '', 'tag': 'unknown'}

    if length <= 0:
        return data

    header = buf[:min(64, length)]
    data['ascii'] = dotdump(header)
    data['hex'] = hexlify(header)

    # noinspection PyBroadException
    try:
        # Loop over the labels returned by libmagic, ...
        labels = []
        if file_type:
            with magic_lock:
                labels = magic.magic_buffer(file_type, buf).split('\n')

        mimes = []
        if mime_type:
            with magic_lock:
                mimes = magic.magic_buffer(mime_type, buf).split('\n')

        # zip() stops at the shorter list, so labels without a matching
        # mime line (or vice versa) are not examined.
        for label, mime in zip(labels, mimes):
            label = dotdump(label)
            mime = dotdump(mime)
            if custom.match(label):
                data['magic'] = label
                data['mime'] = mime
                data['tag'] = label.split('custom: ')[1].strip()

                # The tag is final; a falsy sl_tag skips the tl/sl
                # composition after the loop.
                sl_tag = False
                break

            if mime in trusted_mimes:
                data['magic'] = label
                data['mime'] = mime
                data['tag'] = trusted_mimes[mime]

                sl_tag = False
                break

            # ... match against our patterns and, ...
            index = 0
            for entry in tl_patterns:
                if index >= minimum:
                    break

                if entry[1].search(label):  # pylint:disable=E1101
                    break

                index += 1

            # ... keep highest precedence match.
            if index < minimum:
                data['magic'] = label
                data['mime'] = mime
                minimum = index
                sl_tag = subtype(label)

        if sl_tag:
            # NOTE(review): the default argument is evaluated eagerly. If no
            # label lowered `minimum`, tl_patterns[minimum] may be out of
            # range; the IndexError is then absorbed by the handler below
            # and the tag keeps its previous value.
            tl_tag = sl_to_tl.get(sl_tag, tl_patterns[minimum][0])
            data['tag'] = '/'.join((tl_tag, sl_tag))

    # Deliberately best effort, but only for ordinary errors: a bare
    # ``except:`` would also swallow KeyboardInterrupt and SystemExit.
    except Exception:  # pylint:disable=W0703
        pass

    if not recognized.get(data['tag'], False):
        data['tag'] = 'unknown'

    if data['tag'] == 'document/office/unknown':
        # noinspection PyBroadException
        try:
            root_entry_property_offset = buf.find(u"Root Entry".encode("utf-16-le"))
            if -1 != root_entry_property_offset:
                # Get root entry's GUID and try to guess document type
                clsid_offset = root_entry_property_offset + 0x50
                if len(buf) >= clsid_offset + 16:
                    clsid = buf[clsid_offset:clsid_offset + 16]
                    if len(clsid) == 16 and clsid != "\0" * len(clsid):
                        clsid_str = uuid.UUID(bytes_le=clsid)
                        clsid_str = clsid_str.urn.rsplit(':', 1)[-1].upper()
                        if clsid_str in OLE_CLSID_GUIDs:
                            data['tag'] = OLE_CLSID_GUIDs[clsid_str]
        # Same reasoning as above: keep the best-effort behaviour without
        # trapping interpreter-exit exceptions.
        except Exception:  # pylint:disable=W0703
            pass

    return data
Ejemplo n.º 7
0
 def from_buffer(self, data, length):
     """Return the result of ``magic.magic_buffer`` for *data*.

     The call is made with this object's ``cookie`` and the given
     *data* and *length*, all forwarded unchanged.
     """
     identify = magic.magic_buffer
     return identify(self.cookie, data, length)
Ejemplo n.º 8
0
def ident(buf, length: int) -> Dict:
    """Identify a bytes buffer and return a dict describing it.

    The returned dict always has the keys ``ascii``, ``hex``, ``magic``,
    ``mime`` and ``type``:

    * ``ascii`` / ``hex``: dumps of the first ``min(64, length)`` bytes.
    * ``magic`` / ``mime``: the first label / MIME line reported by
      libmagic, when there is one.
    * ``type``: chosen by the first rule that applies -- a label matching
      ``custom``, a MIME string found in ``trusted_mimes``, or the
      ``tl_patterns`` table -- and reset to ``'unknown'`` unless
      ``recognized`` marks it as known.

    The function relies on module-level state defined elsewhere:
    ``file_type``, ``mime_type``, ``magic_lock``, ``custom``,
    ``trusted_mimes``, ``tl_patterns``, ``sl_to_tl``, ``recognized`` and
    ``OLE_CLSID_GUIDs``, plus the helpers ``dotdump``, ``safe_str`` and
    ``subtype``.

    Args:
        buf: The data to identify (bytes-like; it is searched with a bytes
            needle and sliced).
        length: Number of bytes considered for the header dump. A value
            ``<= 0`` short-circuits and returns the default dict.

    Returns:
        The dict described above. Errors raised while consulting libmagic
        are printed and otherwise ignored (best effort).
    """
    data = {
        'ascii': None,
        'hex': None,
        'magic': None,
        'mime': None,
        'type': 'unknown'
    }

    if length <= 0:
        return data

    header = buf[:min(64, length)]
    data['ascii'] = dotdump(header)
    data['hex'] = safe_str(hexlify(header))

    # noinspection PyBroadException
    try:
        # Loop over the labels returned by libmagic, ...
        labels = []
        if file_type:
            with magic_lock:
                labels = magic.magic_buffer(file_type, buf).split(b'\n')
                # Drop the "- " prefix some result lines carry.
                labels = [
                    label[2:] if label.startswith(b'- ') else label
                    for label in labels
                ]

        mimes = []
        if mime_type:
            with magic_lock:
                mimes = magic.magic_buffer(mime_type, buf).split(b'\n')
                mimes = [
                    mime[2:] if mime.startswith(b'- ') else mime
                    for mime in mimes
                ]

        # For user feedback set the mime and magic meta data to always be the primary
        # libmagic responses
        if labels:
            data['magic'] = safe_str(labels[0])

        if mimes and mimes[0] != b'':
            data['mime'] = safe_str(mimes[0])

        # Highest priority is given to labels produced by a custom rule
        tagged = False

        for label in labels:
            label = dotdump(label)

            if custom.match(label):
                data['type'] = label.split('custom: ')[1].strip()
                tagged = True
                break

        # Second priority is mime types marked as trusted
        if not tagged:
            for mime in mimes:
                mime = dotdump(mime)

                if mime in trusted_mimes:
                    data['type'] = trusted_mimes[mime]
                    tagged = True
                    break

        # As a third priority try matching the tl_patterns
        if not tagged:
            minimum = len(tl_patterns)
            sl_tag = None

            # Try each label and see how far down the tl_patterns list we go
            # before we hit a match, the closer to the beginning of the list we are the better
            # the tag match is. The final line of tl_patterns matches anything and sets
            # tag to 'unknown', so this loop should never finish with sl_tag as None
            # unless the tl_patterns table has been changed inappropriately
            # (or libmagic returned no labels at all).
            for label in labels:
                label = dotdump(label)

                # ... match against our patterns and, ...
                index = 0
                for entry in tl_patterns:
                    if index >= minimum:
                        break

                    if entry[1].search(label):  # pylint:disable=E1101
                        break

                    index += 1

                # ... keep highest precedence (lowest index) match.
                if index < minimum:
                    minimum = index
                    sl_tag = subtype(label)

                    # If a label does match, take the best from that label
                    # Further labels from magic are probably terrible
                    break

            # A failure here is caught by the handler below, so it only
            # results in a printed message and an 'unknown' type.
            assert sl_tag is not None, "tl_patterns seems to be missing a match all => unknown rule at the end"

            # Based on the sub tag we found, figure out the top level tag to use
            tl_tag = sl_to_tl.get(sl_tag, tl_patterns[minimum][0])
            data['type'] = '/'.join((tl_tag, sl_tag))

    except Exception as e:
        # Best effort: report the problem and fall through with whatever
        # has been filled in so far.
        print(str(e))

    if not recognized.get(data['type'], False):
        data['type'] = 'unknown'

    if data['type'] == 'document/office/unknown':
        # noinspection PyBroadException
        try:
            root_entry_property_offset = buf.find(
                u"Root Entry".encode("utf-16-le"))
            if -1 != root_entry_property_offset:
                # Get root entry's GUID and try to guess document type
                clsid_offset = root_entry_property_offset + 0x50
                if len(buf) >= clsid_offset + 16:
                    clsid = buf[clsid_offset:clsid_offset + 16]
                    # Compare against a bytes literal: on Python 3 a
                    # bytes-vs-str comparison is always unequal, which made
                    # the all-zero CLSID guard a no-op.
                    if len(clsid) == 16 and clsid != b"\0" * len(clsid):
                        clsid_str = uuid.UUID(bytes_le=clsid)
                        clsid_str = clsid_str.urn.rsplit(':', 1)[-1].upper()
                        if clsid_str in OLE_CLSID_GUIDs:
                            data['type'] = OLE_CLSID_GUIDs[clsid_str]
        except Exception:
            pass

    return data