def handle_update(db, id, version):
    """Re-detect a stored document's MIME type, then finish parsing and reindex it.

    Args:
        db: Object exposing ``find_one(id)``; the returned document must
            provide ``raw_data``, ``update_plugin_and_canonical_attr``,
            ``finish_parsing`` and ``reindex``.
        id: Identifier handed to ``db.find_one``. (The name shadows the
            builtin, but it is part of the existing call signature.)
        version: Value forwarded unchanged to ``doc.finish_parsing``.
    """
    # Function-call form prints the same text on Python 2 and Python 3;
    # the old statement form is a SyntaxError on Python 3.
    print("doing it")

    doc = db.find_one(id)
    # NOTE(review): find_one presumably returns None for an unknown id, in
    # which case the attribute access below raises AttributeError — confirm.
    mimetype = magic_buffer(doc.raw_data, mime=True)
    doc.update_plugin_and_canonical_attr('mimetype', mimetype)
    doc.finish_parsing(version)
    doc.reindex()
def calc_magic(stream):
    """Return libmagic's description of the content behind *stream*.

    A dedicated libmagic cookie is opened with ``MAGIC_SYMLINK`` for the
    duration of the call and is always closed again, including when loading
    the magic database or the identification itself raises.

    Args:
        stream: A file-like object. If ``get_fd_path(stream)`` yields a truthy
            path, libmagic inspects that path; otherwise the stream is rewound
            to offset 0 and its full content is read into memory.

    Returns:
        The result of ``magic.maybe_decode`` applied to libmagic's answer.
    """
    # Missing python-magic features:
    # - magic_descriptor (https://github.com/ahupp/python-magic/pull/227)
    # - direct support for symlink flag
    magic_cookie = magic.magic_open(magic.MAGIC_SYMLINK)
    try:
        # Loading happens inside the try so that a failure to load the magic
        # database does not leak the freshly opened cookie.
        magic.magic_load(magic_cookie, None)

        fd_path = get_fd_path(stream)
        if fd_path:
            return magic.maybe_decode(magic.magic_file(magic_cookie, fd_path))

        # Handle BytesIO in-memory streams
        stream.seek(0, os.SEEK_SET)
        return magic.maybe_decode(
            magic.magic_buffer(magic_cookie, stream.read()))
    finally:
        magic.magic_close(magic_cookie)
def get_images_from_stream(stream):
    """Return the raw image payloads contained in *stream*.

    The content type is taken from ``magic_buffer``:

    * descriptions starting with ``JPEG`` or ``Netpbm`` are returned as-is in
      a one-element list;
    * descriptions starting with ``PDF`` are written to a temporary file and
      passed to the external ``pdfimages -j`` tool; every file it writes into
      a scratch directory is read back and returned.

    Args:
        stream: File-like object whose ``read()`` yields the whole document.

    Returns:
        A list with the content of each image, in ``listdir`` order.

    Raises:
        ValueError: If the detected format is none of the above.
    """
    with NamedTemporaryFile() as inputfile:
        data = stream.read()
        description = magic_buffer(data)

        if description.startswith('JPEG') or description.startswith('Netpbm'):
            return [data]

        if description.startswith('PDF'):
            tmpdir = mkdtemp()
            try:
                # Writing happens inside the try so the scratch directory is
                # removed even when the input file cannot be written.
                inputfile.write(data)
                inputfile.file.flush()
                # NOTE(review): the exit status of pdfimages is ignored, as
                # before; a failing run simply yields whatever files exist.
                call(["pdfimages", "-j", inputfile.name, tmpdir + '/'])

                images = []
                for name in listdir(tmpdir):
                    # open(..., 'rb') replaces the Python 2-only file()
                    # builtin, closes each handle deterministically and reads
                    # the image data in binary mode.
                    with open(path.join(tmpdir, name), 'rb') as image_file:
                        images.append(image_file.read())
                return images
            finally:
                rmtree(tmpdir)

        raise ValueError("Cannot handle file format:", description)
def get_images_from_stream(stream):
    """Return the raw image payloads contained in *stream*.

    The content type is taken from ``magic_buffer``:

    * descriptions starting with ``JPEG`` or ``Netpbm`` are returned as-is in
      a one-element list;
    * descriptions starting with ``PDF`` are written to a temporary file and
      passed to the external ``pdfimages -j`` tool; every file it writes into
      a scratch directory is read back and returned.

    Args:
        stream: File-like object whose ``read()`` yields the whole document.

    Returns:
        A list with the content of each image, in ``listdir`` order.

    Raises:
        ValueError: If the detected format is none of the above.
    """
    with NamedTemporaryFile() as inputfile:
        data = stream.read()
        description = magic_buffer(data)

        if description.startswith('JPEG') or description.startswith('Netpbm'):
            return [data]

        if description.startswith('PDF'):
            tmpdir = mkdtemp()
            try:
                # Writing happens inside the try so the scratch directory is
                # removed even when the input file cannot be written.
                inputfile.write(data)
                inputfile.file.flush()
                # NOTE(review): the exit status of pdfimages is ignored, as
                # before; a failing run simply yields whatever files exist.
                call(["pdfimages", "-j", inputfile.name, tmpdir + '/'])

                images = []
                for name in listdir(tmpdir):
                    # open(..., 'rb') replaces the Python 2-only file()
                    # builtin, closes each handle deterministically and reads
                    # the image data in binary mode.
                    with open(path.join(tmpdir, name), 'rb') as image_file:
                        images.append(image_file.read())
                return images
            finally:
                rmtree(tmpdir)

        raise ValueError("Cannot handle file format:", description)
def ident(buf, length):
    """Identify a buffer with libmagic and map the result onto a tag.

    Args:
        buf: Raw content to identify.
        length: Number of meaningful bytes in ``buf``. Values <= 0 return the
            default result without inspecting ``buf``.

    Returns:
        dict with the keys ``ascii`` and ``hex`` (renderings of the first
        ``min(64, length)`` bytes), ``magic`` and ``mime`` (the libmagic
        label / MIME type that decided the tag) and ``tag``. ``tag`` is
        ``'unknown'`` whenever identification fails or the computed tag is
        not enabled in ``recognized``.
    """
    minimum = 1000
    sl_tag = 'unknown'
    data = {'ascii': '', 'hex': '', 'magic': '', 'mime': '', 'tag': 'unknown'}

    if length <= 0:
        return data

    header = buf[:min(64, length)]
    data['ascii'] = dotdump(header)
    data['hex'] = hexlify(header)

    # Identification is best-effort: any ordinary failure leaves the defaults
    # in place. Only Exception is caught so that KeyboardInterrupt and
    # SystemExit still propagate.
    # noinspection PyBroadException
    try:
        # Loop over the labels returned by libmagic, ...
        labels = []
        if file_type:
            with magic_lock:
                labels = magic.magic_buffer(file_type, buf).split('\n')

        mimes = []
        if mime_type:
            with magic_lock:
                mimes = magic.magic_buffer(mime_type, buf).split('\n')

        for label, mime in zip(labels, mimes):
            label = dotdump(label)
            mime = dotdump(mime)

            # A custom rule names the tag directly; sl_tag = False marks the
            # tag as final so it is not recomputed after the loop.
            if custom.match(label):
                data['magic'] = label
                data['mime'] = mime
                data['tag'] = label.split('custom: ')[1].strip()
                sl_tag = False
                break

            # A trusted MIME type maps straight onto a tag as well.
            if mime in trusted_mimes:
                data['magic'] = label
                data['mime'] = mime
                data['tag'] = trusted_mimes[mime]
                sl_tag = False
                break

            # ... match against our patterns and, ...
            index = 0
            for entry in tl_patterns:
                if index >= minimum:
                    break

                if entry[1].search(label):  # pylint:disable=E1101
                    break

                index += 1

            # ... keep highest precedence match.
            if index < minimum:
                data['magic'] = label
                data['mime'] = mime
                minimum = index
                sl_tag = subtype(label)

        if sl_tag:
            # NOTE(review): when no label was processed, minimum is still
            # 1000, so the default lookup below presumably raises IndexError,
            # which the handler swallows and the tag stays 'unknown'.
            tl_tag = sl_to_tl.get(sl_tag, tl_patterns[minimum][0])
            data['tag'] = '/'.join((tl_tag, sl_tag))
    except Exception:  # pylint:disable=W0703
        pass

    if not recognized.get(data['tag'], False):
        data['tag'] = 'unknown'

    if data['tag'] == 'document/office/unknown':
        # Refine the tag from the CLSID found 0x50 bytes past the
        # "Root Entry" name; failures here are deliberately ignored.
        # noinspection PyBroadException
        try:
            root_entry_property_offset = buf.find(
                u"Root Entry".encode("utf-16-le"))
            if -1 != root_entry_property_offset:
                # Get root entry's GUID and try to guess document type
                clsid_offset = root_entry_property_offset + 0x50
                if len(buf) >= clsid_offset + 16:
                    clsid = buf[clsid_offset:clsid_offset + 16]
                    # Bytes literal: identical on Python 2, and a real
                    # comparison (rather than always-unequal) on Python 3.
                    if len(clsid) == 16 and clsid != b"\0" * len(clsid):
                        clsid_str = uuid.UUID(bytes_le=clsid)
                        clsid_str = clsid_str.urn.rsplit(':', 1)[-1].upper()
                        if clsid_str in OLE_CLSID_GUIDs:
                            data['tag'] = OLE_CLSID_GUIDs[clsid_str]
        except Exception:  # pylint:disable=W0703
            pass

    return data
def from_buffer(self, data, length):
    """Run libmagic's buffer identification using this object's cookie.

    ``data`` and ``length`` are forwarded unchanged to ``magic.magic_buffer``
    and its result is returned as-is.
    """
    identify = magic.magic_buffer
    return identify(self.cookie, data, length)
def ident(buf, length: int) -> Dict:
    """Identify a buffer with libmagic and map the result onto a type.

    The type is chosen with the following priority:

    1. a libmagic label matching the ``custom`` pattern,
    2. a MIME type listed in ``trusted_mimes``,
    3. the first label that matches an entry of ``tl_patterns``.

    Args:
        buf: Raw content to identify (searched with a bytes needle, so a
            bytes-like object is expected).
        length: Number of meaningful bytes in ``buf``. Values <= 0 return the
            default result without inspecting ``buf``.

    Returns:
        dict with the keys ``ascii`` and ``hex`` (renderings of the first
        ``min(64, length)`` bytes), ``magic`` and ``mime`` (the primary
        libmagic responses) and ``type``. ``type`` is ``'unknown'`` whenever
        identification fails or the computed type is not enabled in
        ``recognized``.
    """
    data = {
        'ascii': None,
        'hex': None,
        'magic': None,
        'mime': None,
        'type': 'unknown'
    }

    if length <= 0:
        return data

    header = buf[:min(64, length)]
    data['ascii'] = dotdump(header)
    data['hex'] = safe_str(hexlify(header))

    # noinspection PyBroadException
    try:
        # Loop over the labels returned by libmagic, ...
        labels = []
        if file_type:
            with magic_lock:
                labels = magic.magic_buffer(file_type, buf).split(b'\n')
                # Drop the "- " prefix carried by continuation lines.
                labels = [
                    label[2:] if label.startswith(b'- ') else label
                    for label in labels
                ]

        mimes = []
        if mime_type:
            with magic_lock:
                mimes = magic.magic_buffer(mime_type, buf).split(b'\n')
                mimes = [
                    mime[2:] if mime.startswith(b'- ') else mime
                    for mime in mimes
                ]

        # For user feedback set the mime and magic meta data to always be the
        # primary libmagic responses
        if labels:
            data['magic'] = safe_str(labels[0])

        if mimes and mimes[0] != b'':
            data['mime'] = safe_str(mimes[0])

        # Highest priority is given to labels produced by a custom rule
        tagged = False
        for label in labels:
            label = dotdump(label)
            if custom.match(label):
                data['type'] = label.split('custom: ')[1].strip()
                tagged = True
                break

        # Second priority is mime types marked as trusted
        if not tagged:
            for mime in mimes:
                mime = dotdump(mime)
                if mime in trusted_mimes:
                    data['type'] = trusted_mimes[mime]
                    tagged = True
                    break

        # As a third priority try matching the tl_patterns
        if not tagged:
            minimum = len(tl_patterns)
            sl_tag = None

            # Try each label and see how far down the tl_patterns list we go
            # before we hit a match, the closer to the beginning of the list
            # we are the better the tag match is. The final line of
            # tl_patterns matches anything and sets tag to 'unknown', so this
            # loop should never finish with sl_tag as None unless the
            # tl_patterns table has been changed inappropriately.
            for label in labels:
                label = dotdump(label)

                # ... match against our patterns and, ...
                index = 0
                for entry in tl_patterns:
                    if index >= minimum:
                        break

                    if entry[1].search(label):  # pylint:disable=E1101
                        break

                    index += 1

                # ... keep highest precedence (lowest index) match.
                if index < minimum:
                    minimum = index
                    sl_tag = subtype(label)

                    # If a label does match, take the best from that label
                    # Further labels from magic are probably terrible
                    break

            # Internal invariant; a failure is reported by the handler below.
            assert sl_tag is not None, "tl_patterns seems to be missing a match all => unknown rule at the end"

            # Based on the sub tag we found, figure out the top level tag
            tl_tag = sl_to_tl.get(sl_tag, tl_patterns[minimum][0])
            data['type'] = '/'.join((tl_tag, sl_tag))

    except Exception as e:
        # Best-effort: report the problem and fall back to the defaults.
        print(str(e))

    if not recognized.get(data['type'], False):
        data['type'] = 'unknown'

    if data['type'] == 'document/office/unknown':
        # Refine the type from the CLSID found 0x50 bytes past the
        # "Root Entry" name; failures here are deliberately ignored.
        # noinspection PyBroadException
        try:
            root_entry_property_offset = buf.find(
                u"Root Entry".encode("utf-16-le"))
            if -1 != root_entry_property_offset:
                # Get root entry's GUID and try to guess document type
                clsid_offset = root_entry_property_offset + 0x50
                if len(buf) >= clsid_offset + 16:
                    clsid = buf[clsid_offset:clsid_offset + 16]
                    # Compare against bytes: a str operand never equals a
                    # bytes slice on Python 3, which silently disabled this
                    # all-zero CLSID guard.
                    if len(clsid) == 16 and clsid != b"\0" * len(clsid):
                        clsid_str = uuid.UUID(bytes_le=clsid)
                        clsid_str = clsid_str.urn.rsplit(':', 1)[-1].upper()
                        if clsid_str in OLE_CLSID_GUIDs:
                            data['type'] = OLE_CLSID_GUIDs[clsid_str]
        except Exception:
            pass

    return data