def detect_local_source(path, content, mime_type=None, encoding=None):
    """Build a ``Source`` describing a local file from its path and content.

    When the ``magic`` module is available it provides the encoding, MIME
    name and MIME type; otherwise ``chardet`` guesses the encoding and
    ``mimetypes`` guesses the MIME type from the file name. Values passed
    by the caller are only used as fallbacks for what detection cannot
    provide (``mime_type`` takes precedence over ``mimetypes`` guessing).

    Args:
        path: Path of the file; also used as the ``Source`` URI.
        content: Sample of the file content handed to the detectors.
        mime_type: Fallback MIME type.
        encoding: Fallback encoding.

    Returns:
        A ``Source`` whose encoding is ``None`` when detection reported
        ``'binary'``.
    """
    # TODO: may add sample_size
    filename = os.path.basename(path)
    # Text after the last dot, or None when the name contains no dot.
    _, dot, suffix = filename.rpartition('.')
    extension = suffix if dot else None

    if magic is None:
        encoding = chardet.detect(content)['encoding'] or encoding
        mime_name = None
        mime_type = mime_type or mimetypes.guess_type(filename)[0]
    else:
        detected = magic.detect_from_content(content)
        encoding = detected.encoding or encoding
        mime_name = detected.name
        mime_type = detected.mime_type or mime_type

    plugin_name = plugin_name_by_mime_type(mime_type, mime_name, extension)
    return Source(
        uri=path,
        plugin_name=plugin_name,
        encoding=None if encoding == 'binary' else encoding,
    )
def download_file(uri, verify_ssl):
    """Download ``uri`` into a temporary file and report its encoding.

    The file extension is the subtype of the ``Content-Type`` response
    header (``text/csv; charset=x`` -> ``csv``); when the header is missing
    it is the text after the last dot of the last URI path segment.

    Args:
        uri: Address to fetch with ``requests.get``.
        verify_ssl: Passed through as ``verify`` to ``requests.get``.

    Returns:
        dict: ``{'filename': <path of the written file>,
        'encoding': <detected encoding>}``. The file is not removed
        automatically; the caller owns it.
    """
    response = requests.get(uri, verify=verify_ssl)
    content = response.content
    if magic is not None:
        encoding = magic.detect_from_content(content).encoding
    else:
        encoding = response.encoding

    # TODO: try to guess with uri.split('/')[-1].split('.')[-1].lower()
    # TODO: try to guess with file-magic lib
    try:
        content_type = response.headers['content-type']
        plugin_name = content_type.split('/')[-1].split(';')[0].lower()
    except (KeyError, IndexError):
        try:
            plugin_name = uri.split('/')[-1].split('.')[-1].lower()
        except IndexError:
            raise RuntimeError('Could not identify file type.')

    # Create the final file atomically instead of closing (and thereby
    # deleting) a temporary file and re-opening a name derived from it,
    # which left a window for another process to claim that path.
    with tempfile.NamedTemporaryFile(
            suffix='.{}'.format(plugin_name), delete=False) as fobj:
        fobj.write(content)
        filename = fobj.name

    return {'filename': filename, 'encoding': encoding}
def get_magics(path):
    """Return the ``(format, encoding)`` detected for the file at ``path``.

    libmagic is poor at recognising text-based formats such as CSV, JSON,
    YAML or XML, so it is only used here to spot a few binary formats and
    a UTF-8 byte order mark. Only the first 2048 bytes are inspected.

    Returns:
        ``(None, None)`` when ``MAGIC`` is falsy; otherwise a tuple whose
        items are each ``None`` when nothing was recognised.
    """
    if not MAGIC:
        return None, None

    with open(path, 'rb') as file:
        header = file.read(2048)
    res = magic.detect_from_content(header)

    mime_to_format = {
        'application/pgp': 'gpg',
        'application/x-sqlite3': 'sqlite3'
    }
    name_to_format = {'KDBX': 'kdbx', 'openssl': 'openssl', 'PGP': 'gpg'}

    frmt = mime_to_format.get(res.mime_type, None)
    # A marker found in the description overrides the MIME-based guess;
    # when several markers match, the last one in the table wins.
    for marker, marker_format in name_to_format.items():
        if marker in res.name:
            frmt = marker_format

    # res.encoding is deliberately ignored; only the BOM case is reported.
    encoding = 'utf-8-sig' if 'UTF-8 Unicode (with BOM)' in res.name else None
    return frmt, encoding
def readdir(self, path, fh):
    """Yield the directory entries for ``path``.

    For the root, every key of ``self.track_dir`` is listed. For any other
    path the matching track is downloaded, the response is stored in the
    second slot of its ``track_dir`` entry, and one entry named
    ``<tid>_<date>_<time><extension>`` is yielded. Date and time come from
    the first ``<time>`` element found in the (decompressed) payload and
    fall back to ``nodate`` / ``notime`` when they cannot be determined.

    Args:
        path: Path being listed.
        fh: Unused.

    Yields:
        str: ``'.'``, ``'..'`` and the entries described above.
    """
    dirents = ['.', '..']
    # Substring test: true for both '' and '/'.
    if path in '/':
        dirents.extend(self.track_dir)
    else:
        key = path[1:]
        tid = self.track_dir[key][0]
        data = requests.get(_urlget % tid, auth=(self.user, self.password))
        disposition = data.headers['Content-Disposition']
        logging.debug(disposition)
        filename = re.sub(r'.*filename="([^"]+?)".*', r'\1', disposition)
        logging.debug('filename: {}'.format(filename))
        # Everything from the first dot on ('.gpx', '.gpx.gz', ...); empty
        # when the name has no dot after its first character run.
        ext_match = re.match(r'([^.]+)(\..*)$', filename)
        extension = ext_match.group(2) if ext_match else ''
        self.track_dir[key][1] = data

        detected_type = magic.detect_from_content(data.content).mime_type
        type2fun = {'application/x-bzip2': bz2.decompress,
                    'application/x-gzip': gzip.decompress,
                    'text/xml': bytes,
                    'text/plain': bytes}
        # Defaults keep the entry name well-formed when the payload type
        # is unknown or contains no timestamp.
        date_data, time_data = b'nodate', b'notime'
        conv_fun = type2fun.get(detected_type)
        if conv_fun:
            timedate_data = re.search(b'<time>([^T]+)T([^:]+:[^:]+):',
                                      conv_fun(data.content))
            if timedate_data:
                date_data, time_data = timedate_data.groups()
        dirents.append('{0}_{1}_{2}{3}'.format(tid,
                                               date_data.decode('ascii'),
                                               time_data.decode('ascii'),
                                               extension))
    yield from dirents
def get_content_type(response: 'Response') -> str:
    """Get content type from ``response``.

    Args:
        response (:class:`requests.Response`): Response object.

    Returns:
        The media type in case-folded form, without parameters such as
        ``charset`` and without surrounding whitespace.

    Note:
        If the ``Content-Type`` header is not defined in ``response``,
        the function will utilise |magic|_ on the response body; should
        that fail for any reason the result is ``'(null)'``.

    .. |magic| replace:: ``magic``
    .. _magic: https://pypi.org/project/python-magic/

    """
    header_value = response.headers.get('Content-Type')
    if header_value is not None:
        raw_type = header_value
    else:
        try:
            raw_type = magic.detect_from_content(response.content).mime_type
        except Exception:
            raw_type = '(null)'
    # Keep only what precedes the first ';' (drops e.g. "; charset=...").
    media_type, _, _ = raw_type.casefold().partition(';')
    return media_type.strip()
def get_mimetype_and_encoding_for_content(content, max_retries=3):
    """Function that returns the mime type and the encoding associated to
    a content buffer using the magic module under the hood.

    Args:
        content (bytes): a content buffer
        max_retries (int): how many times detection is retried (after
            reloading the magic module) before the error is propagated

    Returns:
        A tuple (mimetype, encoding), for instance ('text/plain', 'us-ascii'),
        associated to the provided content.

    Raises:
        Exception: whatever the magic module raised on the final attempt.

    """
    retries_left = max_retries
    while True:
        try:
            magic_result = magic.detect_from_content(content)
            return magic_result.mime_type, magic_result.encoding
        except Exception:
            # workaround an issue with the magic module who can fail
            # if detect_from_content is called multiple times in
            # a short amount of time
            if retries_left <= 0:
                # A persistent failure must surface instead of spinning
                # forever in this loop.
                raise
            retries_left -= 1
            reload(magic)
def TypeOfBuffer(buf: bytearray, default: str = None) -> str:
    """Return the MIME type detected for ``buf``.

    Args:
        buf: Content to inspect.
        default: Value returned when detection fails.

    Returns:
        The detected MIME type, or ``default`` if detection raised.
    """
    try:
        return magic.detect_from_content(buf).mime_type
    except Exception:
        # Best-effort detection: any failure falls back to the default,
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        return default
def _getInstalledKernel(self):
    """Record the version of the kernel image named in the configuration.

    The last ``file`` element under ``fileChecks`` flagged ``isKernel``
    (``true`` or ``1``, case-insensitive) sets ``self.kernelFile``. If a
    kernel file is set, it is read from ``/boot`` and the ``version``
    field of its libmagic description is stored in
    ``self.installedKernVer``.

    Raises:
        RuntimeError: if ``self.cfg`` does not support ``len()``.
    """
    # Could we maybe remove the dependency for the "magic" module with a struct?
    # http://lxr.linux.no/#linux+v2.6.39/Documentation/x86/boot.txt
    # https://stackoverflow.com/a/11179559/733214
    try:
        len(self.cfg)
    except TypeError:
        raise RuntimeError('Tried to find the isKernel with no config set up and parsed')

    file_xpath = '{0}fileChecks/{0}file'.format(self.ns)
    for node in self.cfg.findall(file_xpath):
        if node.attrib.get('isKernel', 'false').lower() in ('true', '1'):
            self.kernelFile = node.text

    if self.kernelFile:
        with open(os.path.join('/boot', self.kernelFile), 'rb') as fh:
            magicname = magic.detect_from_content(fh.read())
        # The description is a comma-separated list of "key value" fields.
        for field in magicname.name.split(','):
            field = field.strip()
            if field == '':
                continue
            pair = field.split(None, 1)
            if len(pair) != 2:
                continue
            k, v = pair
            # Note: this only grabs the version number.
            # If we want to get e.g. the build user/machine, date, etc.,
            # then we need to do a join. Shouldn't be necessary, though.
            if k.lower() == 'version':
                self.installedKernVer = v.split(None, 1)[0]
    return ()
def download_file(uri, verify_ssl):
    """Download ``uri`` into a temporary file and report its encoding.

    The file extension is the subtype of the ``Content-Type`` response
    header (``text/csv; charset=x`` -> ``csv``); when the header is missing
    it is the text after the last dot of the last URI path segment.

    Args:
        uri: Address to fetch with ``requests.get``.
        verify_ssl: Passed through as ``verify`` to ``requests.get``.

    Returns:
        dict: ``{'filename': <path of the written file>,
        'encoding': <detected encoding>}``. The file is not removed
        automatically; the caller owns it.
    """
    response = requests.get(uri, verify=verify_ssl)
    content = response.content
    if magic is not None:
        encoding = magic.detect_from_content(content).encoding
    else:
        encoding = response.encoding

    # TODO: try to guess with uri.split('/')[-1].split('.')[-1].lower()
    # TODO: try to guess with file-magic lib
    try:
        content_type = response.headers['content-type']
        plugin_name = content_type.split('/')[-1].split(';')[0].lower()
    except (KeyError, IndexError):
        try:
            plugin_name = uri.split('/')[-1].split('.')[-1].lower()
        except IndexError:
            raise RuntimeError('Could not identify file type.')

    # Create the final file atomically instead of closing (and thereby
    # deleting) a temporary file and re-opening a name derived from it,
    # which left a window for another process to claim that path.
    with tempfile.NamedTemporaryFile(
            suffix='.{}'.format(plugin_name), delete=False) as fobj:
        fobj.write(content)
        filename = fobj.name

    return {'filename': filename, 'encoding': encoding}
def test_detect_from_content(self):
    """Detect the fixture file's type from its first 4096 bytes."""
    # differ from upstream by opening file in binary mode,
    # this avoids hitting a bug in python3+libfile bindings
    # see https://github.com/ahupp/python-magic/issues/152
    # for a similar issue
    with open(self.filename, 'rb') as stream:
        head = stream.read(4096)
    detected = magic.detect_from_content(head)
    self.assert_result(detected)
def check_mime_type(data: bytes, valid_types: List[str]) -> bool:
    """Tell whether the MIME type detected for ``data`` is acceptable.

    Args:
        data: Content to inspect.
        valid_types: MIME types considered valid.

    Returns:
        ``True`` if the detected MIME type is one of ``valid_types``.
    """
    sniffed = magic.detect_from_content(data)
    return cast(str, sniffed.mime_type) in valid_types
def get_buffer_mime_type(buffer):
    """Return the MIME type of the first 128 bytes of ``buffer``.

    Works with either libmagic binding and returns an empty string if
    detection fails for any reason.
    """
    try:
        if not hasattr(magic, 'detect_from_content'):
            # Using python-magic module: https://github.com/ahupp/python-magic
            return magic.from_buffer(buffer[:128], mime=True)
        # Using file-magic module: https://github.com/file/file
        return magic.detect_from_content(buffer[:128]).mime_type
    except Exception:
        return ''
def magic_type(self, data, isdata=False):
    """Return the libmagic description of a file or of raw content.

    Args:
        data: A file name, or the content itself when ``isdata`` is true
            (only its first 512 items are inspected).
        isdata: Whether ``data`` is content rather than a file name.

    Returns:
        The description string, or an error message string when the
        ``magic`` name is unavailable or detection fails.
    """
    try:
        if not isdata:
            return magic.detect_from_filename(data).name
        return magic.detect_from_content(data[0:512]).name
    except NameError:
        return 'Error - file-magic library required.'
    except Exception as e:
        return 'Error getting magic type - %s' % e
def guessWithMagic(content):
    """Return libmagic's detection result for ``content`` as a mapping.

    Args:
        content: Buffer handed to ``magic.detect_from_content``.

    Returns:
        A mapping of the result's field names to their values.
    """
    result = magic.detect_from_content(content)
    # namedtuple results expose no ``__dict__`` on Python 3; ``_asdict()``
    # is the supported accessor. Keep ``__dict__`` as a fallback for result
    # objects that are not namedtuples.
    as_dict = getattr(result, '_asdict', None)
    if as_dict is not None:
        return as_dict()
    return result.__dict__


# #worth to look into
# #https://bitbucket.org/Telofy/utilofies/csvprofiler/0d8cdc3ae5a0a08e7fb5906d96f0d8e2284751d1/utilofies/bslib.py?at=master#cl-15
# def intelligent_decode(fname):
#     """ One problem remains in the latest version of UnicodeDammit, namely
#     that pages that have beautifully declared encodings but contain one
#     small erroneous byte sequence somewhere will fail to be decoded with
#     the mostly correct encodings, while Windows-1252 somehow succeeds, but
#     completely mucks up all umlauts and ligatures. Hence I want to remove
#     Windows-1252 from the potential encodings.
#
#     I don't fall back on cchardet just yet.
#     """
#     detector = bs4.dammit.EncodingDetector(fname)
#     # Fall back on forcing it to UTF-8 only if no other encodings
#     # could be found. (I use override_encodings for the HTTP encoding,
#     # which seems at least less reliable to me than the declared encoding.)
#     potential_encodings = \
#         filter(bool, [detector.sniffed_encoding, detector.declared_encoding]
#                + list(detector.override_encodings)) \
#         or ['utf-8']
#     contains_replacement_characters = False
#     tried_encodings = []
#     unicode_markup = None
#     original_encoding = None
#     for encoding in potential_encodings:
#         tried_encodings.append(encoding)
#         try:
#             unicode_markup = detector.markup.decode(encoding)
#         except Exception as excp:
#             #logger.info('Unsuccessfully tried encoding %s: %r', encoding, excp)
#             print 'Unsuccessfully tried encoding %s: %r', encoding, excp
#         if unicode_markup is not None:
#             original_encoding = encoding
#             break
#     if unicode_markup is None:
#         # Whatever!
#         unicode_markup = detector.markup.decode(
#             potential_encodings[0], 'replace')
#         original_encoding = potential_encodings[0]
#         contains_replacement_characters = True
#     return type(b'MockDammit', (object,), {
#         'contains_replacement_characters': contains_replacement_characters,
#         'original_encoding': original_encoding,
#         'detector': detector,
#         'is_html': detector.is_html,
#         'markup': detector.markup,
#         'tried_encodings': tried_encodings,
#         'unicode_markup': unicode_markup})
def _index_content(self, fileid, path_to_file, mimetype, writer):
    """Index one file.

    The extractor registered for ``mimetype`` in ``EXTRACTORS`` is called
    with the path, the file's bytes and the libmagic result for those
    bytes; its return value is indexed. Without a registered extractor a
    placeholder message is indexed instead.
    """
    if mimetype in EXTRACTORS:
        with open(path_to_file, 'rb') as f:
            document_bytes = f.read()
        detected = detect_from_content(document_bytes)
        extractor = EXTRACTORS[mimetype]
        content = extractor(path_to_file, document_bytes, detected)
    else:
        content = "Missing extractor for {}".format(mimetype)
    writer.add_document(fileid=fileid, content=content)
def get_file_mime(fobj) -> str:
    """
    Detect the MIME type from the first 1024 bytes of a file object and
    put the read position back where it was.

    :param fobj: file
        - new/replace - django.core.files.uploadedfile.InMemoryUploadedFile (for newly created)
        - edit (w/o changing file itself) - django.core.files.base.File
    :return: file mimetype
    :note: https://stackoverflow.com/questions/4853581/django-get-uploaded-file-type-mimetype
    """
    saved_offset = fobj.tell()
    fobj.seek(0)
    head = fobj.read(1024)
    detected = magic.detect_from_content(head)
    mime = detected.mime_type
    fobj.seek(saved_offset)
    return mime
def constraint(value):
    """Check that ``value`` has one of the allowed MIME types.

    Returns:
        ``None`` when the type detected for ``str(value)`` is in
        ``allowed_mime_types``, otherwise a translated error message.
    """
    # Hack for temporary compatibility with both python 'magic' modules...
    if not hasattr(magic, 'detect_from_content'):
        mime_type = magic.from_buffer(str(value), mime=True)
    else:
        mime_type = magic.detect_from_content(str(value)).mime_type

    if mime_type not in allowed_mime_types:
        return _("Detected data type %(detected)s. Expected %(expected)s.",
                 detected=mime_type,
                 expected=', '.join(allowed_mime_types))
    return None
def get_mime_type(data: bytes) -> typing.Union[typing.Tuple[str, str], typing.Tuple[None, None]]:
    """Get mime-type information based on the provided bytes object.

    Args:
        data: Binary data.

    Returns:
        typing.Tuple[str, str]: Identified mime information and mime-type.
        If **magic** is not available, returns *None, None*.
        E.g. *"ELF 64-bit LSB shared object, x86-64, version 1 (SYSV)",
        "application/x-sharedlib"*
    """
    if magic is not None:
        sniffed = magic.detect_from_content(data)
        return sniffed.name, sniffed.mime_type
    return None, None
def get_image_metadata_from_file(file_like):
    """
    Receive a valid image file and returns a 2-tuple of two strings:
        [0]: Image format (i.e. 'jpg', 'gif' or 'png')
        [1]: InMemoryUploadedFile-friendly save format (i.e. 'image/jpeg')
    image_format, in_memory_file_type

    The first 1024 bytes are read for detection and the file is rewound
    to the start afterwards.
    """
    has_python_magic = hasattr(magic, 'from_buffer')
    head = file_like.read(1024)
    if has_python_magic:
        mime_type = magic.from_buffer(head, mime=True)
    else:
        mime_type = magic.detect_from_content(head).mime_type
    file_like.seek(0)
    return MIME_TYPE_TO_PIL_IDENTIFIER[mime_type], mime_type
def _analyze_file(self, path_to_file):
    """Compute the MD5 hash and MIME type of a file.

    libmagic reports OOXML documents as ``application/octet-stream`` with
    the description ``Microsoft OOXML``; in that case the file extension
    (``.pptx``, ``.docx``, ``.xlsx``) selects the specific MIME type.

    Args:
        path_to_file: Path of the file to analyse.

    Returns:
        dict: ``md5hash`` (hex digest), ``mimetype`` and ``text`` (the
        file's base name).
    """
    # Context manager so the handle is closed deterministically.
    with open(path_to_file, 'rb') as fobj:
        content = fobj.read()
    md5hash = hashlib.md5(content).hexdigest()
    detected = detect_from_content(content)
    filetype = detected.mime_type
    orig_name = os.path.basename(path_to_file)
    _, orig_ext = os.path.splitext(orig_name)
    if filetype == 'application/octet-stream' and detected.name == 'Microsoft OOXML':
        ooxml_types = {
            '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        }
        # Unknown extensions keep the generic type.
        filetype = ooxml_types.get(orig_ext, filetype)
    return {'md5hash': md5hash, 'mimetype': filetype, 'text': orig_name}
def to_python(self, data):
    """Validate the size and detected content type of an uploaded file.

    Returns:
        The file returned by the parent ``to_python``, rewound to the
        start, or ``None`` if the parent returned ``None``.

    Raises:
        forms.ValidationError: with code ``size`` when the file exceeds
            ``self.max_upload_size``, or code ``content_type`` when the
            type detected from its next 1024 bytes is not in
            ``self.content_types``.
    """
    uploaded = super().to_python(data)
    if uploaded is None:
        return None

    if uploaded.size > self.max_upload_size:
        raise forms.ValidationError(_('File is too big.'), code='size')

    head = uploaded.read(1024)
    detected_type = magic.detect_from_content(head).mime_type
    if detected_type not in self.content_types:
        raise forms.ValidationError(
            _('Filetype not supported.'), code='content_type'
        )

    uploaded.seek(0)
    return uploaded
def get_magic_content_type(input):  # pylint: disable=redefined-builtin
    """Get content-type based on magic library

    As libmagic bindings are provided via several 'magic' packages, we try
    them in order. ``input`` may be a buffer or a file-like object; a
    seekable object is rewound and a readable one is read in full.

    Returns the detected MIME type, or None when no binding is available
    or detection yields nothing.
    """
    if magic is None:
        return None

    if hasattr(input, 'seek'):
        input.seek(0)
    if hasattr(input, 'read'):
        input = input.read()

    if hasattr(magic, 'detect_from_content'):
        result = magic.detect_from_content(input)  # pylint: disable=no-member
        return result.mime_type if result else None
    if hasattr(magic, 'from_buffer'):
        return magic.from_buffer(input, mime=True)
    return None
def get_file_kernel_ver(kpath):
    """Return the version of a kernel file.

    The libmagic description of the file is split on commas; the first
    word of each field is used as a (lower-cased) key and the second word,
    if any, as its value.

    Args:
        kpath: Path to the kernel image; ``~`` is expanded.

    Returns:
        The value of the ``version`` field.

    Raises:
        RuntimeError: if the description has no ``version`` field.
    """
    kpath = os.path.abspath(os.path.expanduser(kpath))
    _kinfo = {}
    with open(kpath, 'rb') as f:
        _m = magic.detect_from_content(f.read())
    for i in _m.name.split(','):
        l = i.strip().split()
        if not l:
            # Empty field (e.g. doubled or trailing comma): nothing to
            # index, and l[0] would raise IndexError.
            continue
        # Note: this only grabs the version number.
        # If we want to get e.g. the build user/machine, date, etc.,
        # then we need to join l[1:].
        # We technically don't even need a dict, either. We can just iterate.
        # TODO.
        _kinfo[l[0].lower()] = (l[1] if len(l) > 1 else None)
    if 'version' not in _kinfo:
        raise RuntimeError(
            'Cannot deterimine the version of {0}'.format(kpath))
    return _kinfo['version']
def _file_magic(file: File) -> FileMagic:
    """Returns the file magic namedtuple from the respective file.

    Accepts raw bytes (first 1024 bytes inspected), a path given as
    ``str`` or ``Path``, or one of ``FILE_LIKE_OBJECTS``.

    Raises:
        FileNotFoundError: if the path does not point to a regular file.
        TypeError: for any other kind of argument.
    """
    if isinstance(file, bytes):
        return detect_from_content(file[:1024])    # Fix issue #350.

    if isinstance(file, str):
        # Handle string paths exactly like Path objects below.
        file = Path(file)

    if isinstance(file, Path):
        if not file.is_file():
            raise FileNotFoundError(str(file))
        return detect_from_filename(str(file))

    if isinstance(file, FILE_LIKE_OBJECTS):
        return detect_from_fobj(file)

    raise TypeError(f'Cannot read MIME type from {type(file)}.')
def extract_images(child, namespaces, start=0):
    """Extract draw:image with binary-data and replace by href

    Every ``draw:image`` found is expected to hold exactly one
    ``office:binary-data`` child; it is removed and the image gets an
    ``xlink:href`` pointing at ``Pictures/image<N><ext>``.

    Args:
        child: Element on which the ``//draw:image`` XPath is evaluated.
        namespaces: Prefix-to-URI mapping; must contain ``draw``,
            ``office`` and ``xlink``.
        start: Number given to the first image.

    Returns:
        list: ``(name, data, mime_type)`` tuples, one per image.
    """
    import magic
    images = []
    for i, image in enumerate(
            child.xpath('//draw:image', namespaces=namespaces), start):
        binary_data, = image.xpath(
            './office:binary-data', namespaces=namespaces)
        data = base64.b64decode(binary_data.text)
        if hasattr(magic, 'from_buffer'):
            mime_type = magic.from_buffer(data, mime=True)
        else:
            # Not python-magic but file-magic
            mime_type = magic.detect_from_content(data).mime_type
        # guess_extension() returns None for unknown types; without the
        # fallback the name would end in the literal text "None".
        extension = mimetypes.guess_extension(mime_type) or ''
        name = 'Pictures/image%s%s' % (i, extension)
        image.remove(binary_data)
        xlink_ns = namespaces['xlink']
        image.attrib['{%s}href' % xlink_ns] = name
        images.append((name, data, mime_type))
    return images
def get_magics(path):
    """Return the ``(format, encoding)`` detected for the file at ``path``.

    libmagic is poor at recognising text-based formats such as CSV, JSON,
    YAML or XML, so it is only used here to spot a few binary formats and
    a UTF-8 byte order mark. Only the first 2048 bytes are inspected.

    Both file-magic and python-magic are supported, as either may be
    installed under the name ``magic`` depending on the distribution.

    Returns:
        ``(None, None)`` when ``MAGIC`` is falsy; otherwise a tuple whose
        items are each ``None`` when nothing was recognised.
    """
    if not MAGIC:
        return None, None

    with open(path, 'rb') as file:
        header = file.read(2048)

    if not hasattr(magic, 'detect_from_content'):  # python-magic
        mime_type = magic.from_buffer(header, mime=True)
        magic_name = magic.from_buffer(header)
    else:  # file-magic
        res = magic.detect_from_content(header)
        mime_type = res.mime_type
        magic_name = res.name

    mime_to_format = {
        'application/pgp': 'gpg',
        'application/x-sqlite3': 'sqlite3'
    }
    name_to_format = {'KDBX': 'kdbx', 'openssl': 'openssl', 'PGP': 'gpg'}

    frmt = mime_to_format.get(mime_type, None)
    # A marker found in the description overrides the MIME-based guess;
    # when several markers match, the last one in the table wins.
    for marker in name_to_format:
        if marker in magic_name:
            frmt = name_to_format[marker]

    encoding = 'utf-8-sig' if 'UTF-8 Unicode (with BOM)' in magic_name else None
    return frmt, encoding
def extract_dir(data, start, outdir, file_number=65536, dir_number=65536):
    """Unpack the archive entries found in ``data`` from ``start`` on.

    Each entry is a header (``header_fmt``), a name whose length is header
    field 0, and a payload sized by header field 3. A payload detected as
    ``application/zlib`` is a file and is written decompressed into
    ``outdir``; anything else is a sub-directory whose header field 3
    packs its file count (upper part) and directory count (low 16 bits).

    Args:
        data: Whole archive buffer.
        start: Offset of the first entry to read.
        outdir: Existing directory receiving the entries.
        file_number: Number of files still expected at this level.
        dir_number: Number of directories still expected at this level.

    Returns:
        Number of bytes consumed from ``start``.

    Raises:
        Exception: when an entry appears after its kind's count hit zero.
    """
    global header_fmt, header_len
    cursor = start
    while (file_number > 0 or dir_number > 0) and cursor < len(data):
        header = struct.unpack(header_fmt, data[cursor:cursor + header_len])
        cursor += header_len
        name_len = header[0]
        raw_name = data[cursor:cursor + name_len]
        if b'\x00' in raw_name:
            # this means end too
            break
        filename = raw_name.decode("ascii")
        cursor += name_len
        size = header[3]
        sniffed = magic.detect_from_content(data[cursor:cursor + size])
        if sniffed.mime_type == "application/zlib":
            # file
            if file_number == 0:
                raise Exception("invalid file number for directory \"%s\"" %
                                (outdir))
            blob = data[cursor:cursor + size]
            cursor += size
            with open(os.path.join(outdir, filename), "wb") as f:
                f.write(zlib.decompress(blob))
            file_number -= 1
        else:
            # dir
            if dir_number == 0:
                raise Exception(
                    "invalid directory number for directory \"%s\"" % (outdir))
            n_path = os.path.join(outdir, filename)
            os.mkdir(n_path)
            cursor += extract_dir(data, cursor, n_path,
                                  size // 0x10000, size & 0xffff)
            dir_number -= 1
    return cursor - start
def get_file_type(filename_or_file):
    """
    Get mime_type and encoding of file `filename_or_file`.

    Handles both magic libraries. An open file is read in full from its
    current position, which is restored afterwards.

    :param filename_or_file: filename or open file
    :type filename_or_file: str or file
    :return: mime_type and encoding of `filename_or_file`
    :rtype: FileType
    :raises ValueError: if the argument is neither seekable nor a string
    :raises RuntimeError: if the installed "magic" library is not recognised
    """
    if hasattr(filename_or_file, 'seek'):
        saved_pos = filename_or_file.tell()
        txt = filename_or_file.read()
        filename_or_file.seek(saved_pos)
    elif not isinstance(filename_or_file, string_types):
        raise ValueError(
            'Argument "filename_or_file" has unknown type {!r}.'.format(
                type(filename_or_file)))
    else:
        with open(filename_or_file, 'rb') as fp:
            txt = fp.read()

    if hasattr(magic, 'from_file'):
        # python-magic answers "<mime type>;<space>charset=<encoding>".
        combined = magic.Magic(mime=True, mime_encoding=True).from_buffer(txt)
        mime_type, charset = combined.split(';')
        encoding = charset.rpartition('=')[2]
        text = magic.Magic().from_buffer(txt)
    elif hasattr(magic, 'detect_from_filename'):
        fm = magic.detect_from_content(txt)
        mime_type, encoding, text = fm.mime_type, fm.encoding, fm.name
    else:
        raise RuntimeError('Unknown version or type of "magic" library.')

    # auto detect utf-8 with BOM
    has_bom = encoding == 'utf-8' and txt.startswith(codecs.BOM_UTF8)
    if has_bom:
        encoding = 'utf-8-sig'
    return FileType(mime_type, encoding, text)
def decompressor(self):
    """Decompress ``self.data`` when needed and decode it as UTF-8.

    On exit ``self.data`` is replaced by ``self.raw``. Always returns an
    empty tuple.
    """
    # TODO: use mime module as fallback?
    # https://docs.python.org/3/library/mimetypes.html
    # VERY less-than-ideal since it won't work without self.args['logfile']
    # (and has iffy detection at best, since it relies on file extensions).
    # Determine what decompressor to use, if we need to.
    if has_magic:
        _mime = magic.detect_from_content(self.data).mime_type
        # NOTE(review): presumably cmprsn_map maps MIME types to module
        # names (or a falsy value for uncompressed data); an unlisted MIME
        # type would raise KeyError here if it is a plain dict - confirm.
        self.decompress = cmprsn_map[_mime]
        if self.decompress:
            # The mapped value is used as the name of a module exposing
            # decompress().
            import importlib
            decmp = importlib.import_module(self.decompress)
            self.raw = decmp.decompress(self.data)
    else:
        # Assume that it's text and that it isn't compressed.
        # We'll get a UnicodeDecodeError exception if it isn't.
        pass
    # Attempted on the original self.data in every case; on failure
    # self.raw keeps whatever value it already had.
    try:
        self.raw = self.data.decode('utf-8')
    except UnicodeDecodeError:
        pass
    # NOTE(review): assumes self.raw is initialised elsewhere for the case
    # where nothing above assigned it - confirm.
    self.data = self.raw
    return ()
def detect_ext(self, data):
    """Guess a file extension for ``data``.

    Text content becomes ``.bin`` (encoding ``unknown-8bit``), ``.ks`` or
    ``.tjs`` (when script markers are found in the decoded text) or
    ``.txt``. Other content gets the extension ``mimetypes`` suggests for
    the detected MIME type, with ``.jpeg``, ``.oga`` and ``.asf`` mapped
    to ``.jpg``, ``.ogg`` and ``.wmv``.
    """
    import magic
    fmagic = magic.detect_from_content(data)

    if not fmagic.mime_type.startswith('text/'):
        guessed = mimetypes.guess_extension(fmagic.mime_type)
        preferred = {'.jpeg': '.jpg', '.oga': '.ogg', '.asf': '.wmv'}
        return preferred.get(guessed, guessed)

    if fmagic.encoding == 'unknown-8bit':
        return '.bin'

    text = data.decode(fmagic.encoding)
    if any(marker in text for marker in ('@return', '*start', '.ks', '[w]')):
        return '.ks'
    if any(marker in text for marker in ('.tjs', '%[', '];')):
        return '.tjs'
    return '.txt'
def extract_dir(data, start, outdir, file_number=65536, dir_number=65536):
    """Unpack the archive entries found in ``data`` from ``start`` on.

    Each entry is a header (``header_fmt``), a name whose length is header
    field 0, and a payload sized by header field 3. A payload detected as
    ``application/zlib`` is a file and is written decompressed into
    ``outdir``; anything else is a sub-directory whose header field 3
    packs its file count (upper part) and directory count (low 16 bits).

    Args:
        data: Whole archive buffer.
        start: Offset of the first entry to read.
        outdir: Existing directory receiving the entries.
        file_number: Number of files still expected at this level.
        dir_number: Number of directories still expected at this level.

    Returns:
        Number of bytes consumed from ``start``.

    Raises:
        Exception: when an entry appears after its kind's count hit zero.
    """
    global header_fmt, header_len
    offset = start
    while (file_number > 0 or dir_number > 0) and offset < len(data):
        header = struct.unpack(header_fmt, data[offset:offset + header_len])
        offset += header_len
        name_bytes = data[offset:offset + header[0]]
        if b'\x00' in name_bytes:
            # this means end too
            break
        filename = name_bytes.decode("ascii")
        offset += header[0]
        payload_len = header[3]
        payload = data[offset:offset + payload_len]
        is_file = (magic.detect_from_content(payload).mime_type
                   == "application/zlib")
        if not is_file:
            # dir
            if dir_number == 0:
                raise Exception("invalid directory number for directory \"%s\"" % (outdir))
            n_file_number = payload_len // 0x10000
            n_dir_number = payload_len & 0xffff
            n_path = os.path.join(outdir, filename)
            os.mkdir(n_path)
            offset += extract_dir(data, offset, n_path, n_file_number, n_dir_number)
            dir_number -= 1
            continue
        # file
        if file_number == 0:
            raise Exception("invalid file number for directory \"%s\"" % (outdir))
        offset += payload_len
        with open(os.path.join(outdir, filename), "wb") as f:
            f.write(zlib.decompress(payload))
        file_number -= 1
    return offset - start
def test_detect_from_content(self):
    """Detect the fixture file's type from its first 4096 bytes."""
    # Read bytes, not text: detect_from_content() hands the buffer to
    # libmagic, and a decoded str is not the file's raw content (text
    # mode can also fail to decode arbitrary files on Python 3).
    with open(self.filename, 'rb') as fobj:
        result = magic.detect_from_content(fobj.read(4096))
    self.assert_result(result)
def get_content_mimetype(content: bytes) -> str:
    """
    Detect the MIME type of ``content`` with libmagic, replacing it with
    the entry from ``MIME_OVERRIDES`` when one exists for that type.
    """
    sniffed = magic.detect_from_content(content)
    mime = sniffed.mime_type
    return MIME_OVERRIDES.get(mime, mime)