def create_tar(file_list):
    """Create a gzipped tar archive containing the files in *file_list*.

    The list is consumed as a work queue: symlink targets and (for ELF
    binaries) the dynamic-linker interpreter reported by libmagic are
    appended so they end up in the archive too.  Entries matching any
    EXCLUDE_PREFIXES are skipped.  *file_list* is mutated (emptied).

    Returns the path of the temporary ``.tgz`` file (not deleted on close;
    the caller is responsible for removing it).
    """
    tmp_tar_file = NamedTemporaryFile(delete=False, suffix=".tgz")
    # Guard against adding the same path twice (e.g. symlink cycles would
    # otherwise re-enqueue an already-archived target forever).
    archived = set()
    with tarfile.open(tmp_tar_file.name, "w:gz") as tar:
        while len(file_list) > 0:
            name = file_list[0]
            file_list.remove(name)
            if name in archived:
                continue
            if any(name.startswith(prefix) for prefix in EXCLUDE_PREFIXES):
                continue
            tar.add(name)
            archived.add(name)
            # Follow symbolic links
            if islink(name):
                link_target = os.readlink(name)
                # readlink may return a relative path; resolve it against
                # the symlink's own directory.
                link_target = join(dirname(name), link_target)
                if link_target not in file_list:
                    file_list.append(link_target)
            else:
                if isfile(name):
                    # Binary mode: libmagic inspects raw bytes, and a
                    # text-mode handle can choke on non-UTF-8 content.
                    with open(name, "rb") as f:
                        magic_name = magic.detect_from_fobj(f).name
                    dl = re.findall(
                        "dynamically linked, interpreter ([^,]*)",
                        magic_name)
                    # BUG FIX: the original tested `dl in file_list`, i.e.
                    # compared the *list* of matches against the queue, which
                    # is never true; compare the matched interpreter path.
                    if len(dl) == 0 or dl[0] in file_list:
                        continue
                    file_list.append(dl[0])
    return tmp_tar_file.name
def img(request, filename):
    """Serve the stored image named *filename* with a sniffed MIME type.

    Looks the image up by name, detects its content type from the stored
    file via libmagic, and streams it back.  Returns the standard 404
    response when no matching Image exists.
    """
    try:
        im = Image.objects.get(name=filename)
        # FIX: renamed local `type` -> `mime_type`; the original shadowed
        # the `type` builtin.
        mime_type = magic.detect_from_fobj(im.file).mime_type
        return HttpResponse(im.file, content_type=mime_type)
    except Image.DoesNotExist:
        return defaults.page_not_found(request, None)
def test_detect_from_fobj(self):
    """Detection from an open file object matches the expected result."""
    if SKIP_FROM_DESCRIPTOR:
        self.skipTest(
            "magic_descriptor is broken in this version of libmagic")
    with open(self.filename) as fobj:
        self.assert_result(magic.detect_from_fobj(fobj))
def _file_magic(file: File) -> FileMagic:
    """Returns the file magic namedtuple from the respective file."""
    if isinstance(file, bytes):
        # Only sniff the leading bytes. Fix issue #350.
        return detect_from_content(file[:1024])

    if isinstance(file, str):
        # Normalize plain strings to Path and recurse.
        return _file_magic(Path(file))

    if isinstance(file, Path):
        if not file.is_file():
            raise FileNotFoundError(str(file))
        return detect_from_filename(str(file))

    if isinstance(file, FILE_LIKE_OBJECTS):
        return detect_from_fobj(file)

    raise TypeError(f'Cannot read MIME type from {type(file)}.')
def open(self, url: str, mode: str = "rb") -> Any:
    """Returns a file-like object for a particular URL opened in mode.

    If the file is remote, it will be downloaded and locally cached
    """
    # Install the configured handlers globally so the urlopen calls below
    # honour them.
    urllib.request.install_opener(
        urllib.request.build_opener(*self._handlers))
    try:
        fp = urllib.request.urlopen(url, context=self._context)
    except error.URLError as excp:
        if excp.args:
            # TODO: As of python3.7 this can be removed
            # Detect a certificate-verification failure in a way that also
            # works on Pythons without ssl.SSLCertVerificationError.
            unverified_retrieval = (
                hasattr(ssl, "SSLCertVerificationError")
                and isinstance(excp.args[0], ssl.SSLCertVerificationError)
            ) or (isinstance(excp.args[0], ssl.SSLError)
                  and excp.args[0].reason == "CERTIFICATE_VERIFY_FAILED")
            if unverified_retrieval:
                # Retry exactly once with verification disabled, loudly.
                vollog.warning(
                    "SSL certificate verification failed: attempting UNVERIFIED retrieval"
                )
                non_verifying_ctx = ssl.SSLContext()
                non_verifying_ctx.check_hostname = False
                non_verifying_ctx.verify_mode = ssl.CERT_NONE
                fp = urllib.request.urlopen(url, context=non_verifying_ctx)
            else:
                raise excp
        else:
            raise excp
    with contextlib.closing(fp) as fp:
        # Cache the file locally
        if not self.uses_cache(url):
            # ZipExtFiles (files in zips) cannot seek, so must be cached in order to use and/or decompress
            curfile = urllib.request.urlopen(url, context=self._context)
        else:
            # TODO: find a way to check if we already have this file (look at http headers?)
            # NOTE(review): 1028 * 8 looks like a typo for 1024 * 8 — only
            # affects the download chunk size, left as-is.
            block_size = 1028 * 8
            # Cache key is a digest of the URL, so the same URL always maps
            # to the same cache file.
            temp_filename = os.path.join(
                constants.CACHE_PATH,
                "data_" + hashlib.sha512(bytes(
                    url, 'raw_unicode_escape')).hexdigest() + ".cache")
            if not os.path.exists(temp_filename):
                vollog.debug("Caching file at: {}".format(temp_filename))
                try:
                    content_length = fp.info().get('Content-Length', -1)
                except AttributeError:
                    # If our fp doesn't have an info member, carry on gracefully
                    content_length = -1
                cache_file = open(temp_filename, "wb")
                count = 0
                block = fp.read(block_size)
                while block:
                    count += len(block)
                    if self._progress_callback:
                        # max() guards against a missing/false Content-Length
                        # pushing the percentage over 100.
                        self._progress_callback(
                            count * 100 / max(count, int(content_length)),
                            "Reading file {}".format(url))
                    cache_file.write(block)
                    block = fp.read(block_size)
                cache_file.close()
            # Re-open the cache with a different mode
            # Since we don't want people thinking they're able to save to the cache file,
            # open it in read mode only and allow breakages to happen if they wanted to write
            curfile = open(temp_filename, mode="rb")
        # Determine whether the file is a particular type of file, and if so, open it as such
        IMPORTED_MAGIC = False
        if HAS_MAGIC:
            # Peel nested compression layers (e.g. .xz inside .gz) until the
            # detected type is no longer a supported compressed format.
            stop = False
            while not stop:
                detected = None
                try:
                    # Detect the content
                    detected = magic.detect_from_fobj(curfile)
                    IMPORTED_MAGIC = True
                    # This is because python-magic and file provide a magic module
                    # Only file's python has magic.detect_from_fobj
                except (AttributeError, IOError):
                    pass
                if detected:
                    if detected.mime_type == 'application/x-xz':
                        curfile = cascadeCloseFile(
                            lzma.LZMAFile(curfile, mode), curfile)
                    elif detected.mime_type == 'application/x-bzip2':
                        curfile = cascadeCloseFile(
                            bz2.BZ2File(curfile, mode), curfile)
                    elif detected.mime_type == 'application/x-gzip':
                        curfile = cascadeCloseFile(
                            gzip.GzipFile(fileobj=curfile, mode=mode), curfile)
                    if detected.mime_type in [
                            'application/x-xz', 'application/x-bzip2',
                            'application/x-gzip'
                    ]:
                        # Read and rewind to ensure we're inside any compressed file layers
                        curfile.read(1)
                        curfile.seek(0)
                    else:
                        stop = True
                else:
                    stop = True
        if not IMPORTED_MAGIC:
            # Somewhat of a hack, but prevents a hard dependency on the magic module
            # Fall back to stripping recognised compression extensions off
            # the URL path, innermost-last.
            parsed_url = urllib.parse.urlparse(url)
            url_path = parsed_url.path
            stop = False
            while not stop:
                url_path_split = url_path.split(".")
                url_path_list, extension = url_path_split[:-1], url_path_split[
                    -1]
                url_path = ".".join(url_path_list)
                if extension == "xz":
                    curfile = cascadeCloseFile(
                        lzma.LZMAFile(curfile, mode), curfile)
                elif extension == "bz2":
                    curfile = cascadeCloseFile(
                        bz2.BZ2File(curfile, mode), curfile)
                elif extension == "gz":
                    curfile = cascadeCloseFile(
                        gzip.GzipFile(fileobj=curfile, mode=mode), curfile)
                else:
                    stop = True
        # Fallback in case the file doesn't exist
        if curfile is None:
            raise ValueError("URL does not reference an openable file")
        return curfile
def open(self, url: str, mode: str = "rb") -> Any:
    """Returns a file-like object for a particular URL opened in mode.

    If the file is remote, it will be downloaded and locally cached
    """
    urllib.request.install_opener(
        urllib.request.build_opener(*self._handlers))
    try:
        fp = urllib.request.urlopen(url, context=self._context)
    except error.URLError as excp:
        if excp.args:
            if isinstance(excp.args[0], ssl.SSLCertVerificationError):
                # Retry exactly once with verification disabled, loudly.
                vollog.warning(
                    "SSL certificate verification failed: attempting UNVERIFIED retrieval"
                )
                non_verifying_ctx = ssl.SSLContext()
                non_verifying_ctx.check_hostname = False
                non_verifying_ctx.verify_mode = ssl.CERT_NONE
                fp = urllib.request.urlopen(url, context=non_verifying_ctx)
            else:
                raise excp
        else:
            raise excp
    with contextlib.closing(fp) as fp:
        # Cache the file locally
        parsed_url = urllib.parse.urlparse(url)
        if parsed_url.scheme == 'file':
            # ZipExtFiles (files in zips) cannot seek, so must be cached in order to use and/or decompress
            curfile = urllib.request.urlopen(url, context=self._context)
        else:
            # TODO: find a way to check if we already have this file (look at http headers?)
            # NOTE(review): 1028 * 8 looks like a typo for 1024 * 8 — only
            # affects the download chunk size, left as-is.
            block_size = 1028 * 8
            # Cache key is a digest of the URL, so the same URL always maps
            # to the same cache file.
            temp_filename = os.path.join(
                constants.CACHE_PATH,
                "data_" + hashlib.sha512(bytes(url, 'latin-1')).hexdigest())
            if not os.path.exists(temp_filename):
                vollog.debug("Caching file at: {}".format(temp_filename))
                try:
                    content_length = fp.info().get('Content-Length', -1)
                except AttributeError:
                    # If our fp doesn't have an info member, carry on gracefully
                    content_length = -1
                # FIX: the cache file was opened without `with`, leaking the
                # handle if the download raised mid-loop.
                with open(temp_filename, "wb") as cache_file:
                    count = 0
                    while True:
                        block = fp.read(block_size)
                        count += len(block)
                        if not block:
                            break
                        if self._progress_callback:
                            # max() guards against a missing/false
                            # Content-Length pushing the percentage over 100.
                            self._progress_callback(
                                count * 100 / max(count, int(content_length)),
                                "Reading file {}".format(url))
                        cache_file.write(block)
            # Re-open the cache with a different mode
            curfile = open(temp_filename, mode="rb")
        # Determine whether the file is a particular type of file, and if so, open it as such
        IMPORTED_MAGIC = False
        if HAS_MAGIC:
            # Peel nested compression layers until the detected type is no
            # longer a supported compressed format.
            while True:
                detected = None
                try:
                    # Detect the content
                    detected = magic.detect_from_fobj(curfile)
                    IMPORTED_MAGIC = True
                    # This is because python-magic and file provide a magic module
                    # Only file's python has magic.detect_from_fobj
                except Exception:
                    # FIX: was `except AttributeError: pass` followed by a
                    # bare `except: pass`; the bare except also swallowed
                    # SystemExit/KeyboardInterrupt. Detection stays
                    # best-effort, but only ordinary exceptions are ignored.
                    pass
                if detected:
                    if detected.mime_type == 'application/x-xz':
                        curfile = lzma.LZMAFile(curfile, mode)
                    elif detected.mime_type == 'application/x-bzip2':
                        curfile = bz2.BZ2File(curfile, mode)
                    elif detected.mime_type == 'application/x-gzip':
                        curfile = gzip.GzipFile(fileobj=curfile, mode=mode)
                    else:
                        break
                else:
                    break
                # Read and rewind to ensure we're inside any compressed file layers
                curfile.read(1)
                curfile.seek(0)
        if not IMPORTED_MAGIC:
            # Somewhat of a hack, but prevents a hard dependency on the magic module
            # Fall back to stripping recognised compression extensions off
            # the URL path, innermost-last.
            url_path = parsed_url.path
            while True:
                if url_path.endswith(".xz"):
                    curfile = lzma.LZMAFile(curfile, mode)
                elif url_path.endswith(".bz2"):
                    curfile = bz2.BZ2File(curfile, mode)
                elif url_path.endswith(".gz"):
                    curfile = gzip.GzipFile(fileobj=curfile, mode=mode)
                else:
                    break
                url_path = ".".join(url_path.split(".")[:-1])
        # Fallback in case the file doesn't exist
        if curfile is None:
            raise ValueError("URL does not reference an openable file")
        return curfile
else: print "[verbosity]\tVerbosity level '%s' not known. Setted to 'normal'".format( args.verbosity_level[0]) settings.verbosity_level_numeric = 1 print "verbosity level: ", settings.verbosity_level_numeric for path, subdirs, files in os.walk(args.directory[0]): for name in files: if fnmatch(name, args.pattern[0]): total_files += 1 #Statistics try: ftype = magic.from_file(os.path.join(path, name), mime=True) except: f = open(os.path.join(path, name), "r") ftype = magic.detect_from_fobj(f).mime_type f.close() if ftype in type_stat: #Statistics type_stat[ftype] += 1 else: type_stat[ftype] = 1.0 #Metadata research for PDF if ftype == 'application/pdf': if settings.verbosity_level_numeric > 0: print "[analyzing]\t" + os.path.join(path, name) + " ..." current_book = { } #A dictionary containing all the data collected for the book. __TODO__ use a f****n' database current_book['path'] = os.path.join(path, name)
def test_detect_from_fobj(self):
    """Detection from an open file object matches the expected result."""
    with open(self.filename) as fobj:
        self.assert_result(magic.detect_from_fobj(fobj))