Beispiel #1
0
def create_tar(file_list):
    """Pack the given paths, and the files they depend on, into a gzipped tar.

    Every path is added with ``tar.add`` unless it starts with one of
    ``EXCLUDE_PREFIXES``.  Two kinds of dependency are followed transitively:

    * for a symbolic link, the link target (joined onto the directory that
      contains the link);
    * for a regular file whose libmagic description contains
      ``dynamically linked, interpreter <path>``, that interpreter path.

    Each path is archived at most once, so symlink cycles and files that
    share a target or an interpreter cannot cause repeated work or an
    endless loop.

    Args:
        file_list: Iterable of path strings to archive.  It is not modified.

    Returns:
        The path of the newly created ``.tgz`` file.  The file is created
        with ``delete=False``, so removing it is left to the caller.
    """
    # Only the name of the temporary file is needed; close the handle right
    # away instead of leaving it open for the lifetime of the process.
    tmp_tar_file = NamedTemporaryFile(delete=False, suffix=".tgz")
    tmp_tar_file.close()

    # Work on a private queue so the caller's list is left untouched, and
    # remember everything ever queued so that nothing is processed twice.
    pending = []
    seen = set()

    def enqueue(path):
        if path not in seen:
            seen.add(path)
            pending.append(path)

    for path in file_list:
        enqueue(path)

    with tarfile.open(tmp_tar_file.name, "w:gz") as tar:
        index = 0
        while index < len(pending):
            name = pending[index]
            index += 1

            if any(name.startswith(prefix) for prefix in EXCLUDE_PREFIXES):
                continue
            tar.add(name)

            if islink(name):
                # Follow symbolic links
                enqueue(join(dirname(name), os.readlink(name)))
            elif isfile(name):
                with open(name, "rb") as f:
                    magic_name = magic.detect_from_fobj(f).name
                interpreters = re.findall(
                    "dynamically linked, interpreter ([^,]*)", magic_name)
                if interpreters:
                    enqueue(interpreters[0])
    return tmp_tar_file.name
Beispiel #2
0
def img(request, filename):
    """Respond with the stored image whose name is ``filename``.

    The response's content type is the MIME type that
    ``magic.detect_from_fobj`` reports for the image's file.  When no
    ``Image`` with that name exists, the result of
    ``defaults.page_not_found(request, None)`` is returned instead.
    """
    try:
        image = Image.objects.get(name=filename)
    except Image.DoesNotExist:
        return defaults.page_not_found(request, None)

    mime_type = magic.detect_from_fobj(image.file).mime_type
    return HttpResponse(image.file, content_type=mime_type)
    def test_detect_from_fobj(self):
        """Check detection of ``self.filename`` through an open file object.

        The test is skipped when ``SKIP_FROM_DESCRIPTOR`` is set.
        """
        if SKIP_FROM_DESCRIPTOR:
            reason = "magic_descriptor is broken in this version of libmagic"
            self.skipTest(reason)

        with open(self.filename) as stream:
            detected = magic.detect_from_fobj(stream)
        self.assert_result(detected)
Beispiel #4
0
def _file_magic(file: File) -> FileMagic:
    """Detect the file magic of *file* and return the detection result.

    Accepted inputs, checked in this order:

    * ``bytes``: detection runs on the content itself;
    * ``str``: treated as a filesystem path;
    * ``Path``: must point to an existing regular file;
    * any of ``FILE_LIKE_OBJECTS``: detection runs on the file object.

    Raises:
        FileNotFoundError: if a path does not refer to a regular file.
        TypeError: if *file* is of none of the supported types.
    """
    if isinstance(file, bytes):
        # Only the first 1024 bytes are inspected (fix for issue #350).
        return detect_from_content(file[:1024])

    if isinstance(file, str):
        # A string is a path; let the Path branch below deal with it.
        file = Path(file)

    if isinstance(file, Path):
        if not file.is_file():
            raise FileNotFoundError(str(file))
        return detect_from_filename(str(file))

    if isinstance(file, FILE_LIKE_OBJECTS):
        return detect_from_fobj(file)

    raise TypeError(f'Cannot read MIME type from {type(file)}.')
Beispiel #5
0
    def open(self, url: str, mode: str = "rb") -> Any:
        """Returns a file-like object for a particular URL opened in mode.

        If ``self.uses_cache(url)`` is true the content is downloaded into a
        file under ``constants.CACHE_PATH`` (named after the SHA-512 of the
        URL) and read back from there; otherwise the URL is opened directly.
        Layers of xz, bzip2 and gzip compression are then unwrapped: by
        content when ``magic.detect_from_fobj`` is usable, otherwise by the
        extensions found in the URL path.

        Args:
            url: The URL to open
            mode: The mode handed to the decompression wrappers (the cache
                file itself is always opened read-only, in "rb" mode)

        Returns:
            A file-like object for the (decompressed) content

        Raises:
            error.URLError: If the URL cannot be opened, except when the only
                problem is a failed certificate verification, in which case
                one unverified retry is made
            ValueError: If no file object could be produced
        """
        urllib.request.install_opener(
            urllib.request.build_opener(*self._handlers))

        try:
            fp = urllib.request.urlopen(url, context=self._context)
        except error.URLError as excp:
            if not excp.args:
                raise
            # TODO: As of python3.7 this can be removed
            unverified_retrieval = (
                hasattr(ssl, "SSLCertVerificationError")
                and isinstance(excp.args[0], ssl.SSLCertVerificationError)
            ) or (isinstance(excp.args[0], ssl.SSLError)
                  and excp.args[0].reason == "CERTIFICATE_VERIFY_FAILED")
            if not unverified_retrieval:
                raise
            vollog.warning(
                "SSL certificate verification failed: attempting UNVERIFIED retrieval"
            )
            non_verifying_ctx = ssl.SSLContext()
            non_verifying_ctx.check_hostname = False
            non_verifying_ctx.verify_mode = ssl.CERT_NONE
            fp = urllib.request.urlopen(url, context=non_verifying_ctx)

        with contextlib.closing(fp) as fp:
            # Cache the file locally

            if not self.uses_cache(url):
                # ZipExtFiles (files in zips) cannot seek, so must be cached in order to use and/or decompress
                curfile = urllib.request.urlopen(url, context=self._context)
            else:
                # TODO: find a way to check if we already have this file (look at http headers?)
                # NOTE(review): 1028 looks like a typo for 1024; it only affects the read size
                block_size = 1028 * 8
                temp_filename = os.path.join(
                    constants.CACHE_PATH,
                    "data_" + hashlib.sha512(bytes(
                        url, 'raw_unicode_escape')).hexdigest() + ".cache")

                if not os.path.exists(temp_filename):
                    vollog.debug("Caching file at: {}".format(temp_filename))

                    try:
                        content_length = fp.info().get('Content-Length', -1)
                    except AttributeError:
                        # If our fp doesn't have an info member, carry on gracefully
                        content_length = -1

                    count = 0
                    try:
                        with open(temp_filename, "wb") as cache_file:
                            block = fp.read(block_size)
                            while block:
                                count += len(block)
                                if self._progress_callback:
                                    self._progress_callback(
                                        count * 100 /
                                        max(count, int(content_length)),
                                        "Reading file {}".format(url))
                                cache_file.write(block)
                                block = fp.read(block_size)
                    except BaseException:
                        # A partially written cache file would be mistaken for a
                        # complete download on the next call (only its existence
                        # is checked), so discard it before re-raising
                        with contextlib.suppress(OSError):
                            os.remove(temp_filename)
                        raise
                # Re-open the cache with a different mode
                # Since we don't want people thinking they're able to save to the cache file,
                # open it in read mode only and allow breakages to happen if they wanted to write
                curfile = open(temp_filename, mode="rb")

        # Determine whether the file is a particular type of file, and if so, open it as such
        IMPORTED_MAGIC = False
        if HAS_MAGIC:
            stop = False
            while not stop:
                detected = None
                try:
                    # Detect the content
                    detected = magic.detect_from_fobj(curfile)
                    IMPORTED_MAGIC = True
                    # This is because python-magic and file provide a magic module
                    # Only file's python has magic.detect_from_fobj
                except (AttributeError, IOError):
                    pass

                if detected:
                    if detected.mime_type == 'application/x-xz':
                        curfile = cascadeCloseFile(
                            lzma.LZMAFile(curfile, mode), curfile)
                    elif detected.mime_type == 'application/x-bzip2':
                        curfile = cascadeCloseFile(bz2.BZ2File(curfile, mode),
                                                   curfile)
                    elif detected.mime_type == 'application/x-gzip':
                        curfile = cascadeCloseFile(
                            gzip.GzipFile(fileobj=curfile, mode=mode), curfile)
                    if detected.mime_type in [
                            'application/x-xz', 'application/x-bzip2',
                            'application/x-gzip'
                    ]:
                        # Read and rewind to ensure we're inside any compressed file layers
                        curfile.read(1)
                        curfile.seek(0)
                    else:
                        stop = True
                else:
                    stop = True

        if not IMPORTED_MAGIC:
            # Somewhat of a hack, but prevents a hard dependency on the magic module
            parsed_url = urllib.parse.urlparse(url)
            url_path = parsed_url.path
            stop = False
            while not stop:
                # Peel one extension off the path per iteration, outermost first
                url_path_split = url_path.split(".")
                url_path_list, extension = url_path_split[:-1], url_path_split[
                    -1]
                url_path = ".".join(url_path_list)
                if extension == "xz":
                    curfile = cascadeCloseFile(lzma.LZMAFile(curfile, mode),
                                               curfile)
                elif extension == "bz2":
                    curfile = cascadeCloseFile(bz2.BZ2File(curfile, mode),
                                               curfile)
                elif extension == "gz":
                    curfile = cascadeCloseFile(
                        gzip.GzipFile(fileobj=curfile, mode=mode), curfile)
                else:
                    stop = True

        # Fallback in case the file doesn't exist
        if curfile is None:
            raise ValueError("URL does not reference an openable file")
        return curfile
Beispiel #6
0
    def open(self, url: str, mode: str = "rb") -> Any:
        """Returns a file-like object for a particular URL opened in mode.

        URLs with the ``file`` scheme are opened directly.  Anything else is
        downloaded into a file under ``constants.CACHE_PATH`` (named after the
        SHA-512 of the URL) and read back from there.  Layers of xz, bzip2 and
        gzip compression are then unwrapped: by content when
        ``magic.detect_from_fobj`` is usable, otherwise by the extensions
        found in the URL path.

        Args:
            url: The URL to open
            mode: The mode handed to the decompression wrappers (the cache
                file itself is always opened in "rb" mode)

        Returns:
            A file-like object for the (decompressed) content

        Raises:
            error.URLError: If the URL cannot be opened, except when the only
                problem is a failed certificate verification, in which case
                one unverified retry is made
            ValueError: If no file object could be produced
        """
        urllib.request.install_opener(
            urllib.request.build_opener(*self._handlers))

        try:
            fp = urllib.request.urlopen(url, context=self._context)
        except error.URLError as excp:
            if not excp.args:
                raise
            if not isinstance(excp.args[0], ssl.SSLCertVerificationError):
                raise
            vollog.warning(
                "SSL certificate verification failed: attempting UNVERIFIED retrieval"
            )
            non_verifying_ctx = ssl.SSLContext()
            non_verifying_ctx.check_hostname = False
            non_verifying_ctx.verify_mode = ssl.CERT_NONE
            fp = urllib.request.urlopen(url, context=non_verifying_ctx)

        with contextlib.closing(fp) as fp:
            # Cache the file locally
            parsed_url = urllib.parse.urlparse(url)

            if parsed_url.scheme == 'file':
                # ZipExtFiles (files in zips) cannot seek, so must be cached in order to use and/or decompress
                curfile = urllib.request.urlopen(url, context=self._context)
            else:
                # TODO: find a way to check if we already have this file (look at http headers?)
                # NOTE(review): 1028 looks like a typo for 1024; it only affects the read size
                block_size = 1028 * 8
                temp_filename = os.path.join(
                    constants.CACHE_PATH, "data_" +
                    hashlib.sha512(bytes(url, 'latin-1')).hexdigest())

                if not os.path.exists(temp_filename):
                    vollog.debug("Caching file at: {}".format(temp_filename))

                    try:
                        content_length = fp.info().get('Content-Length', -1)
                    except AttributeError:
                        # If our fp doesn't have an info member, carry on gracefully
                        content_length = -1

                    count = 0
                    try:
                        with open(temp_filename, "wb") as cache_file:
                            while True:
                                block = fp.read(block_size)
                                if not block:
                                    break
                                count += len(block)
                                if self._progress_callback:
                                    self._progress_callback(
                                        count * 100 /
                                        max(count, int(content_length)),
                                        "Reading file {}".format(url))
                                cache_file.write(block)
                    except BaseException:
                        # A partially written cache file would be mistaken for a
                        # complete download on the next call (only its existence
                        # is checked), so discard it before re-raising
                        with contextlib.suppress(OSError):
                            os.remove(temp_filename)
                        raise
                # Re-open the cache with a different mode
                curfile = open(temp_filename, mode="rb")

        # Determine whether the file is a particular type of file, and if so, open it as such
        IMPORTED_MAGIC = False
        if HAS_MAGIC:
            while True:
                detected = None
                try:
                    # Detect the content
                    detected = magic.detect_from_fobj(curfile)
                    IMPORTED_MAGIC = True
                    # This is because python-magic and file provide a magic module
                    # Only file's python has magic.detect_from_fobj
                except AttributeError:
                    pass
                except Exception as detect_excp:
                    # Detection is best-effort, so carry on without it, but do not
                    # swallow KeyboardInterrupt/SystemExit as a bare except would
                    vollog.debug("Content detection failed for {}: {}".format(
                        url, detect_excp))

                if detected:
                    if detected.mime_type == 'application/x-xz':
                        curfile = lzma.LZMAFile(curfile, mode)
                    elif detected.mime_type == 'application/x-bzip2':
                        curfile = bz2.BZ2File(curfile, mode)
                    elif detected.mime_type == 'application/x-gzip':
                        curfile = gzip.GzipFile(fileobj=curfile, mode=mode)
                    else:
                        break
                else:
                    break

                # Read and rewind to ensure we're inside any compressed file layers
                curfile.read(1)
                curfile.seek(0)
        if not IMPORTED_MAGIC:
            # Somewhat of a hack, but prevents a hard dependency on the magic module
            url_path = parsed_url.path
            while True:
                if url_path.endswith(".xz"):
                    curfile = lzma.LZMAFile(curfile, mode)
                elif url_path.endswith(".bz2"):
                    curfile = bz2.BZ2File(curfile, mode)
                elif url_path.endswith(".gz"):
                    curfile = gzip.GzipFile(fileobj=curfile, mode=mode)
                else:
                    break
                # Drop the extension just handled and look at the next one
                url_path = ".".join(url_path.split(".")[:-1])

        # Fallback in case the file doesn't exist
        if curfile is None:
            raise ValueError("URL does not reference an openable file")
        return curfile
Beispiel #7
0
else:
    print "[verbosity]\tVerbosity level '%s' not known. Setted to 'normal'".format(
        args.verbosity_level[0])
    settings.verbosity_level_numeric = 1
    print "verbosity level: ", settings.verbosity_level_numeric

for path, subdirs, files in os.walk(args.directory[0]):
    for name in files:
        if fnmatch(name, args.pattern[0]):
            total_files += 1  #Statistics

            try:
                ftype = magic.from_file(os.path.join(path, name), mime=True)
            except:
                f = open(os.path.join(path, name), "r")
                ftype = magic.detect_from_fobj(f).mime_type
                f.close()

            if ftype in type_stat:  #Statistics
                type_stat[ftype] += 1
            else:
                type_stat[ftype] = 1.0

            #Metadata research for PDF
            if ftype == 'application/pdf':
                if settings.verbosity_level_numeric > 0:
                    print "[analyzing]\t" + os.path.join(path, name) + " ..."

                current_book = {
                }  #A dictionary containing all the data collected for the book. __TODO__ use a f****n' database
                current_book['path'] = os.path.join(path, name)
Beispiel #8
0
 def test_detect_from_fobj(self):
     """Check detection of ``self.filename`` through an open file object."""
     with open(self.filename) as stream:
         detected = magic.detect_from_fobj(stream)
     self.assert_result(detected)