def correct_file_extension(file_bytes: bytes, target_path):
    """
    Return ``target_path`` with its suffix replaced by the extension detected from the file content.

    The file type is detected with ``filetype.guess``.  Only a limited set of types can be
    recognised.  See https://pypi.org/project/filetype/#supported-types

    Parameters
    ----------
    file_bytes : bytes
        Bytes read from the source file; either the entire file or at least its first 261 bytes.
    target_path : Path
        Path object whose suffix should be replaced by the detected one.

    Returns
    -------
    Path
        ``target_path`` carrying the detected suffix when the type was recognised, otherwise
        ``target_path`` exactly as it was passed in.

    """
    detected = filetype.guess(file_bytes)
    if not detected:
        # Unknown content: keep whatever suffix the caller chose.
        return target_path
    return target_path.with_suffix("." + detected.extension)
Ejemplo n.º 2
0
    def test_conversion(self):
        """Check that converting a video file yields a file that is recognised as an MP3."""
        source_video = os.path.join(video_dir, "potato.mp4")
        target_audio = os.path.join(temp_dir, "potato.mp3")
        potato_mp3_path = convert_video(source_video, target_audio)

        # The converted file must be present on disk.
        assert os.path.exists(potato_mp3_path)

        # Opening it for reading must not fail.
        with open(potato_mp3_path, 'r'):
            pass

        # Content sniffing must report the MP3 extension ...
        assert filetype.guess(potato_mp3_path).extension == "mp3"

        # ... and the matching MIME type.
        assert filetype.guess(potato_mp3_path).mime == "audio/mpeg"
Ejemplo n.º 3
0
    def _validate_filetype(obj):
        """
        Checks for valid file type and returns extension and mime type.

        The type is first detected with ``filetype.guess``. If that yields nothing, the content
        is obtained (from a readable object, a file path, or the input itself) and matched
        against ``ImageRecord.SVG_PATTERN`` to detect SVG.

        Args:
            obj (str, bytes, bytearray or readable): Path to the file, a readable object or the file's content
                as str, bytes or bytearray. See filetype documentation:
                https://h2non.github.io/filetype.py/v1.0.0/filetype.m.html

        Returns:
            tuple: The file's extension and mime type.

        Raises:
            TypeError: Raised if type is invalid or could not be recognized.
        """

        try:
            ft = filetype.guess(obj)
        except IOError:
            ft = None

        if ft is not None:
            result = (ft.extension, ft.mime)
        else:
            # filetype.guess found nothing: fall back to looking for SVG markup.

            # Test for readable object first
            try:
                content = obj.read()
            except AttributeError:
                # Test for file path
                try:
                    with open(obj) as f:
                        content = f.read()
                except Exception:
                    # Use input as content as last option
                    content = obj

            # Convert to string
            if isinstance(content, (bytes, bytearray)):
                try:
                    content = content.decode('utf-8')
                except UnicodeDecodeError:
                    # Binary content that is not valid UTF-8 cannot be SVG; report it with
                    # the documented exception type instead of leaking a decode error.
                    raise TypeError('Unrecognized file type')

            # Check for SVG content
            if not ImageRecord.SVG_PATTERN.search(content):
                raise TypeError('Unrecognized file type')
            result = ('svg', 'image/svg+xml')

        # Check for extension in valid file types
        if result[0] not in ImageRecord.VALID_FILE_TYPES:
            # Use result[0] rather than ft.extension: ft is None on the SVG fallback path.
            raise TypeError('Invalid file type: {invalid}. Valid types are: {valid}.'.format(
                invalid=result[0],
                valid=ImageRecord.VALID_FILE_TYPES
            ))

        return result
Ejemplo n.º 4
0
def move_file_to_typed_direct(source_file, target_file, file_type):
    """
    Move every file of a given detected type out of a directory.

    Each regular file directly inside ``source_file`` is inspected with ``filetype.guess``;
    files whose detected extension equals ``file_type`` are moved to ``target_file`` with
    ``shutil.move``.  Sub-directories are not descended into.

    Nothing happens when either ``source_file`` or ``target_file`` is an existing regular file.

    Parameters
    ----------
    source_file : str
        Directory whose direct entries are examined.
    target_file : str
        Destination handed to ``shutil.move`` for every matching file.
        NOTE(review): presumably an existing directory; if it does not exist, successive
        moves would target the same file name -- confirm against callers.
    file_type : str
        Extension (without dot) as reported by ``filetype``, e.g. ``'jpg'``.
    """
    if os.path.isfile(target_file) or os.path.isfile(source_file):
        return

    for entry_name in os.listdir(source_file):
        # Entries returned by listdir live inside source_file itself; joining them onto
        # dirname(source_file) pointed at the parent directory unless the path happened
        # to end with a separator.
        full_path = os.path.join(source_file, entry_name)
        if not os.path.isfile(full_path):
            continue
        kind = filetype.guess(full_path)
        if kind and kind.extension == file_type:
            shutil.move(full_path, target_file)
def file_extension_from_bytes(file_bytes: bytes) -> Optional[str]:
    """
    Detect the file type from its byte content and return the matching suffix.

    Parameters
    ==========
    file_bytes : bytes
        Content (or leading bytes) of the file, handed to ``filetype.guess``.

    Returns
    =======
    str or None
        Str - suffix including the dot if the file is recognised, e.g. '.jpg'
        None - File type was not recognised

    """
    detected = filetype.guess(file_bytes)
    if not detected:
        return None
    return "." + detected.extension
Ejemplo n.º 6
0
def download_file(url: str, ideal_filename: str = None, out_dir: str = None, headers: List[Tuple[str, str]] = None, with_progress_bar: bool = True,
                  cache: bool = True, duplicate_handler: DuplicateHandler = DuplicateHandler.FIND_VALID_FILE, ignored_content_types: List[str] = None,
                  max_filename_length=DEFAULT_MAX_FILENAME_LENGTH, group_by: GroupByMapping = None) -> DownloadedFile:
    """
    Download ``url`` into a temporary file, work out a sensible file name and move it into place.

    The final name is chosen in this order:

    1. the ``filename="..."`` part of the ``Content-Disposition`` response header, if it has an extension;
    2. the last path component of the URL, if its suffix matches the extension guessed for the mime type;
    3. an extension guessed from the ``Content-Type``, or failing that from the file content
       via ``filetype.guess``, appended to the name found so far or to the temporary file's name;
    4. ``get_filename_from_url(url)`` or the temporary file's base name as a last resort.

    When a name with an extension was found and ``ideal_filename`` is given, the stem of
    ``ideal_filename`` replaces the stem of that name.

    Args:
        url: Address to download.
        ideal_filename: Preferred file name; its extension is replaced by the detected one.
        out_dir: Target directory. Defaults to the temporary file's directory when a name was determined.
        headers: Header pairs handed to ``configure_urllib_opener``.
        with_progress_bar: Forwarded to ``download_file_impl``.
        cache: When true, a result already stored for ``url`` in the shelve cache is returned directly.
        duplicate_handler: Forwarded to ``move_file``.
        ignored_content_types: Content types for which the download is recorded as SKIPPED.
        max_filename_length: Forwarded to ``shorten_file_name``.
        group_by: Mapping from extension to sub-directory; ``group_by.fail_dir`` is used for
            extensions not in the mapping. Only applied when ``out_dir`` is set.

    Returns:
        The ``DownloadedFile`` produced by ``add_to_download_cache`` (or by ``download_file_impl`` /
        the cache when they already hold one).
    """
    download_cache = shelve.open(CACHE_WEBSITE_LINKS_FILE, writeback=True)

    # The shelf is opened with writeback=True, so entries are only flushed on close/sync.
    # Closing in ``finally`` covers every early return, not just the success path.
    try:
        if cache and url in download_cache:
            return download_cache[url]

        configure_urllib_opener(headers)

        # TemporaryFile() has no ``delete`` parameter (and no usable path name) on POSIX;
        # NamedTemporaryFile(delete=False) reserves a real path that survives the close.
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            filename = temp_file.name
        file_download = download_file_impl(url, filename, download_cache, with_progress_bar=with_progress_bar)

        if not file_download:
            return add_to_download_cache(download_cache, url, result=DownloadedFileResult.FAIL)

        if isinstance(file_download, DownloadedFile):
            return file_download

        if not file_download[0]:
            return add_to_download_cache(download_cache, url, result=DownloadedFileResult.FAIL)

        url, old_url, res_headers = file_download

        content_type = get_content_type_from_headers(res_headers)
        if ignorable_content_type(ignored_content_types, content_type):
            return add_to_download_cache(download_cache, url, result=DownloadedFileResult.SKIPPED)

        # 1. Name announced by the server.
        actual_name = None
        content_disposition = res_headers.get('Content-Disposition', failobj=None)
        if content_disposition:
            result = re.findall('filename="(.+)"', content_disposition)
            result = first_or_none(result)

            # ``result`` is None when the header carries no quoted filename.
            if result and get_file_extension(result):
                actual_name = result

        # 2. Name taken from the URL path, accepted only if its suffix fits the mime type.
        potential_filename = os.path.basename(urlparse(url).path)
        if not actual_name and potential_filename:
            if content_type:
                potential_mimetype = content_type
            else:
                potential_mimetype = mimetypes_extended.guess_type(potential_filename)[0]

            if potential_mimetype:
                ext = mimetypes_extended.guess_extension(potential_mimetype, include_period=True)

                if ext and potential_filename.endswith(ext):
                    actual_name = potential_filename

        # 3. Still no usable extension: derive one from the content type or the content itself.
        if not actual_name or not get_file_extension(actual_name):
            ext = None
            if content_type:
                ext = mimetypes_extended.guess_extension(content_type, include_period=True)

            if ext:
                if actual_name:
                    actual_name += ext
                else:
                    actual_name = get_filename(filename) + ext
            else:
                kind = filetype.guess(filename)

                if kind:
                    actual_name = f'{get_filename(filename)}.{kind.extension}'

        if actual_name:
            if not out_dir:
                out_dir = str(Path(filename).parent)

            filename_split = split_filename(actual_name, include_ext_period=True)

            if ideal_filename and filename_split[1]:
                ideal_filename = os.path.splitext(ideal_filename)[0]

                actual_name = ideal_filename + filename_split[1]
        else:
            # 4. Last resort.
            actual_name = get_filename_from_url(url)

            if not actual_name:
                actual_name = os.path.basename(filename)

        out_path = join_path(out_dir, filename=actual_name)

        if out_dir and group_by:
            directory, filename_only, ext = split_path_components(out_path, fatal=False, include_ext_period=True)

            sub_dir = f'/{group_by.fail_dir}'
            if ext in group_by:
                sub_dir = f'/{group_by[ext]}'

            filename_with_ext = join_filename_with_ext(filename_only, ext)
            out_path = join_path(directory, sub_dir, filename=filename_with_ext)

        out_path = shorten_file_name(out_path, max_length=max_filename_length)
        filename = move_file(filename, out_path, make_dirs=True, duplicate_handler=duplicate_handler)
        return add_to_download_cache(download_cache, url, old_url, headers=res_headers, filename=filename)
    finally:
        download_cache.close()  # synchronizes automatically
Ejemplo n.º 7
0
def _remove_with_feedback(logger, update, file_path, success_reply, failure_log, failure_reply, success_log=None):
    """Delete ``file_path`` and report the outcome to the user and to the log."""
    try:
        remove(file_path)
        update.message.reply_text(success_reply)
        if success_log is not None:
            logger.info(success_log)
    except Exception as e:
        logger.error(failure_log)
        logger.error(e.args)
        update.message.reply_text(failure_reply)


def _cloak_and_send(logger, update, context, clean_file_path):
    """Run fawkes over ``documents``, send the cloaked (or clean) file, then delete the produced files."""
    is_faces_found = False

    try:
        logger.info("Goes into fawkes section")
        update.message.reply_text("Applying face hider tools, wait...")
        call(["fawkes", "-d", "documents", "--mode", FAWKES_MODE])
        logger.info(result_of("ls documents"))
        # The presence of the cloaked output file is used as the "faces were found" signal.
        if path.exists("documents/clean_image_cloaked.png"):
            is_faces_found = True

        logger.info("Does faces found?: %s", is_faces_found)
        logger.info("fawkes try-catch finished")

    except Exception as e:
        logger.error("EXCEPTION at fawkes section")
        logger.error(e.args)
        update.message.reply_text("Error at hiding faces")

    if is_faces_found:
        logger.info("Preparing for sending cloaked file")

        send_file(logger, update, context, "cloaked")

        logger.info("Preparing for clean file deletion on server")
        _remove_with_feedback(
            logger, update, clean_file_path,
            "Clean version of file successfully removed from server",
            "Can't remove clean version of file",
            "Error at removing clean version of file from server",
            success_log="Clean version of file successfully removed",
        )

        logger.info("Preparing for cloaked photo deletion on server")
        _remove_with_feedback(
            logger, update, "documents/clean_image_cloaked.png",
            "Cloaked file successfully removed from server",
            "Can't remove cloaked file",
            "Error at removing cloaked file from server",
            success_log="Cloaked file successfully removed",
        )
    else:
        logger.info("No faces found")
        update.message.reply_text("Can't find any faces")
        logger.info("Preparing for sending photo without metadata")

        send_file(logger, update, context, "clean")

        _remove_with_feedback(
            logger, update, clean_file_path,
            "File without metadata successfully removed from server",
            "Can't remove file without metadata",
            "Error at removing file without metadata from server",
            success_log="File without metadata successfully removed",
        )


def document_handler(update: Update, context: CallbackContext):
    """
        Handle a document sent to the bot. Steps:
        1. Download the document (without extension) to 'documents/image';
        2. Guess the file type. If it cannot be guessed, the file is removed from the server;
        3. Check that the MIME type is in SUPPORTED_MIME_LIST. If not, the file is removed from the server;
        4. Remove metadata from the document (PNG files use a dedicated routine);
        5. Remove the original document and apply the face hiding tool (fawkes);
        6. Send the cloaked file, or the metadata-free file when no cloaked output was produced;
        7. Remove the metadata-free and cloaked files from the server.

        If metadata removal fails for a PNG file, the original is removed and processing stops.
        """
    logger = getLogger()

    logger.info("document_handler started")
    telegram_file = context.bot.getFile(update.message.document.file_id)
    telegram_file.download("documents/image")

    logger.info("Guessing file type")
    kind = filetype.guess("documents/image")
    if kind is None:
        logger.error("Cannot guess file type!")
        update.message.reply_text(
            "Cannot guess file type. This file type not supported")

        logger.info("Preparing for file deletion from server (kind guess)")
        _remove_with_feedback(
            logger, update, "documents/image",
            "File successfully removed from server",
            "Can't remove file (kind guess)",
            "Error at removing file from server",
        )
        return

    logger.info("File MIME type: %s", kind.mime)

    if kind.mime not in SUPPORTED_MIME_LIST:
        update.message.reply_text("{} not supported!".format(kind.mime))
        logger.info("Removing file...")
        _remove_with_feedback(
            logger, update, "documents/image",
            "File successfully removed from server",
            "Can't remove file",
            "Error at removing file from server",
        )
        return

    logger.info("Metadata removing started")
    if kind.mime == "image/png":
        try:
            delete_metadata_from_png("documents/image")
            logger.info("Metadata was successfully deleted")
        except Exception as e:
            logger.error("Metadata wasn't deleted")
            logger.error(e.args)
            remove_original_doc_from_server(logger, update)
            update.message.reply_text(
                "Error at removing metadata from PNG file\nFile removed from server"
            )
            # Stop here: the user was told the file is gone, and there is no clean
            # file to cloak or send. Falling through would remove the original twice.
            return
        # NOTE(review): presumably the file written by delete_metadata_from_png -- confirm.
        clean_file_path = "documents/clean_image.png"
    else:
        delete_metadata("documents/image")
        logger.info("Metadata was successfully deleted")
        update.message.reply_text("Metadata was successfully deleted")
        # NOTE(review): presumably the file written by delete_metadata -- confirm.
        clean_file_path = "documents/clean_image.jpg"

    remove_original_doc_from_server(logger, update)
    _cloak_and_send(logger, update, context, clean_file_path)
Ejemplo n.º 8
0
def does_file_mime_has(file, mime_keyword):
    """
    Tell whether the MIME type detected for ``file`` contains ``mime_keyword``.

    Returns the falsy result of ``filetype.guess`` itself (``None``) when the type is not
    recognised, otherwise ``True``/``False`` for the substring test on the MIME type.
    """
    detected = filetype.guess(file)
    if not detected:
        return detected
    return mime_keyword in detected.mime
Ejemplo n.º 9
0
 def get_filetype(cls, url):
     """Fetch the first 261 bytes from ``url`` and return what ``filetype.guess`` makes of them."""
     with urllib.request.urlopen(url) as response:
         leading_bytes = response.read(261)
     return filetype.guess(leading_bytes)