def correct_file_extension(file_bytes: bytes, target_path):
    """
    Give ``target_path`` the suffix that matches the real content of the file.

    The file type is detected from the byte content with ``filetype.guess``.
    Only a limited set of types can be recognised, see
    https://pypi.org/project/filetype/#supported-types

    Parameters
    ----------
    file_bytes : bytes
        Bytes read from the source file. This can be the entire file or a
        minimum of the first 261 bytes.
    target_path : Path
        Path object of the file whose extension should be corrected.

    Returns
    -------
    Path
        ``target_path`` with the detected extension applied as its suffix, or
        ``target_path`` unchanged when the file type could not be identified.
    """
    detected = filetype.guess(file_bytes)
    if not detected:
        # Unknown content: leave the path exactly as the caller passed it.
        return target_path
    return target_path.with_suffix(f".{detected.extension}")
def test_conversion(self):
    """Check that a video file is converted into something that is really an MP3."""
    source_video = os.path.join(video_dir, "potato.mp4")
    requested_output = os.path.join(temp_dir, "potato.mp3")
    potato_mp3_path = convert_video(source_video, requested_output)

    # The converted file has to be present on disk ...
    assert os.path.exists(potato_mp3_path)

    # ... and it has to be openable.
    with open(potato_mp3_path, 'r'):
        pass

    # Both the detected extension and the detected MIME type must say "MP3".
    detected = filetype.guess(potato_mp3_path)
    assert detected.extension == "mp3"
    assert detected.mime == "audio/mpeg"
def _validate_filetype(obj):
    """
    Checks for valid file type and returns extension and mime type.

    ``filetype.guess`` is tried first. When it cannot identify the input, the
    content is obtained (from ``obj.read()``, from the file at path ``obj``,
    or from ``obj`` itself, in that order) and matched against
    ``ImageRecord.SVG_PATTERN`` to detect SVG.

    Args:
        obj (str, bytes, bytearray or readable): Path to the file, a readable
            object or the file's content as str, bytes or bytearray.
            See filetype documentation:
            https://h2non.github.io/filetype.py/v1.0.0/filetype.m.html

    Returns:
        tuple: The file's extension and mime type.

    Raises:
        TypeError: Raised if type is invalid or could not be recognized.
    """
    try:
        ft = filetype.guess(obj)
    except IOError:
        ft = None

    if ft is not None:
        result = (ft.extension, ft.mime)
    else:
        # filetype could not identify the input: fall back to the SVG check.
        # Test for readable object first
        try:
            content = obj.read()
        except AttributeError:
            # Test for file path
            try:
                with open(obj) as f:
                    content = f.read()
            except Exception:
                # Deliberately broad: anything that cannot be opened/read as a
                # path is treated as the content itself (last option).
                content = obj

        # Convert to string
        if isinstance(content, (bytes, bytearray)):
            try:
                content = content.decode('utf-8')
            except UnicodeDecodeError as err:
                # Binary data that is not valid UTF-8 is simply an
                # unrecognized type; report it with the documented exception.
                raise TypeError('Unrecognized file type') from err

        # Check for SVG content
        if ImageRecord.SVG_PATTERN.search(content):
            result = ('svg', 'image/svg+xml')
        else:
            raise TypeError('Unrecognized file type')

    # Check for extension in valid file types
    if result[0] not in ImageRecord.VALID_FILE_TYPES:
        # Use result[0], not ft.extension: ft is None on the SVG path.
        raise TypeError('Invalid file type: {invalid}. Valid types are: {valid}.'.format(
            invalid=result[0],
            valid=ImageRecord.VALID_FILE_TYPES
        ))

    return result
def move_file_to_typed_direct(source_file, target_file, file_type):
    """
    Move every file of one detected type out of a directory.

    The direct children of ``source_file`` are inspected (no recursion). Each
    regular file whose content is identified by ``filetype.guess`` as
    ``file_type`` is moved to ``target_file`` with ``shutil.move``.

    Args:
        source_file: Directory to scan. Despite the name, nothing is done when
            this is a regular file.
        target_file: Destination handed to ``shutil.move``; presumably an
            existing directory (verify against callers). Nothing is done when
            this is a regular file.
        file_type: Extension to match against the detected type's
            ``extension`` attribute.

    Returns:
        None
    """
    if os.path.isfile(target_file) or os.path.isfile(source_file):
        return

    for entry in os.listdir(source_file):
        # Join with the scanned directory itself. os.path.dirname(source_file)
        # is the *parent* directory unless the path ends with a separator, so
        # using it made the lookup miss every file.
        full_path = os.path.join(source_file, entry)
        if not os.path.isfile(full_path):
            continue
        kind = filetype.guess(full_path)
        if kind and kind.extension == file_type:
            shutil.move(full_path, target_file)
def file_extension_from_bytes(file_bytes: bytes) -> Optional[str]:
    """
    Detect the file type from byte content and return the matching suffix.

    Parameters
    ==========
    file_bytes : bytes
        Content passed to ``filetype.guess``.

    Returns
    =======
    str or None
        Str - suffix including the dot if the file is recognised, e.g. '.jpg'
        None - file type was not recognised
    """
    detected = filetype.guess(file_bytes)
    if not detected:
        return None
    return f".{detected.extension}"
def download_file(url: str, ideal_filename: str = None, out_dir: str = None, headers: List[Tuple[str, str]] = None,
                  with_progress_bar: bool = True, cache: bool = True,
                  duplicate_handler: DuplicateHandler = DuplicateHandler.FIND_VALID_FILE,
                  ignored_content_types: List[str] = None, max_filename_length=DEFAULT_MAX_FILENAME_LENGTH,
                  group_by: GroupByMapping = None) -> DownloadedFile:
    """
    Download ``url`` into a temporary file, derive a file name and move the file into place.

    The file name is determined in this order:
      1. the ``filename="..."`` part of the ``Content-Disposition`` header, if it has an extension;
      2. the last path component of the URL, if it ends with the extension that matches the
         content type (or the mimetype guessed from that name);
      3. an extension guessed from the content type, or from the file content via
         ``filetype.guess``, appended to the name found so far or to the temporary file's name;
      4. ``get_filename_from_url(url)``, falling back to the temporary file's base name.

    Parameters
    ----------
    url : str
        Address to download.
    ideal_filename : str, optional
        Preferred base name. Its own extension is dropped and the detected extension is used.
    out_dir : str, optional
        Target directory. When a name was derived and this is not given, the temporary
        file's directory is used.
    headers : list of (str, str), optional
        Passed to ``configure_urllib_opener``.
    with_progress_bar : bool
        Passed through to ``download_file_impl``.
    cache : bool
        When true and ``url`` is already in the download cache, the cached entry is returned.
    duplicate_handler : DuplicateHandler
        Passed through to ``move_file``.
    ignored_content_types : list of str, optional
        Passed to ``ignorable_content_type``; a match records the download as SKIPPED.
    max_filename_length
        Passed to ``shorten_file_name`` as ``max_length``.
    group_by : GroupByMapping, optional
        Maps an extension to a sub-directory of ``out_dir``; ``group_by.fail_dir`` is used
        for extensions it does not contain.

    Returns
    -------
    DownloadedFile
        The cached entry, the ``DownloadedFile`` returned directly by ``download_file_impl``,
        or the entry created by ``add_to_download_cache`` (FAIL / SKIPPED / success).
    """
    # The context manager closes (and thereby synchronizes) the shelf on every exit path;
    # previously the early returns below left it open.
    with shelve.open(CACHE_WEBSITE_LINKS_FILE, writeback=True) as download_cache:
        if cache and url in download_cache:
            return download_cache[url]

        configure_urllib_opener(headers)

        # NamedTemporaryFile accepts delete=False on every platform; TemporaryFile only does so
        # where it is an alias of NamedTemporaryFile and raises TypeError on POSIX. The handle is
        # closed right away: only the reserved path is needed.
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            filename = temp_file.name

        file_download = download_file_impl(url, filename, download_cache, with_progress_bar=with_progress_bar)
        if not file_download:
            return add_to_download_cache(download_cache, url, result=DownloadedFileResult.FAIL)
        if isinstance(file_download, DownloadedFile):
            return file_download
        if not file_download[0]:
            return add_to_download_cache(download_cache, url, result=DownloadedFileResult.FAIL)

        url, old_url, res_headers = file_download
        content_type = get_content_type_from_headers(res_headers)
        if ignorable_content_type(ignored_content_types, content_type):
            return add_to_download_cache(download_cache, url, result=DownloadedFileResult.SKIPPED)

        # 1. Name announced by the server.
        actual_name = None
        content_disposition = res_headers.get('Content-Disposition', failobj=None)
        if content_disposition:
            result = re.findall('filename="(.+)"', content_disposition)
            result = first_or_none(result)
            if get_file_extension(result):
                actual_name = result

        # 2. Name taken from the URL, accepted only if its extension agrees with the mimetype.
        potential_filename = os.path.basename(urlparse(url).path)
        if not actual_name and potential_filename:
            if content_type:
                potential_mimetype = content_type
            else:
                potential_mimetype = mimetypes_extended.guess_type(potential_filename)[0]
            if potential_mimetype:
                ext = mimetypes_extended.guess_extension(potential_mimetype, include_period=True)
                if ext and potential_filename.endswith(ext):
                    actual_name = potential_filename

        # 3. Still no name, or a name without extension: derive the extension.
        if not actual_name or not get_file_extension(actual_name):
            ext = None
            if content_type:
                ext = mimetypes_extended.guess_extension(content_type, include_period=True)
            if ext:
                if actual_name:
                    actual_name += ext
                else:
                    actual_name = get_filename(filename) + ext
            else:
                # Last resort: look at the downloaded bytes.
                kind = filetype.guess(filename)
                if kind:
                    actual_name = f'{get_filename(filename)}.{kind.extension}'

        if actual_name:
            if not out_dir:
                out_dir = str(Path(filename).parent)
            filename_split = split_filename(actual_name, include_ext_period=True)
            if ideal_filename and filename_split[1]:
                # Keep the caller's base name but trust the detected extension.
                ideal_filename = os.path.splitext(ideal_filename)[0]
                actual_name = ideal_filename + filename_split[1]
        else:
            # 4. Nothing could be derived from headers or content.
            actual_name = get_filename_from_url(url)
            if not actual_name:
                actual_name = os.path.basename(filename)

        out_path = join_path(out_dir, filename=actual_name)
        if out_dir and group_by:
            directory, filename_only, ext = split_path_components(out_path, fatal=False, include_ext_period=True)
            sub_dir = f'/{group_by.fail_dir}'
            if ext in group_by:
                sub_dir = f'/{group_by[ext]}'
            filename_with_ext = join_filename_with_ext(filename_only, ext)
            out_path = join_path(directory, sub_dir, filename=filename_with_ext)
        out_path = shorten_file_name(out_path, max_length=max_filename_length)

        filename = move_file(filename, out_path, make_dirs=True, duplicate_handler=duplicate_handler)
        return add_to_download_cache(download_cache, url, old_url, headers=res_headers, filename=filename)
def document_handler(update: Update, context: CallbackContext):
    """
    Strip metadata from a received document, cloak faces and send the result back.

    Steps performed:
    1. Download the document to "documents/image" (no extension);
    2. Guess the file type. If it can't be guessed, the file is removed and
       the handler stops;
    3. Check the MIME type against SUPPORTED_MIME_LIST. If it isn't
       supported, the file is removed and the handler stops;
    4. Remove metadata (``delete_metadata_from_png`` for "image/png",
       ``delete_metadata`` for everything else). If this fails for a PNG the
       original is removed and the handler stops;
    5. Remove the original document from the server;
    6. Apply the face hiding tool (fawkes);
    7. Send the cloaked file if one was produced, otherwise the
       metadata-free file;
    8. Remove the metadata-free and cloaked files from the server.
    """
    logger = getLogger()
    logger.info("document_handler started")
    file = context.bot.getFile(update.message.document.file_id)
    file.download("documents/image")

    logger.info("Guessing file type")
    kind = filetype.guess("documents/image")
    if kind is None:
        logger.error("Cannot guess file type!")
        update.message.reply_text(
            "Cannot guess file type. This file type not supported")
        logger.info("Preparing for file deletion from server (kind guess)")
        _remove_file_and_report(
            logger, update, "documents/image",
            success_reply="File successfully removed from server",
            failure_log="Can't remove file (kind guess)",
            failure_reply="Error at removing file from server")
        return

    logger.info("File MIME type: %s", kind.mime)
    if kind.mime not in SUPPORTED_MIME_LIST:
        update.message.reply_text("{} not supported!".format(kind.mime))
        logger.info("Removing file...")
        _remove_file_and_report(
            logger, update, "documents/image",
            success_reply="File successfully removed from server",
            failure_log="Can't remove file",
            failure_reply="Error at removing file from server")
        return

    logger.info("Metadata removing started")
    if kind.mime == "image/png":
        clean_image_path = "documents/clean_image.png"
        try:
            delete_metadata_from_png("documents/image")
            logger.info("Metadata was successfully deleted")
        except Exception as e:
            logger.error("Metadata wasn't deleted")
            logger.error(e.args)
            remove_original_doc_from_server(logger, update)
            update.message.reply_text(
                "Error at removing metadata from PNG file\nFile removed from server"
            )
            # Stop here: the user has been told the file is gone, and there is
            # no metadata-free image to cloak or send. Previously execution
            # fell through to a second removal and the cloaking pipeline.
            return
    else:
        clean_image_path = "documents/clean_image.jpg"
        delete_metadata("documents/image")
        logger.info("Metadata was successfully deleted")
        update.message.reply_text("Metadata was successfully deleted")

    remove_original_doc_from_server(logger, update)

    is_faces_found = _run_face_cloaking(logger, update)
    _send_result_and_cleanup(logger, update, context, clean_image_path, is_faces_found)


def _remove_file_and_report(logger, update, file_path, success_reply,
                            failure_log, failure_reply, success_log=None):
    """Remove ``file_path`` and report the outcome to the user and the log."""
    try:
        remove(file_path)
        update.message.reply_text(success_reply)
        if success_log is not None:
            logger.info(success_log)
    except Exception as e:
        # Deliberately broad: cleanup is best-effort and the user must always
        # get an answer, whatever went wrong.
        logger.error(failure_log)
        logger.error(e.args)
        update.message.reply_text(failure_reply)


def _run_face_cloaking(logger, update):
    """Run fawkes over "documents"; return True if a cloaked image was produced."""
    is_faces_found = False
    try:
        logger.info("Goes into fawkes section")
        update.message.reply_text("Applying face hider tools, wait...")
        call(["fawkes", "-d", "documents", "--mode", FAWKES_MODE])
        logger.info(result_of("ls documents"))
        # The presence of the cloaked output file is what signals "faces found".
        if path.exists("documents/clean_image_cloaked.png"):
            is_faces_found = True
        logger.info("Does faces found?: %s", is_faces_found)
        logger.info("fawkes try-catch finished")
    except Exception as e:
        logger.error("EXCEPTION at fawkes section")
        logger.error(e.args)
        update.message.reply_text("Error at hiding faces")
    return is_faces_found


def _send_result_and_cleanup(logger, update, context, clean_image_path, is_faces_found):
    """Send the cloaked (or, without faces, the clean) file and remove the produced files."""
    if is_faces_found:
        logger.info("Preparing for sending cloaked file")
        send_file(logger, update, context, "cloaked")

        logger.info("Preparing for clean file deletion on server")
        _remove_file_and_report(
            logger, update, clean_image_path,
            success_reply="Clean version of file successfully removed from server",
            failure_log="Can't remove clean version of file",
            failure_reply="Error at removing clean version of file from server",
            success_log="Clean version of file successfully removed")

        logger.info("Preparing for cloaked photo deletion on server")
        _remove_file_and_report(
            logger, update, "documents/clean_image_cloaked.png",
            success_reply="Cloaked file successfully removed from server",
            failure_log="Can't remove cloaked file",
            failure_reply="Error at removing cloaked file from server",
            success_log="Cloaked file successfully removed")
    else:
        logger.info("No faces found")
        update.message.reply_text("Can't find any faces")

        logger.info("Preparing for sending photo without metadata")
        send_file(logger, update, context, "clean")
        _remove_file_and_report(
            logger, update, clean_image_path,
            success_reply="File without metadata successfully removed from server",
            failure_log="Can't remove file without metadata",
            failure_reply="Error at removing file without metadata from server",
            success_log="File without metadata successfully removed")
def does_file_mime_has(file, mime_keyword):
    """
    Tell whether the detected MIME type of ``file`` contains ``mime_keyword``.

    Returns the falsy result of ``filetype.guess`` (``None``) when the type
    could not be detected, otherwise a bool.
    """
    detected = filetype.guess(file)
    if not detected:
        # Hand back the falsy guess itself rather than a bool.
        return detected
    return mime_keyword in detected.mime
def get_filetype(cls, url):
    """
    Guess the file type of the resource behind ``url``.

    Only the first 261 bytes of the response are read and handed to
    ``filetype.guess``, whose result is returned as is.
    """
    with urllib.request.urlopen(url) as response:
        leading_bytes = response.read(261)
        return filetype.guess(leading_bytes)