def sort_file(source_path: str, destination_path: str, start_count: int, file_name_length: int):
    """Rename every file in a directory with sequential zero-padded numbers and copy them elsewhere.

    :Args:
    - source_path - directory holding the files to sort
    - destination_path - directory the renamed copies are written to
    - start_count - numbering starts at start_count + 1
    - file_name_length - zero-padded width of the new file names
    """
    file_list = path.get_dir_files_name(source_path, path.RETURN_FILE_LIST_DESC)
    if file_list:
        # abort when the destination directory cannot be created
        if not path.create_dir(destination_path):
            return False
        # files are listed in descending order; number them sequentially
        for offset, file_name in enumerate(file_list, start=1):
            extension = os.path.splitext(file_name)[1]  # includes the leading dot
            new_file_name = (f"%0{file_name_length}d" % (start_count + offset)) + extension
            path.copy_file(os.path.join(source_path, file_name), os.path.join(destination_path, new_file_name))
    # remove the (now processed) temporary source directory
    path.delete_dir_or_file(source_path)
    return True
def start_download(self): """ 主体下载逻辑 """ # 默认读取配置 if not isinstance(self.replace_if_exist, bool): self.replace_if_exist = DOWNLOAD_REPLACE_IF_EXIST # 同名文件已经存在,直接返回 if not self.replace_if_exist and os.path.exists( self.file_path) and os.path.getsize(self.file_path) > 0: output.print_msg(f"文件{self.file_path}({self.file_url})已存在,跳过") self.status = self.DOWNLOAD_SUCCEED return # 判断保存目录是否存在 if not path.create_dir(os.path.dirname(self.file_path)): self.code = self.CODE_FILE_CREATE_FAILED return # 是否需要分段下载 self.check_auto_multipart_download() # 下载 for retry_count in range(0, NET_CONFIG["DOWNLOAD_RETRY_COUNT"]): if EXIT_FLAG: self.code = self.CODE_PROCESS_EXIT break if not self.is_multipart_download: # 单线程下载 if not self.single_download(): continue else: # 分段下载 if not self.multipart_download(): continue # 如果没有返回文件的长度,直接下载成功 if self.content_length == 0: self.status = self.DOWNLOAD_SUCCEED self.code = 0 return # 判断文件下载后的大小和response中的Content-Length是否一致 file_size = os.path.getsize(self.file_path) if self.content_length == file_size: self.status = self.DOWNLOAD_SUCCEED self.code = 0 return else: self.code = self.CODE_FILE_SIZE_INVALID output.print_msg( f"本地文件{self.file_path}:{self.content_length}和网络文件{self.file_url}:{file_size}不一致" ) time.sleep(NET_CONFIG["HTTP_REQUEST_RETRY_WAIT_TIME"]) # 删除可能出现的临时文件 path.delete_dir_or_file(self.file_path)
def rewrite_save_file(self):
    """Sort the temporary save file by primary key and write it back to the main save file.

    Only supports save files with one record per line where every record
    shares the same format. Does nothing when no temp save file is configured.
    """
    if not self.temp_save_data_path:
        return
    save_data = read_save_data(self.temp_save_data_path, 0, [])
    sorted_rows = [save_data[primary_key] for primary_key in sorted(save_data)]
    file.write_file(tool.list_to_string(sorted_rows), self.save_data_path, file.WRITE_FILE_TYPE_REPLACE)
    # the temp file has been merged; remove it
    path.delete_dir_or_file(self.temp_save_data_path)
def run(self):
    """Thread entry point: run the crawler body, then flush state back to the main thread."""
    try:
        self._run()
    except KeyboardInterrupt:
        self.step("提前退出")
    except SystemExit as e:
        # distinguish an error exit from a deliberate early exit
        if e.code == tool.PROCESS_EXIT_CODE_ERROR:
            self.error("异常退出")
        else:
            self.step("提前退出")
    except Exception as e:
        self.error("未知异常")
        self.error(str(e) + "\n" + traceback.format_exc(), False)
    # remove this record's key from the main thread's pending registry
    if self.index_key:
        self.main_thread.save_data.pop(self.index_key)
    # append this record to the temp save file (lock guards concurrent writers)
    if self.single_save_data and self.main_thread.temp_save_data_path:
        with self.thread_lock:
            file.write_file("\t".join(self.single_save_data), self.main_thread.temp_save_data_path, file.WRITE_FILE_TYPE_APPEND)
    # accumulate per-thread counters into the main thread's totals
    # NOTE(review): these additions are outside thread_lock — presumably safe
    # under the GIL for += on ints, but confirm against the main thread's usage
    if self.main_thread.is_download_photo:
        self.main_thread.total_photo_count += self.total_photo_count
    if self.main_thread.is_download_video:
        self.main_thread.total_video_count += self.total_video_count
    if self.main_thread.is_download_audio:
        self.main_thread.total_audio_count += self.total_audio_count
    # clean up temporary files (partially downloaded content)
    for temp_path in self.temp_path_list:
        path.delete_dir_or_file(temp_path)
    # log a per-thread download summary
    message = "下载完毕"
    download_result = []
    if self.main_thread.is_download_photo:
        download_result.append(f"图片{self.total_photo_count}张")
    if self.main_thread.is_download_video:
        download_result.append(f"视频{self.total_video_count}个")
    if self.main_thread.is_download_audio:
        download_result.append(f"音频{self.total_audio_count}个")
    if download_result:
        message += ",共计下载" + ",".join(download_result)
    self.step(message)
    # wake the main thread so it can schedule the next task
    self.notify_main_thread()
def save_net_file_list(file_url_list, file_path, header_list=None, cookies_list=None): """Visit web and save to local(multiple remote resource, single local file) :param file_url_list: the list of remote resource URL which you want to save :param file_path: the local file path which you want to save remote resource :param header_list: customize header dictionary :param cookies_list: customize cookies dictionary, will replaced header_list["Cookie"] :return: status 0 download failure, 1 download successful code failure reason """ # 判断保存目录是否存在 if not path.create_dir(os.path.dirname(file_path)): return False for retry_count in range(0, NET_CONFIG["DOWNLOAD_RETRY_COUNT"]): # 下载 with open(file_path, "wb") as file_handle: for file_url in file_url_list: response = http_request( file_url, header_list=header_list, cookies_list=cookies_list, connection_timeout=NET_CONFIG[ "DOWNLOAD_CONNECTION_TIMEOUT"], read_timeout=NET_CONFIG["DOWNLOAD_READ_TIMEOUT"]) if response.status == HTTP_RETURN_CODE_SUCCEED: file_handle.write(response.data) # 超过重试次数,直接退出 elif response.status == HTTP_RETURN_CODE_RETRY: file_handle.close() path.delete_dir_or_file(file_path) return {"status": 0, "code": -2} # 其他http code,退出 else: file_handle.close() path.delete_dir_or_file(file_path) return {"status": 0, "code": response.status} return {"status": 1, "code": 0} # path.delete_dir_or_file(file_path) return {"status": 0, "code": -2}
def download_from_list(file_url_list, file_path, replace_if_exist=False, **kwargs):
    """Download several remote resources and concatenate them into one local file.

    :Args:
    - file_url_list - the list of remote resource URLs which you want to save
    - file_path - the local file path which you want to save remote resources to
    - replace_if_exist - skip the download when the target file already exists

    :Returns:
    True when every part downloaded and the merged file was written, else False.
    """
    # a non-empty file with the same name already exists: skip
    if not replace_if_exist and os.path.exists(
            file_path) and os.path.getsize(file_path) > 0:
        return True
    part_path_list = []
    all_parts_done = True
    for part_index, file_url in enumerate(file_url_list, start=1):
        # temporary per-part file path
        part_path = f"{file_path}.part{part_index}"
        # a leftover part file means a previous run is in progress/broken: abort
        if os.path.exists(os.path.realpath(part_path)):
            all_parts_done = False
            break
        part_path_list.append(part_path)
        # download this part
        part_result = Download(file_url, part_path, replace_if_exist=replace_if_exist, **kwargs)
        if part_result.status == Download.DOWNLOAD_FAILED:
            all_parts_done = False
            break
    is_succeed = False
    if all_parts_done:
        # every part downloaded: merge them in order into the final file
        with open(file_path, "wb") as merged_handle:
            for part_path in part_path_list:
                with open(part_path, "rb") as part_handle:
                    merged_handle.write(part_handle.read())
        is_succeed = True
    # clean up the temporary part files
    for part_path in part_path_list:
        path.delete_dir_or_file(part_path)
    return is_succeed
def rewrite_save_file(temp_save_data_path: str, save_data_path: str):
    """Sort the temporary save file by primary key and write it back to the main save file.

    Deprecated. Only supports save files with one record per line where every
    record shares the same format.
    """
    warnings.warn(
        "rewrite_save_file commands are deprecated.",
        DeprecationWarning,
        stacklevel=2,
    )
    save_data = read_save_data(temp_save_data_path, 0, [])
    ordered_rows = [save_data[primary_key] for primary_key in sorted(save_data)]
    file.write_file(tool.list_to_string(ordered_rows), save_data_path, file.WRITE_FILE_TYPE_REPLACE)
    # the temp file has been merged; remove it
    path.delete_dir_or_file(temp_save_data_path)
def sort_file(source_path, destination_path, start_count, file_name_length):
    """Rename every file in a directory with sequential zero-padded numbers and copy them elsewhere.

    Numbering starts at start_count + 1; new names are zero-padded to
    file_name_length digits and keep each file's original extension.
    Returns False when the destination directory cannot be created.
    """
    file_list = path.get_dir_files_name(source_path, path.RETURN_FILE_LIST_DESC)
    if file_list:
        # abort when the destination directory cannot be created
        if not path.create_dir(destination_path):
            return False
        # files are listed in descending order; number them sequentially
        for offset, file_name in enumerate(file_list, start=1):
            extension = os.path.splitext(file_name)[1]  # includes the leading dot
            new_file_name = (f"%0{file_name_length}d" % (start_count + offset)) + extension
            path.copy_files(os.path.join(source_path, file_name), os.path.join(destination_path, new_file_name))
    # remove the (now processed) temporary source directory
    path.delete_dir_or_file(source_path)
    return True
def save_net_file(file_url, file_path, need_content_type=False, header_list=None, cookies_list=None, head_check=True):
    """Visit web and save to local.

    :param file_url: the remote resource URL which you want to save
    :param file_path: the local file path which you want to save remote resource to
    :param need_content_type: auto-rename the file according to "Content-Type" in the response headers
    :param header_list: customized header dictionary
    :param cookies_list: customized cookies dictionary, replaces header_list["Cookie"]
    :param head_check: send a "HEAD" request to check response status and file size before downloading
    :return:
        status 0 download failure, 1 download successful
        code failure reason
        file_path final local file path (when need_content_type is True, it may be renamed)
    """
    file_path = path.change_path_encoding(file_path)
    # make sure the destination directory exists
    # NOTE(review): returns bool False here but dicts elsewhere — confirm callers
    if not path.create_dir(os.path.dirname(file_path)):
        return False
    create_file = False
    for retry_count in range(0, HTTP_DOWNLOAD_RETRY_COUNT):
        if head_check:
            request_method = "HEAD"
        else:
            request_method = "GET"
        # fetch the headers (and, for GET, the body)
        response = http_request(file_url, request_method, header_list=header_list, cookies_list=cookies_list, connection_timeout=HTTP_CONNECTION_TIMEOUT, read_timeout=HTTP_READ_TIMEOUT)
        if response.status == HTTP_RETURN_CODE_SUCCEED:
            # todo multipart download
            # reject files that are too large
            content_length = response.getheader("Content-Length")
            if content_length is not None and int(content_length) > HTTP_DOWNLOAD_MAX_SIZE:
                return {"status": 0, "code": -4}
            # use the response Content-Type as the file extension
            if need_content_type:
                content_type = response.getheader("Content-Type")
                if content_type is not None and content_type != "octet-stream":
                    if content_type == "video/quicktime":
                        new_file_type = "mov"
                    else:
                        new_file_type = content_type.split("/")[-1]
                    file_path = os.path.splitext(file_path)[0] + "." + new_file_type
            # when only HEAD was sent, re-request with GET for the full body
            if head_check:
                response = http_request(file_url, method="GET", header_list=header_list, cookies_list=cookies_list, connection_timeout=HTTP_DOWNLOAD_CONNECTION_TIMEOUT, read_timeout=HTTP_DOWNLOAD_READ_TIMEOUT)
                if response.status != HTTP_RETURN_CODE_SUCCEED:
                    continue
            # write the body to disk
            with open(file_path, "wb") as file_handle:
                file_handle.write(response.data)
            create_file = True
            # no Content-Length to verify against: treat as success
            if content_length is None:
                return {"status": 1, "code": 0, "file_path": file_path}
            # verify the downloaded size matches the response Content-Length
            file_size = os.path.getsize(file_path)
            if int(content_length) == file_size:
                return {"status": 1, "code": 0, "file_path": file_path}
            else:
                # NOTE(review): .encode("UTF-8") renders as b'...' on Python 3 —
                # looks like Python-2-era code; confirm the target runtime
                output.print_msg("本地文件%s:%s和网络文件%s:%s不一致" % (file_path.encode("UTF-8"), content_length, str(file_url), file_size))
        elif response.status == HTTP_RETURN_CODE_URL_INVALID:
            if create_file:
                path.delete_dir_or_file(file_path)
            return {"status": 0, "code": -1}
        # retry count exceeded inside http_request: give up
        elif response.status == HTTP_RETURN_CODE_RETRY:
            if create_file:
                path.delete_dir_or_file(file_path)
            return {"status": 0, "code": -2}
        # any other http code: give up
        else:
            if create_file:
                path.delete_dir_or_file(file_path)
            return {"status": 0, "code": response.status}
    # all retries exhausted: remove the partial file
    if create_file:
        path.delete_dir_or_file(file_path)
    return {"status": 0, "code": -3}
def save_net_file(file_url, file_path, need_content_type=False, head_check=False, **kwargs):
    """Visit web and save to local.

    :param file_url: the remote resource URL which you want to save
    :param file_path: the local file path which you want to save remote resource to
    :param need_content_type: auto-rename the file according to "Content-Type" in the response headers
    :param head_check: send a "HEAD" request to check response status and file size before downloading
    :return:
        status 0 download failure, 1 download successful
        code failure reason
        file_path final local file path (when need_content_type is True, it may be renamed)
    """
    # make sure the destination directory exists
    if not path.create_dir(os.path.dirname(file_path)):
        return False
    is_create_file = False
    is_multi_thread = False
    return_code = {"status": 0, "code": -3}
    for retry_count in range(0, NET_CONFIG["DOWNLOAD_RETRY_COUNT"]):
        # only the first attempt honors head_check; retries fetch the body directly
        if head_check and retry_count == 0:
            request_method = "HEAD"
        else:
            request_method = "GET"
        # fetch the headers (and, for GET, the body)
        response = http_request(file_url, request_method, connection_timeout=NET_CONFIG["HTTP_CONNECTION_TIMEOUT"], read_timeout=NET_CONFIG["HTTP_READ_TIMEOUT"], **kwargs)
        # any non-success status ends the retry loop
        if response.status != HTTP_RETURN_CODE_SUCCEED:
            # malformed URL
            if response.status == HTTP_RETURN_CODE_URL_INVALID:
                return_code = {"status": 0, "code": -1}
            # retry count exceeded inside http_request
            elif response.status == HTTP_RETURN_CODE_RETRY:
                return_code = {"status": 0, "code": -2}
            # any other http code
            else:
                return_code = {"status": 0, "code": response.status}
            break
        # reject files that are too large; decide on multi-threaded download
        content_length = response.getheader("Content-Length")
        if content_length is not None:
            content_length = int(content_length)
            # over the size limit
            if content_length > NET_CONFIG["DOWNLOAD_LIMIT_SIZE"]:
                return {"status": 0, "code": -4}
            # large file: multi-threaded download (only meaningful with
            # head_check=True, otherwise the whole body is already in memory)
            elif head_check and content_length > NET_CONFIG["DOWNLOAD_MULTI_THREAD_MIN_SIZE"]:
                is_multi_thread = True
        # use the response Content-Type as the file extension
        if need_content_type:
            content_type = response.getheader("Content-Type")
            if content_type is not None and content_type != "octet-stream":
                global MIME_DICTIONARY
                if MIME_DICTIONARY is None:
                    MIME_DICTIONARY = tool.json_decode(file.read_file(os.path.join(os.path.dirname(__file__), "mime.json")), {})
                if content_type in MIME_DICTIONARY:
                    new_file_type = MIME_DICTIONARY[content_type]
                else:
                    new_file_type = content_type.split("/")[-1]
                file_path = os.path.splitext(file_path)[0] + "." + new_file_type
        if not is_multi_thread:
            # single-threaded download
            # when only HEAD was sent, re-request with GET for the full body
            if head_check:
                response = http_request(file_url, method="GET", connection_timeout=NET_CONFIG["DOWNLOAD_CONNECTION_TIMEOUT"], read_timeout=NET_CONFIG["DOWNLOAD_READ_TIMEOUT"], **kwargs)
                if response.status != HTTP_RETURN_CODE_SUCCEED:
                    continue
            # write the body to disk
            with open(file_path, "wb") as file_handle:
                is_create_file = True
                try:
                    file_handle.write(response.data)
                except OSError as ose:
                    # BUGFIX: str.find() returns -1 (truthy) on a miss, so the
                    # original set EXIT_FLAG on every OSError; compare != -1
                    if str(ose).find("No space left on device") != -1:
                        global EXIT_FLAG
                        EXIT_FLAG = True
                    raise
        else:
            # multi-threaded download
            # per-thread block size: roughly 1/10 of the file, rounded up to 1MB
            multi_thread_block_size = int(math.ceil(content_length / 10 / SIZE_MB)) * SIZE_MB
            # BUGFIX: clamp between the MIN and MAX block sizes — the original
            # had min/max swapped (min(MIN, max(MAX, x))), which always
            # produced DOWNLOAD_MULTI_THREAD_MIN_BLOCK_SIZE
            multi_thread_block_size = min(NET_CONFIG["DOWNLOAD_MULTI_THREAD_MAX_BLOCK_SIZE"], max(NET_CONFIG["DOWNLOAD_MULTI_THREAD_MIN_BLOCK_SIZE"], multi_thread_block_size))
            # pre-create (truncate) the target file
            with open(file_path, "w"):
                is_create_file = True
            thread_list = []
            error_flag = []
            with open(file_path, "rb+") as file_handle:
                file_no = file_handle.fileno()
                end_pos = -1
                while end_pos < content_length - 1:
                    start_pos = end_pos + 1
                    end_pos = min(content_length - 1, start_pos + multi_thread_block_size - 1)
                    # duplicate the descriptor so each thread owns its own handle
                    fd_handle = os.fdopen(os.dup(file_no), "rb+", -1)
                    thread = MultiThreadDownload(file_url, start_pos, end_pos, fd_handle, error_flag)
                    thread.start()
                    thread_list.append(thread)
                # wait for every download thread to finish
                for thread in thread_list:
                    thread.join()
            # retry when any thread failed, or the file contains a long run of null bytes
            if len(error_flag) > 0:
                continue
            if not _check_multi_thread_download_file(file_path):
                output.print_msg("网络文件%s多线程下载后发现无效字节" % file_url)
                continue
        # no Content-Length to verify against: treat as success
        if content_length is None:
            return {"status": 1, "code": 0, "file_path": file_path}
        # verify the downloaded size matches the response Content-Length
        file_size = os.path.getsize(file_path)
        if content_length == file_size:
            return {"status": 1, "code": 0, "file_path": file_path}
        else:
            output.print_msg("本地文件%s:%s和网络文件%s:%s不一致" % (file_path, content_length, file_url, file_size))
            time.sleep(10)
    # all retries exhausted: remove the partial file
    if is_create_file:
        path.delete_dir_or_file(file_path)
    return return_code
def rewrite_save_file(temp_save_data_path, save_data_path):
    """Sort the temporary save file by primary key and write it back to the main save file."""
    save_data = read_save_data(temp_save_data_path, 0, [])
    ordered_rows = [save_data[primary_key] for primary_key in sorted(save_data)]
    tool.write_file(tool.list_to_string(ordered_rows), save_data_path, tool.WRITE_FILE_TYPE_REPLACE)
    # the temp file has been merged; remove it
    path.delete_dir_or_file(temp_save_data_path)
def clean_temp_path(self):
    """Delete every temporary file/directory recorded in self.temp_path_list."""
    for pending_path in self.temp_path_list:
        path.delete_dir_or_file(pending_path)
def download(file_url, file_path, recheck_file_extension=False, head_check=False, replace_if_exist: Optional[bool] = None, **kwargs):
    """Download a remote file to the local disk.

    :Args:
    - file_url - the remote resource URL which you want to save
    - file_path - the local file path which you want to save remote resource to
    - recheck_file_extension - auto-rename the file according to "Content-Type" in the response headers
    - head_check - send a "HEAD" request to check response status and file size before downloading
    - replace_if_exist - overwrite an existing file; defaults to net.DOWNLOAD_REPLACE_IF_EXIST

    :Returns:
    - status - 0 download failure, 1 download successful
    - code - failure reason
    - file_path - final local file path (when recheck_file_extension is True, it may be renamed)
    """
    # fall back to the global default when replace_if_exist was not given
    if not isinstance(replace_if_exist, bool):
        replace_if_exist = net.DOWNLOAD_REPLACE_IF_EXIST
    # a non-empty file with the same name already exists: skip
    if not replace_if_exist and os.path.exists(
            file_path) and os.path.getsize(file_path) > 0:
        output.print_msg(f"文件{file_path}({file_url})已存在,跳过")
        return {"status": 1, "code": 0, "file_path": file_path}
    # make sure the destination directory exists
    # BUGFIX: this is a failure, so status must be 0 (the original reported
    # status 1, i.e. success, together with failure code -11)
    if not path.create_dir(os.path.dirname(file_path)):
        return {"status": 0, "code": -11, "file_path": file_path}
    is_create_file = False
    is_multi_thread = False
    return_code = {"status": 0, "code": -3}
    for retry_count in range(0, net.NET_CONFIG["DOWNLOAD_RETRY_COUNT"]):
        # only the first attempt honors head_check; retries fetch the body directly
        if head_check and retry_count == 0:
            request_method = "HEAD"
        else:
            request_method = "GET"
        # fetch the headers (and, for GET, the body)
        response = net.request(
            file_url,
            request_method,
            is_check_qps=False,
            connection_timeout=net.NET_CONFIG["HTTP_CONNECTION_TIMEOUT"],
            read_timeout=net.NET_CONFIG["HTTP_READ_TIMEOUT"],
            **kwargs)
        # any non-success status ends the retry loop
        if response.status != net.HTTP_RETURN_CODE_SUCCEED:
            # malformed URL
            if response.status == net.HTTP_RETURN_CODE_URL_INVALID:
                return_code = {"status": 0, "code": -1}
            # retry count exceeded inside net.request
            elif response.status == net.HTTP_RETURN_CODE_RETRY:
                return_code = {"status": 0, "code": -2}
            # any other http code
            else:
                return_code = {"status": 0, "code": response.status}
            break
        # reject files that are too large; decide on multi-threaded download
        # (NET_CONFIG references normalized to net.NET_CONFIG for consistency
        # with the rest of this function)
        content_length = response.getheader("Content-Length")
        if content_length is not None:
            content_length = int(content_length)
            # over the size limit
            if content_length > net.NET_CONFIG["DOWNLOAD_LIMIT_SIZE"]:
                return {"status": 0, "code": -4}
            # large file: multi-threaded download (only meaningful with
            # head_check=True, otherwise the whole body is already in memory)
            elif head_check and content_length > net.NET_CONFIG["DOWNLOAD_MULTI_THREAD_MIN_SIZE"]:
                is_multi_thread = True
        # use the response Content-Type as the file extension
        if recheck_file_extension:
            content_type = response.getheader("Content-Type")
            if content_type is not None and content_type != "octet-stream":
                if net.MIME_DICTIONARY is None:
                    net.MIME_DICTIONARY = tool.json_decode(
                        file.read_file(
                            os.path.join(os.path.dirname(__file__), "mime.json")), {})
                if content_type in net.MIME_DICTIONARY:
                    new_file_extension = net.MIME_DICTIONARY[content_type]
                else:
                    new_file_extension = content_type.split("/")[-1]
                file_path = os.path.splitext(file_path)[0] + "." + new_file_extension
        if not is_multi_thread:
            # single-threaded download
            # when only HEAD was sent, re-request with GET for the full body
            if head_check:
                response = net.request(
                    file_url,
                    method="GET",
                    connection_timeout=net.NET_CONFIG["DOWNLOAD_CONNECTION_TIMEOUT"],
                    read_timeout=net.NET_CONFIG["DOWNLOAD_READ_TIMEOUT"],
                    **kwargs)
                if response.status != net.HTTP_RETURN_CODE_SUCCEED:
                    continue
            # write the body to disk
            with open(file_path, "wb") as file_handle:
                is_create_file = True
                try:
                    file_handle.write(response.data)
                except OSError as ose:
                    # disk full: flag the whole process to stop
                    if str(ose).find("No space left on device") != -1:
                        net.EXIT_FLAG = True
                    raise
        else:
            # multi-threaded download
            # pre-create (truncate) the target file
            with open(file_path, "w"):
                is_create_file = True
            thread_list = []
            error_flag = []
            with open(file_path, "rb+") as file_handle:
                file_no = file_handle.fileno()
                end_pos = -1
                while end_pos < content_length - 1:
                    start_pos = end_pos + 1
                    end_pos = min(
                        content_length - 1,
                        start_pos + net.NET_CONFIG["DOWNLOAD_MULTI_THREAD_BLOCK_SIZE"] - 1)
                    # duplicate the descriptor so each thread owns its own handle
                    fd_handle = os.fdopen(os.dup(file_no), "rb+", -1)
                    thread = MultiThreadDownload(file_url, start_pos, end_pos, fd_handle, error_flag)
                    thread.start()
                    thread_list.append(thread)
                # wait for every download thread to finish
                for thread in thread_list:
                    thread.join()
            # retry when any thread failed, or the file contains a long run of null bytes
            if len(error_flag) > 0:
                continue
            if not _check_multi_thread_download_file(file_path):
                output.print_msg(f"网络文件{file_url}多线程下载后发现无效字节")
                continue
        # no Content-Length to verify against: treat as success
        if content_length is None:
            return {"status": 1, "code": 0, "file_path": file_path}
        # verify the downloaded size matches the response Content-Length
        file_size = os.path.getsize(file_path)
        if content_length == file_size:
            return {"status": 1, "code": 0, "file_path": file_path}
        else:
            output.print_msg(
                f"本地文件{file_path}:{content_length}和网络文件{file_url}:{file_size}不一致"
            )
        time.sleep(net.NET_CONFIG["HTTP_REQUEST_RETRY_WAIT_TIME"])
    # all retries exhausted: remove the partial file
    if is_create_file:
        path.delete_dir_or_file(file_path)
    return return_code