def write_file(msg: str, file_path: str, append_type: int = WRITE_FILE_TYPE_APPEND, encoding: str = "UTF-8"):
    """
    Write one line of text to a local file, creating the parent directory as needed.

    :Args:
    - msg - the text to write; a trailing newline is always appended
    - file_path - path of the file to write
    - append_type - write mode
        WRITE_FILE_TYPE_APPEND      "a" mode to write file
        WRITE_FILE_TYPE_REPLACE     "w" mode to write file
    - encoding - text encoding used when opening the file

    :Returns:
        False when file_path is empty, the parent directory cannot be created,
        or append_type is not one of the two known modes; None after a
        successful write.
    """
    if not file_path:
        return False
    file_path = os.path.abspath(file_path)
    # make sure the parent directory exists before opening the file
    if not path.create_dir(os.path.dirname(file_path)):
        return False
    if append_type == WRITE_FILE_TYPE_APPEND:
        open_type = "a"
    elif append_type == WRITE_FILE_TYPE_REPLACE:
        open_type = "w"
    else:
        return False
    with open(file_path, open_type, encoding=encoding) as file_handle:
        file_handle.write(msg + "\n")
def write_file(msg, file_path, append_type=WRITE_FILE_TYPE_APPEND):
    """Append or overwrite one line of text in a local file.

    :param file_path: the path of file
    :param append_type: WRITE_FILE_TYPE_APPEND opens with "a",
        WRITE_FILE_TYPE_REPLACE opens with "w"
    :return: False when the parent directory cannot be created or append_type
        is unknown; None after a successful write.
    """
    file_path = os.path.abspath(file_path)
    # the parent directory must exist before the file can be opened
    if not path.create_dir(os.path.dirname(file_path)):
        return False
    # map the mode flag to the corresponding open() mode
    mode_by_type = {WRITE_FILE_TYPE_APPEND: "a", WRITE_FILE_TYPE_REPLACE: "w"}
    if append_type not in mode_by_type:
        return False
    with open(file_path, mode_by_type[append_type], encoding="UTF-8") as file_handle:
        file_handle.write(msg + "\n")
def sort_file(source_path: str, destination_path: str, start_count: int, file_name_length: int):
    """
    Rename every file inside a directory with sequential zero-padded numbers,
    copy the renamed files to another directory, then delete the source directory.

    :Args:
    - source_path - directory holding the files to sort
    - destination_path - directory the renamed copies are written to
    - start_count - sequence number the renaming starts from
    - file_name_length - length of the copied file name (zero padded)
    """
    file_list = path.get_dir_files_name(source_path, path.RETURN_FILE_LIST_DESC)
    # only create the destination directory when there is something to copy
    if len(file_list) >= 1:
        if not path.create_dir(destination_path):
            return False
        # file_list is in descending order (RETURN_FILE_LIST_DESC)
        for file_name in file_list:
            # NOTE(review): incremented before use, so numbering starts at
            # start_count + 1, not start_count — confirm this is intended
            start_count += 1
            file_extension = os.path.splitext(file_name)[1]  # includes the leading "."
            new_file_name = str(("%0" + str(file_name_length) + "d") % start_count) + file_extension
            path.copy_file(os.path.join(source_path, file_name), os.path.join(destination_path, new_file_name))
    # delete the temporary source directory
    path.delete_dir_or_file(source_path)
    return True
def start_download(self): """ 主体下载逻辑 """ # 默认读取配置 if not isinstance(self.replace_if_exist, bool): self.replace_if_exist = DOWNLOAD_REPLACE_IF_EXIST # 同名文件已经存在,直接返回 if not self.replace_if_exist and os.path.exists( self.file_path) and os.path.getsize(self.file_path) > 0: output.print_msg(f"文件{self.file_path}({self.file_url})已存在,跳过") self.status = self.DOWNLOAD_SUCCEED return # 判断保存目录是否存在 if not path.create_dir(os.path.dirname(self.file_path)): self.code = self.CODE_FILE_CREATE_FAILED return # 是否需要分段下载 self.check_auto_multipart_download() # 下载 for retry_count in range(0, NET_CONFIG["DOWNLOAD_RETRY_COUNT"]): if EXIT_FLAG: self.code = self.CODE_PROCESS_EXIT break if not self.is_multipart_download: # 单线程下载 if not self.single_download(): continue else: # 分段下载 if not self.multipart_download(): continue # 如果没有返回文件的长度,直接下载成功 if self.content_length == 0: self.status = self.DOWNLOAD_SUCCEED self.code = 0 return # 判断文件下载后的大小和response中的Content-Length是否一致 file_size = os.path.getsize(self.file_path) if self.content_length == file_size: self.status = self.DOWNLOAD_SUCCEED self.code = 0 return else: self.code = self.CODE_FILE_SIZE_INVALID output.print_msg( f"本地文件{self.file_path}:{self.content_length}和网络文件{self.file_url}:{file_size}不一致" ) time.sleep(NET_CONFIG["HTTP_REQUEST_RETRY_WAIT_TIME"]) # 删除可能出现的临时文件 path.delete_dir_or_file(self.file_path)
def save_net_file_list(file_url_list, file_path, header_list=None, cookies_list=None): """Visit web and save to local(multiple remote resource, single local file) :param file_url_list: the list of remote resource URL which you want to save :param file_path: the local file path which you want to save remote resource :param header_list: customize header dictionary :param cookies_list: customize cookies dictionary, will replaced header_list["Cookie"] :return: status 0 download failure, 1 download successful code failure reason """ # 判断保存目录是否存在 if not path.create_dir(os.path.dirname(file_path)): return False for retry_count in range(0, NET_CONFIG["DOWNLOAD_RETRY_COUNT"]): # 下载 with open(file_path, "wb") as file_handle: for file_url in file_url_list: response = http_request( file_url, header_list=header_list, cookies_list=cookies_list, connection_timeout=NET_CONFIG[ "DOWNLOAD_CONNECTION_TIMEOUT"], read_timeout=NET_CONFIG["DOWNLOAD_READ_TIMEOUT"]) if response.status == HTTP_RETURN_CODE_SUCCEED: file_handle.write(response.data) # 超过重试次数,直接退出 elif response.status == HTTP_RETURN_CODE_RETRY: file_handle.close() path.delete_dir_or_file(file_path) return {"status": 0, "code": -2} # 其他http code,退出 else: file_handle.close() path.delete_dir_or_file(file_path) return {"status": 0, "code": response.status} return {"status": 1, "code": 0} # path.delete_dir_or_file(file_path) return {"status": 0, "code": -2}
def write_file(msg, file_path, append_type=WRITE_FILE_TYPE_APPEND):
    """Write one line of text to a local file (Python 2 variant).

    :param file_path: the path of file
    :param append_type: WRITE_FILE_TYPE_APPEND opens with "a",
        WRITE_FILE_TYPE_REPLACE opens with "w"
    :return: False when the parent directory cannot be created or append_type
        is unknown; None after a successful write.
    """
    file_path = path.change_path_encoding(file_path)
    # make sure the parent directory exists
    if not path.create_dir(os.path.dirname(file_path)):
        return False
    if append_type == WRITE_FILE_TYPE_APPEND:
        open_type = "a"
    elif append_type == WRITE_FILE_TYPE_REPLACE:
        open_type = "w"
    else:
        return False
    with open(file_path, open_type) as file_handle:
        # Python 2: encode unicode text to UTF-8 bytes before writing
        if isinstance(msg, unicode):
            msg = msg.encode("UTF-8")
        file_handle.write(msg + "\n")
def sort_file(source_path, destination_path, start_count, file_name_length): file_list = path.get_dir_files_name(source_path, path.RETURN_FILE_LIST_DESC) # 判断排序目标文件夹是否存在 if len(file_list) >= 1: if not path.create_dir(destination_path): return False # 倒叙排列 for file_name in file_list: start_count += 1 file_type = os.path.splitext(file_name)[1] # 包括 .扩展名 new_file_name = str(("%0" + str(file_name_length) + "d") % start_count) + file_type path.copy_files(os.path.join(source_path, file_name), os.path.join(destination_path, new_file_name)) # 删除临时文件夹 path.delete_dir_or_file(source_path) return True
"</div>").strip() item_introduction = item_introduction.replace("'", "’") output.print_msg("%s %s %s %s" % (item_position, item_name, special_attribute, item_introduction)) item_attribute_list[item_path].append( [item_name, special_attribute, item_introduction]) else: output.print_msg("error get" + item_url) else: output.print_msg("error get" + item_index_url) pagination = tool.find_sub_string(item_index_response.data, '<ul class="ui-pagination">', "</ul>") if pagination: pagination = re.findall('<a href="#page=([\d]*)">', pagination) max_page = 1 for page in pagination: max_page = max(max_page, int(page)) if page_count < max_page: page_count += 1 continue break path.create_dir("data") for item_path in item_attribute_list: with open(path.change_path_encoding("data\%s.txt" % item_list[item_path]), "w") as file_handle: for item in item_attribute_list[item_path]: file_handle.write("\t".join(item) + "\n")
def save_net_file(file_url, file_path, need_content_type=False, header_list=None, cookies_list=None, head_check=True):
    """Visit web and save to local.

    :param file_url: the remote resource URL which you want to save
    :param file_path: the local file path which you want to save remote resource
    :param need_content_type: auto rename file according to "Content-Type" in response headers
    :param header_list: customize header dictionary
    :param cookies_list: customize cookies dictionary, will replace header_list["Cookie"]
    :param head_check: send a "HEAD" request first to check response status and file size before downloading
    :return: False when the destination directory cannot be created, otherwise a dict
        status      0 download failure, 1 download successful
        code        failure reason
        file_path   final local file path (renamed when need_content_type is True)
    """
    file_path = path.change_path_encoding(file_path)
    # make sure the destination directory exists
    if not path.create_dir(os.path.dirname(file_path)):
        return False
    create_file = False
    for retry_count in range(0, HTTP_DOWNLOAD_RETRY_COUNT):
        if head_check:
            request_method = "HEAD"
        else:
            request_method = "GET"
        # fetch the headers (and, for GET, the body)
        response = http_request(file_url, request_method, header_list=header_list, cookies_list=cookies_list,
                                connection_timeout=HTTP_CONNECTION_TIMEOUT, read_timeout=HTTP_READ_TIMEOUT)
        if response.status == HTTP_RETURN_CODE_SUCCEED:
            # todo multipart download
            # reject files that exceed the configured size limit
            content_length = response.getheader("Content-Length")
            if content_length is not None and int(content_length) > HTTP_DOWNLOAD_MAX_SIZE:
                return {"status": 0, "code": -4}
            # use Content-Type from the response headers as the file extension
            if need_content_type:
                content_type = response.getheader("Content-Type")
                if content_type is not None and content_type != "octet-stream":
                    if content_type == "video/quicktime":
                        new_file_type = "mov"
                    else:
                        new_file_type = content_type.split("/")[-1]
                    file_path = os.path.splitext(file_path)[0] + "." + new_file_type
            # after a HEAD probe the body must be re-fetched with a real GET
            if head_check:
                response = http_request(file_url, method="GET", header_list=header_list, cookies_list=cookies_list,
                                        connection_timeout=HTTP_DOWNLOAD_CONNECTION_TIMEOUT,
                                        read_timeout=HTTP_DOWNLOAD_READ_TIMEOUT)
                if response.status != HTTP_RETURN_CODE_SUCCEED:
                    continue
            # write the payload to disk
            with open(file_path, "wb") as file_handle:
                file_handle.write(response.data)
            create_file = True
            # compare the local file size with Content-Length from the response
            if content_length is None:
                return {"status": 1, "code": 0, "file_path": file_path}
            file_size = os.path.getsize(file_path)
            if int(content_length) == file_size:
                return {"status": 1, "code": 0, "file_path": file_path}
            else:
                # NOTE(review): the message pairs the local path with
                # content_length (the remote size) and the URL with file_size
                # (the local size) — the values look swapped; confirm
                output.print_msg("本地文件%s:%s和网络文件%s:%s不一致" % (file_path.encode("UTF-8"), content_length, str(file_url), file_size))
        elif response.status == HTTP_RETURN_CODE_URL_INVALID:
            if create_file:
                path.delete_dir_or_file(file_path)
            return {"status": 0, "code": -1}
        # upstream retry limit exceeded: give up
        elif response.status == HTTP_RETURN_CODE_RETRY:
            if create_file:
                path.delete_dir_or_file(file_path)
            return {"status": 0, "code": -2}
        # any other http code: give up
        else:
            if create_file:
                path.delete_dir_or_file(file_path)
            return {"status": 0, "code": response.status}
    if create_file:
        path.delete_dir_or_file(file_path)
    return {"status": 0, "code": -3}
def save_net_file(file_url, file_path, need_content_type=False, head_check=False, **kwargs):
    """Visit web and save to local.

    :param file_url: the remote resource URL which you want to save
    :param file_path: the local file path which you want to save remote resource
    :param need_content_type: auto rename file according to "Content-Type" in response headers
    :param head_check: send a "HEAD" request first to check response status and file size before downloading
    :return: False when the destination directory cannot be created, otherwise a dict
        status      0 download failure, 1 download successful
        code        failure reason
        file_path   final local file path (renamed when need_content_type is True)
    """
    # make sure the destination directory exists
    if not path.create_dir(os.path.dirname(file_path)):
        return False
    is_create_file = False
    is_multi_thread = False
    return_code = {"status": 0, "code": -3}
    for retry_count in range(0, NET_CONFIG["DOWNLOAD_RETRY_COUNT"]):
        # only the first attempt may be a HEAD probe
        if head_check and retry_count == 0:
            request_method = "HEAD"
        else:
            request_method = "GET"
        # fetch the headers (and, for GET, the body)
        response = http_request(file_url, request_method,
                                connection_timeout=NET_CONFIG["HTTP_CONNECTION_TIMEOUT"],
                                read_timeout=NET_CONFIG["HTTP_READ_TIMEOUT"], **kwargs)
        # any non-success status ends the retry loop
        if response.status != HTTP_RETURN_CODE_SUCCEED:
            # invalid URL format
            if response.status == HTTP_RETURN_CODE_URL_INVALID:
                return_code = {"status": 0, "code": -1}
            # upstream retry limit exceeded
            elif response.status == HTTP_RETURN_CODE_RETRY:
                return_code = {"status": 0, "code": -2}
            # any other http code
            else:
                return_code = {"status": 0, "code": response.status}
            break
        # check the advertised size
        content_length = response.getheader("Content-Length")
        if content_length is not None:
            content_length = int(content_length)
            # exceeds the configured size limit
            if content_length > NET_CONFIG["DOWNLOAD_LIMIT_SIZE"]:
                return {"status": 0, "code": -4}
            # large file: multi-thread download (only safe with head_check=True,
            # otherwise the whole body has already been returned)
            elif head_check and content_length > NET_CONFIG["DOWNLOAD_MULTI_THREAD_MIN_SIZE"]:
                is_multi_thread = True
        # use Content-Type from the response headers as the file extension
        if need_content_type:
            content_type = response.getheader("Content-Type")
            if content_type is not None and content_type != "octet-stream":
                global MIME_DICTIONARY
                # lazily load the MIME -> extension table next to this module
                if MIME_DICTIONARY is None:
                    MIME_DICTIONARY = tool.json_decode(file.read_file(os.path.join(os.path.dirname(__file__), "mime.json")), {})
                if content_type in MIME_DICTIONARY:
                    new_file_type = MIME_DICTIONARY[content_type]
                else:
                    new_file_type = content_type.split("/")[-1]
                file_path = os.path.splitext(file_path)[0] + "." + new_file_type
        if not is_multi_thread:
            # single-thread download; after a HEAD probe the body must be re-fetched with GET
            if head_check:
                response = http_request(file_url, method="GET",
                                        connection_timeout=NET_CONFIG["DOWNLOAD_CONNECTION_TIMEOUT"],
                                        read_timeout=NET_CONFIG["DOWNLOAD_READ_TIMEOUT"], **kwargs)
                if response.status != HTTP_RETURN_CODE_SUCCEED:
                    continue
            # write the payload to disk
            with open(file_path, "wb") as file_handle:
                is_create_file = True
                try:
                    file_handle.write(response.data)
                except OSError as ose:
                    # NOTE(review): str.find returns 0 (falsy) on a match at
                    # position 0 and -1 (truthy) on no match — this condition
                    # looks inverted; confirm against the newer `!= -1` variant
                    if str(ose).find("No space left on device"):
                        global EXIT_FLAG
                        EXIT_FLAG = True
                    raise
        else:
            # multi-thread download
            # per-thread block size: ~1/10 of the file, rounded up to whole MBs
            multi_thread_block_size = int(math.ceil(content_length / 10 / SIZE_MB)) * SIZE_MB
            # NOTE(review): min() over the MIN_BLOCK_SIZE and max() over the
            # MAX_BLOCK_SIZE looks like an inverted clamp — confirm
            multi_thread_block_size = min(NET_CONFIG["DOWNLOAD_MULTI_THREAD_MIN_BLOCK_SIZE"], max(NET_CONFIG["DOWNLOAD_MULTI_THREAD_MAX_BLOCK_SIZE"], multi_thread_block_size))
            # create (truncate) the file first
            with open(file_path, "w"):
                is_create_file = True
            thread_list = []
            error_flag = []
            with open(file_path, "rb+") as file_handle:
                file_no = file_handle.fileno()
                end_pos = -1
                while end_pos < content_length - 1:
                    start_pos = end_pos + 1
                    end_pos = min(content_length - 1, start_pos + multi_thread_block_size - 1)
                    # duplicate the descriptor so every worker owns an independent handle
                    fd_handle = os.fdopen(os.dup(file_no), "rb+", -1)
                    thread = MultiThreadDownload(file_url, start_pos, end_pos, fd_handle, error_flag)
                    thread.start()
                    thread_list.append(thread)
                # wait until every worker has finished
                for thread in thread_list:
                    thread.join()
            # retry when any worker failed, or when the file contains runs of 1K+ empty bytes
            if len(error_flag) > 0:
                continue
            if not _check_multi_thread_download_file(file_path):
                output.print_msg("网络文件%s多线程下载后发现无效字节" % file_url)
                continue
        # no Content-Length returned: nothing to verify against
        if content_length is None:
            return {"status": 1, "code": 0, "file_path": file_path}
        # compare the downloaded size with Content-Length from the response
        file_size = os.path.getsize(file_path)
        if content_length == file_size:
            return {"status": 1, "code": 0, "file_path": file_path}
        else:
            # NOTE(review): the message pairs the local path with
            # content_length (the remote size) and the URL with file_size
            # (the local size) — the values look swapped; confirm
            output.print_msg("本地文件%s:%s和网络文件%s:%s不一致" % (file_path, content_length, file_url, file_size))
            time.sleep(10)
    # clean up any partial file
    if is_create_file:
        path.delete_dir_or_file(file_path)
    return return_code
    def __init__(self, sys_config, extra_config=None):
        """
        Build the crawler's runtime configuration: read config files, set up
        logging, download directories, proxy, cookies, HTTP settings, thread
        primitives, and the optional port/keyboard listener threads.

        :Args:
        - sys_config - dict of SYS_* capability flags declared by the app;
          the process exits when it is not a dict
        - extra_config - optional dict merged over the file-based config
        """
        self.start_time = time.time()
        # the startup configuration must be a dict
        if not isinstance(sys_config, dict):
            self.print_msg("程序启动配置不存在,请检查代码!")
            tool.process_exit()
            return
        # capability flags requested by the application
        sys_download_image = SYS_DOWNLOAD_IMAGE in sys_config
        sys_download_video = SYS_DOWNLOAD_VIDEO in sys_config
        sys_set_proxy = SYS_SET_PROXY in sys_config
        sys_get_cookie = SYS_GET_COOKIE in sys_config
        sys_not_check_save_data = SYS_NOT_CHECK_SAVE_DATA in sys_config
        sys_not_download = SYS_NOT_DOWNLOAD in sys_config
        # frozen executable: work relative to the executable's directory
        if tool.IS_EXECUTABLE:
            application_path = os.path.dirname(sys.executable)
            os.chdir(application_path)
            config_path = os.path.join(os.getcwd(), "data/config.ini")
        else:
            config_path = tool.PROJECT_CONFIG_PATH
        # main configuration
        config = read_config(config_path)
        # application-level configuration overrides
        app_config_path = os.path.abspath(
            os.path.join(tool.PROJECT_APP_PATH, "app.ini"))
        if os.path.exists(app_config_path):
            config.update(read_config(app_config_path))
        # extra configuration overrides
        if isinstance(extra_config, dict):
            config.update(extra_config)
        # app-specific config entries: (name, default, analysis mode) triples
        self.app_config = {}
        if SYS_APP_CONFIG in sys_config and len(
                sys_config[SYS_APP_CONFIG]) > 0:
            for app_config_template in sys_config[SYS_APP_CONFIG]:
                if len(app_config_template) == 3:
                    self.app_config[app_config_template[0]] = analysis_config(
                        config, app_config_template[0],
                        app_config_template[1], app_config_template[2])
        # logging switches, mirrored into the log module
        log.IS_SHOW_ERROR = self.is_show_error = analysis_config(
            config, "IS_SHOW_ERROR", True, CONFIG_ANALYSIS_MODE_BOOLEAN)
        log.IS_SHOW_STEP = self.is_show_step = analysis_config(
            config, "IS_SHOW_STEP", True, CONFIG_ANALYSIS_MODE_BOOLEAN)
        log.IS_SHOW_TRACE = self.is_show_trace = analysis_config(
            config, "IS_SHOW_TRACE", False, CONFIG_ANALYSIS_MODE_BOOLEAN)
        error_log_path = analysis_config(config, "ERROR_LOG_PATH",
                                         "\\log/errorLog.txt",
                                         CONFIG_ANALYSIS_MODE_PATH)
        log.ERROR_LOG_PATH = self.error_log_path = replace_path(error_log_path)
        error_log_dir = os.path.dirname(self.error_log_path)
        if not path.create_dir(error_log_dir):
            self.print_msg("创建错误日志目录 %s 失败" % error_log_dir)
            tool.process_exit()
            return
        is_log_step = analysis_config(config, "IS_LOG_STEP", True,
                                      CONFIG_ANALYSIS_MODE_BOOLEAN)
        if not is_log_step:
            log.STEP_LOG_PATH = self.step_log_path = ""
        else:
            step_log_path = analysis_config(config, "STEP_LOG_PATH",
                                            "\\log/stepLog.txt",
                                            CONFIG_ANALYSIS_MODE_PATH)
            log.STEP_LOG_PATH = self.step_log_path = replace_path(
                step_log_path)
            # directory the step log is written to
            step_log_dir = os.path.dirname(self.step_log_path)
            if not path.create_dir(step_log_dir):
                self.print_msg("创建步骤日志目录 %s 失败" % step_log_dir)
                tool.process_exit()
                return
        is_log_trace = analysis_config(config, "IS_LOG_TRACE", True,
                                       CONFIG_ANALYSIS_MODE_BOOLEAN)
        if not is_log_trace:
            log.TRACE_LOG_PATH = self.trace_log_path = ""
        else:
            trace_log_path = analysis_config(config, "TRACE_LOG_PATH",
                                             "\\log/traceLog.txt",
                                             CONFIG_ANALYSIS_MODE_PATH)
            log.TRACE_LOG_PATH = self.trace_log_path = replace_path(
                trace_log_path)
            # directory the trace log is written to
            trace_log_dir = os.path.dirname(self.trace_log_path)
            if not path.create_dir(trace_log_dir):
                self.print_msg("创建调试日志目录 %s 失败" % trace_log_dir)
                tool.process_exit()
                return
        # download switches: config value AND the app's capability flag
        self.is_download_image = analysis_config(
            config, "IS_DOWNLOAD_IMAGE", True,
            CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_image
        self.is_download_video = analysis_config(
            config, "IS_DOWNLOAD_VIDEO", True,
            CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_video
        if not sys_not_download and not self.is_download_image and not self.is_download_video:
            if sys_download_image or sys_download_video:
                self.print_msg("所有支持的下载都没有开启,请检查配置!")
                tool.process_exit()
                return
        # save-data archive
        self.save_data_path = analysis_config(config, "SAVE_DATA_PATH",
                                              "\\\\info/save.data",
                                              CONFIG_ANALYSIS_MODE_PATH)
        if not sys_not_check_save_data and not os.path.exists(
                self.save_data_path):
            # the save-data file is missing
            self.print_msg("存档文件%s不存在!" % self.save_data_path)
            tool.process_exit()
            return
        # timestamped temporary copy of the save-data file
        temp_file_name = time.strftime(
            "%m-%d_%H_%M_", time.localtime(time.time())) + os.path.basename(
                self.save_data_path)
        self.temp_save_data_path = os.path.join(
            os.path.dirname(self.save_data_path), temp_file_name)
        if os.path.exists(self.temp_save_data_path):
            # the temporary file already exists
            self.print_msg("存档临时文件%s已存在!" % self.temp_save_data_path)
            tool.process_exit()
            return
        # session data
        self.session_data_path = analysis_config(config, "SESSION_DATA_PATH",
                                                 "\\\\info/session.data",
                                                 CONFIG_ANALYSIS_MODE_PATH)
        # image download directory, only when image download is enabled
        if self.is_download_image:
            self.image_download_path = analysis_config(
                config, "IMAGE_DOWNLOAD_PATH", "\\\\photo",
                CONFIG_ANALYSIS_MODE_PATH)
            if not path.create_dir(self.image_download_path):
                # image directory creation failed
                self.print_msg("图片保存目录%s创建失败!" % self.image_download_path)
                tool.process_exit()
                return
        else:
            self.image_download_path = ""
        # video download directory, only when video download is enabled
        if self.is_download_video:
            self.video_download_path = analysis_config(
                config, "VIDEO_DOWNLOAD_PATH", "\\\\video",
                CONFIG_ANALYSIS_MODE_PATH)
            if not path.create_dir(self.video_download_path):
                # video directory creation failed
                self.print_msg("视频保存目录%s创建失败!" % self.video_download_path)
                tool.process_exit()
                return
        else:
            self.video_download_path = ""
        # proxy: 1 = always, 2 = only when the app requested it
        is_proxy = analysis_config(config, "IS_PROXY", 2,
                                   CONFIG_ANALYSIS_MODE_INTEGER)
        if is_proxy == 1 or (is_proxy == 2 and sys_set_proxy):
            proxy_ip = analysis_config(config, "PROXY_IP", "127.0.0.1")
            proxy_port = analysis_config(config, "PROXY_PORT", "8087")
            # connection pool that goes through the proxy
            net.set_proxy(proxy_ip, proxy_port)
        else:
            # plain urllib3 connection pool
            net.init_http_connection_pool()
        # cookies pulled from a local browser profile
        self.cookie_value = {}
        if sys_get_cookie:
            # OS & browser selection
            browser_type = analysis_config(config, "BROWSER_TYPE", 2,
                                           CONFIG_ANALYSIS_MODE_INTEGER)
            is_auto_get_cookie = analysis_config(config, "IS_AUTO_GET_COOKIE",
                                                 True,
                                                 CONFIG_ANALYSIS_MODE_BOOLEAN)
            if is_auto_get_cookie:
                cookie_path = browser.get_default_browser_cookie_path(
                    browser_type)
            else:
                cookie_path = analysis_config(config, "COOKIE_PATH", "")
            all_cookie_from_browser = browser.get_all_cookie_from_browser(
                browser_type, cookie_path)
            for cookie_domain in sys_config[SYS_GET_COOKIE]:
                # specific cookie keys were requested for this domain
                if sys_config[SYS_GET_COOKIE][cookie_domain]:
                    for cookie_key in sys_config[SYS_GET_COOKIE][
                            cookie_domain]:
                        self.cookie_value[cookie_key] = ""
                    if cookie_domain in all_cookie_from_browser:
                        for cookie_name in self.cookie_value:
                            if cookie_name in all_cookie_from_browser[
                                    cookie_domain]:
                                self.cookie_value[
                                    cookie_name] = all_cookie_from_browser[
                                        cookie_domain][cookie_name]
                # no keys specified: take every cookie of the domain
                else:
                    if cookie_domain in all_cookie_from_browser:
                        for cookie_name in all_cookie_from_browser[
                                cookie_domain]:
                            self.cookie_value[
                                cookie_name] = all_cookie_from_browser[
                                    cookie_domain][cookie_name]
        # HTTP settings, mirrored into the net module
        net.HTTP_CONNECTION_TIMEOUT = analysis_config(
            config, "HTTP_CONNECTION_TIMEOUT", 10,
            CONFIG_ANALYSIS_MODE_INTEGER)
        net.HTTP_READ_TIMEOUT = analysis_config(config, "HTTP_READ_TIMEOUT",
                                                10,
                                                CONFIG_ANALYSIS_MODE_INTEGER)
        # NOTE(review): "HTTP_DOWLOAD_..." looks like a typo for
        # "HTTP_DOWNLOAD_..." — must stay as-is unless the config files are
        # renamed in lockstep
        net.HTTP_DOWNLOAD_CONNECTION_TIMEOUT = analysis_config(
            config, "HTTP_DOWLOAD_CONNECTION_TIMEOUT", 10,
            CONFIG_ANALYSIS_MODE_INTEGER)
        net.HTTP_DOWNLOAD_READ_TIMEOUT = analysis_config(
            config, "HTTP_DOWLOAD_READ_TIMEOUT", 60,
            CONFIG_ANALYSIS_MODE_INTEGER)
        net.HTTP_REQUEST_RETRY_COUNT = analysis_config(
            config, "HTTP_REQUEST_RETRY_COUNT", 10,
            CONFIG_ANALYSIS_MODE_INTEGER)
        # threading primitives
        self.thread_count = analysis_config(config, "THREAD_COUNT", 10,
                                            CONFIG_ANALYSIS_MODE_INTEGER)
        self.thread_lock = threading.Lock()  # guards shared global state
        self.thread_condition = threading.Condition(
        )  # wait() when the thread count hits the limit, until notify()
        # listener thread that can pause/resume/stop downloads via a local port
        # NOTE(review): "ENVET" looks like a typo for "EVENT" in the config key
        if analysis_config(config, "IS_PORT_LISTENER_ENVET", True,
                           CONFIG_ANALYSIS_MODE_BOOLEAN):
            listener_event_bind = {}
            # pause the process
            listener_event_bind[str(
                portListenerEvent.PROCESS_STATUS_PAUSE)] = net.pause_request
            # resume the process
            listener_event_bind[str(
                portListenerEvent.PROCESS_STATUS_RUN)] = net.resume_request
            # stop the process (cancel pending threads, finish current work)
            listener_event_bind[str(
                portListenerEvent.PROCESS_STATUS_STOP)] = self.stop_process
            listener_port = analysis_config(config, "LISTENER_PORT", 12345,
                                            CONFIG_ANALYSIS_MODE_INTEGER)
            process_control_thread = portListenerEvent.PortListenerEvent(
                port=listener_port, event_list=listener_event_bind)
            process_control_thread.setDaemon(True)
            process_control_thread.start()
        # keyboard listener thread (Windows only)
        if platform.system() == "Windows" and analysis_config(
                config, "IS_KEYBOARD_EVENT", True,
                CONFIG_ANALYSIS_MODE_BOOLEAN):
            keyboard_event_bind = {}
            pause_process_key = analysis_config(config,
                                                "PAUSE_PROCESS_KEYBOARD_KEY",
                                                "F9")
            # pause the process
            if pause_process_key:
                keyboard_event_bind[pause_process_key] = net.pause_request
            # resume the process
            continue_process_key = analysis_config(
                config, "CONTINUE_PROCESS_KEYBOARD_KEY", "F10")
            if continue_process_key:
                keyboard_event_bind[continue_process_key] = net.resume_request
            # stop the process (cancel pending threads, finish current work)
            stop_process_key = analysis_config(config,
                                               "STOP_PROCESS_KEYBOARD_KEY",
                                               "CTRL + F12")
            if stop_process_key:
                keyboard_event_bind[stop_process_key] = self.stop_process
            if keyboard_event_bind:
                keyboard_control_thread = keyboardEvent.KeyboardEvent(
                    keyboard_event_bind)
                keyboard_control_thread.setDaemon(True)
                keyboard_control_thread.start()
        # running totals
        self.total_image_count = 0
        self.total_video_count = 0
        self.print_msg("初始化完成")
def download(file_url, file_path, recheck_file_extension=False, head_check=False, replace_if_exist: Optional[bool] = None, **kwargs):
    """
    Download a remote file to the local file system.

    :Args:
    - file_url - the remote resource URL which you want to save
    - file_path - the local file path which you want to save remote resource
    - recheck_file_extension - auto rename the file according to "Content-Type" in the response headers
    - head_check - send a "HEAD" request first to check response status and file size before downloading the body
    - replace_if_exist - overwrite an existing non-empty local file
      (defaults to net.DOWNLOAD_REPLACE_IF_EXIST when not a bool)

    :Returns: dict with
    - status - 0 download failure, 1 download successful
    - code - failure reason: -1 invalid URL, -2 upstream retries exhausted,
      -3 retry loop exhausted, -4 file exceeds the size limit, -11 the
      destination directory could not be created, otherwise the http status
    - file_path - final local file path (when recheck_file_extension is True it may be renamed)
    """
    if not isinstance(replace_if_exist, bool):
        replace_if_exist = net.DOWNLOAD_REPLACE_IF_EXIST
    # a non-empty file with the same name already exists: skip the download
    if not replace_if_exist and os.path.exists(
            file_path) and os.path.getsize(file_path) > 0:
        output.print_msg(f"文件{file_path}({file_url})已存在,跳过")
        return {"status": 1, "code": 0, "file_path": file_path}
    # make sure the destination directory exists
    # fix: report this as a failure (status 0); the original returned
    # status 1, which marked the error as success
    if not path.create_dir(os.path.dirname(file_path)):
        return {"status": 0, "code": -11, "file_path": file_path}
    is_create_file = False
    is_multi_thread = False
    return_code = {"status": 0, "code": -3}
    for retry_count in range(0, net.NET_CONFIG["DOWNLOAD_RETRY_COUNT"]):
        # only the first attempt may be a HEAD probe
        if head_check and retry_count == 0:
            request_method = "HEAD"
        else:
            request_method = "GET"
        # fetch the headers (and, for GET, the body)
        response = net.request(
            file_url, request_method, is_check_qps=False,
            connection_timeout=net.NET_CONFIG["HTTP_CONNECTION_TIMEOUT"],
            read_timeout=net.NET_CONFIG["HTTP_READ_TIMEOUT"], **kwargs)
        # any non-success status ends the retry loop
        if response.status != net.HTTP_RETURN_CODE_SUCCEED:
            # invalid URL format
            if response.status == net.HTTP_RETURN_CODE_URL_INVALID:
                return_code = {"status": 0, "code": -1}
            # upstream retry limit exceeded
            elif response.status == net.HTTP_RETURN_CODE_RETRY:
                return_code = {"status": 0, "code": -2}
            # any other http code
            else:
                return_code = {"status": 0, "code": response.status}
            break
        # check the advertised size
        content_length = response.getheader("Content-Length")
        if content_length is not None:
            content_length = int(content_length)
            # fix: qualify NET_CONFIG with the net module, matching every
            # other access in this function (bare NET_CONFIG is not in scope)
            if content_length > net.NET_CONFIG["DOWNLOAD_LIMIT_SIZE"]:
                return {"status": 0, "code": -4}
            # large file: multi-thread download (only safe with head_check=True,
            # otherwise the whole body has already been returned)
            elif head_check and content_length > net.NET_CONFIG["DOWNLOAD_MULTI_THREAD_MIN_SIZE"]:
                is_multi_thread = True
        # use Content-Type from the response headers as the file extension
        if recheck_file_extension:
            content_type = response.getheader("Content-Type")
            if content_type is not None and content_type != "octet-stream":
                # lazily load the MIME -> extension table next to this module
                if net.MIME_DICTIONARY is None:
                    net.MIME_DICTIONARY = tool.json_decode(
                        file.read_file(
                            os.path.join(os.path.dirname(__file__), "mime.json")), {})
                if content_type in net.MIME_DICTIONARY:
                    new_file_extension = net.MIME_DICTIONARY[content_type]
                else:
                    new_file_extension = content_type.split("/")[-1]
                file_path = os.path.splitext(file_path)[0] + "." + new_file_extension
        if not is_multi_thread:
            # single-thread download; after a HEAD probe the body must be
            # re-fetched with a real GET
            if head_check:
                response = net.request(
                    file_url, method="GET",
                    connection_timeout=net.NET_CONFIG["DOWNLOAD_CONNECTION_TIMEOUT"],
                    read_timeout=net.NET_CONFIG["DOWNLOAD_READ_TIMEOUT"], **kwargs)
                if response.status != net.HTTP_RETURN_CODE_SUCCEED:
                    continue
            # write the payload to disk
            with open(file_path, "wb") as file_handle:
                is_create_file = True
                try:
                    file_handle.write(response.data)
                except OSError as ose:
                    # a full disk is fatal for the whole process
                    if str(ose).find("No space left on device") != -1:
                        net.EXIT_FLAG = True
                    raise
        else:
            # multi-thread download: create (truncate) the file first
            with open(file_path, "w"):
                is_create_file = True
            thread_list = []
            error_flag = []
            with open(file_path, "rb+") as file_handle:
                file_no = file_handle.fileno()
                end_pos = -1
                while end_pos < content_length - 1:
                    start_pos = end_pos + 1
                    end_pos = min(
                        content_length - 1,
                        start_pos + net.NET_CONFIG["DOWNLOAD_MULTI_THREAD_BLOCK_SIZE"] - 1)
                    # duplicate the descriptor so every worker owns an independent handle
                    fd_handle = os.fdopen(os.dup(file_no), "rb+", -1)
                    thread = MultiThreadDownload(file_url, start_pos, end_pos, fd_handle, error_flag)
                    thread.start()
                    thread_list.append(thread)
                # wait until every worker has finished
                for thread in thread_list:
                    thread.join()
            # retry when any worker failed, or when the file contains runs of 1K+ empty bytes
            if len(error_flag) > 0:
                continue
            if not _check_multi_thread_download_file(file_path):
                output.print_msg(f"网络文件{file_url}多线程下载后发现无效字节")
                continue
        # no Content-Length returned: nothing to verify against
        if content_length is None:
            return {"status": 1, "code": 0, "file_path": file_path}
        # compare the downloaded size with Content-Length from the response
        file_size = os.path.getsize(file_path)
        if content_length == file_size:
            return {"status": 1, "code": 0, "file_path": file_path}
        else:
            # fix: pair the local path with the local size and the URL with
            # the remote size (the original message had them swapped)
            output.print_msg(
                f"本地文件{file_path}:{file_size}和网络文件{file_url}:{content_length}不一致"
            )
            time.sleep(net.NET_CONFIG["HTTP_REQUEST_RETRY_WAIT_TIME"])
    # clean up any partial file
    if is_create_file:
        path.delete_dir_or_file(file_path)
    return return_code