def get_config(config, key, default_value, mode):
    """Read one option from the "setting" section and coerce it by mode.

    Args:
        config: parser-like object exposing has_option(section, key) and
            get(section, key).
        key: option name inside the "setting" section.
        default_value: value used when the option is missing, or (mode 1)
            when the stored value is not an integer.
        mode: how the value is interpreted:
            0 - returned unchanged;
            1 - integer: an int is kept, a str made only of digits is
                converted, anything else falls back to default_value;
            2 - boolean: empty values, "0" and "false" (any case) become
                False, everything else True;
            3 - path: a leading "\\\\" marks a path relative to the
                current working directory, a leading "\\" marks a path
                relative to tool.PROJECT_ROOT_PATH; the result is passed
                through os.path.realpath.

    Returns:
        The coerced value.
    """
    if config.has_option("setting", key):
        value = config.get("setting", key).encode("UTF-8")
    else:
        tool.print_msg("配置文件config.ini中没有找到key为'" + key + "'的参数,使用程序默认设置")
        value = default_value

    if mode == 1:
        if isinstance(value, int):
            pass
        elif isinstance(value, str) and value.isdigit():
            value = int(value)
        else:
            tool.print_msg("配置文件config.ini中key为'" + key + "'的值必须是一个整数,使用程序默认设置")
            value = default_value
    elif mode == 2:
        if not value or value == "0" or (isinstance(value, str) and value.lower() == "false"):
            value = False
        else:
            value = True
    elif mode == 3:
        # Slices are used instead of value[0] so that an empty path value
        # does not raise IndexError; it simply resolves to the current
        # directory through realpath below.
        if value[:2] == "\\\\":
            # The two backslashes are only a marker and are stripped.
            value = os.path.join(os.path.abspath(""), value[2:])
        elif value[:1] == "\\":
            # The single backslash is only a marker and is stripped.
            value = os.path.join(tool.PROJECT_ROOT_PATH, value[1:])
        value = os.path.realpath(value)
    return value
def get_config(config, key, default_value, mode):
    """Read one option from the "setting" section and coerce it by mode.

    Args:
        config: parser-like object exposing has_option(section, key) and
            get(section, key).
        key: option name inside the "setting" section.
        default_value: value used when the option is missing, or (mode 1)
            when the stored value is not an integer.
        mode: how the value is interpreted:
            0 - returned unchanged;
            1 - integer: an int is kept, a str made only of digits is
                converted, anything else falls back to default_value;
            2 - boolean: empty values, "0" and "false" (any case) become
                False, everything else True;
            3 - path: a leading "\\" marks a path relative to the current
                working directory; the result is passed through
                os.path.realpath.

    Returns:
        The coerced value.
    """
    if config.has_option("setting", key):
        value = config.get("setting", key).encode("utf-8")
    else:
        tool.print_msg("配置文件config.ini中没有找到key为'" + key + "'的参数,使用程序默认设置")
        value = default_value

    if mode == 1:
        if isinstance(value, int):
            pass
        elif isinstance(value, str) and value.isdigit():
            value = int(value)
        else:
            tool.print_msg("配置文件config.ini中key为'" + key + "'的值必须是一个整数,使用程序默认设置")
            value = default_value
    elif mode == 2:
        if not value or value == "0" or (isinstance(value, str) and value.lower() == "false"):
            value = False
        else:
            value = True
    elif mode == 3:
        # A slice is used instead of value[0] so that an empty path value
        # does not raise IndexError; it simply resolves to the current
        # directory through realpath below.
        if value[:1] == "\\":
            # The leading backslash is only a marker and is stripped.
            value = os.path.join(os.path.abspath(""), value[1:])
        value = os.path.realpath(value)
    return value
def __init__(self):
    """Initialise the robot.Robot base with this crawler's system settings.

    Both robot.SYS_DOWNLOAD_IMAGE and robot.SYS_NOT_CHECK_SAVE_DATA are
    switched on, then a completion message is printed.
    """
    sys_config = dict()
    sys_config[robot.SYS_DOWNLOAD_IMAGE] = True
    sys_config[robot.SYS_NOT_CHECK_SAVE_DATA] = True
    robot.Robot.__init__(self, sys_config)
    tool.print_msg("配置文件读取完成")
def trace(msg):
    """Emit a timestamped trace message.

    The message is printed when IS_SHOW_TRACE is set, and appended to
    TRACE_LOG_PATH (while holding thread_lock) when that path is non-empty.
    """
    timestamp = tool.get_time()
    line = timestamp + " " + str(msg)
    if IS_SHOW_TRACE:
        tool.print_msg(line, False)
    if TRACE_LOG_PATH == "":
        return
    # The lock is released even if the write raises; the error propagates.
    with thread_lock:
        tool.write_file(line, TRACE_LOG_PATH)
def step(msg):
    """Emit a timestamped step message.

    The message is printed when IS_SHOW_STEP is set, and appended to
    STEP_LOG_PATH (while holding thread_lock) when that path is non-empty.
    """
    timestamp = tool.get_time()
    line = timestamp + " " + str(msg)
    if IS_SHOW_STEP:
        tool.print_msg(line, False)
    if STEP_LOG_PATH == "":
        return
    # The lock is released even if the write raises; the error propagates.
    with thread_lock:
        tool.write_file(line, STEP_LOG_PATH)
def error(msg):
    """Emit a timestamped message tagged "[Error]".

    The message is printed when IS_SHOW_ERROR is set, and appended to
    ERROR_LOG_PATH (while holding thread_lock) when that path is non-empty.
    """
    timestamp = tool.get_time()
    line = timestamp + " [Error] " + str(msg)
    if IS_SHOW_ERROR:
        tool.print_msg(line, False)
    if ERROR_LOG_PATH == "":
        return
    # The lock is released even if the write raises; the error propagates.
    with thread_lock:
        tool.write_file(line, ERROR_LOG_PATH)
def get_member_list():
    """Print the members found on the blog.nogizaka46.com index page.

    For every match, the captured href fragment and the image alt text
    (with spaces removed) are printed separated by tabs, followed by a hint
    about copying the lines into save.data. Nothing is printed when the
    request result code is falsy. Always returns None.
    """
    index_url = "http://blog.nogizaka46.com/"
    index_return_code, index_page = tool.http_request(index_url)[:2]
    if not index_return_code:
        return None

    member_pattern = '<div class="unit"><a href="./([^"]*)"><img src="[^>]*alt="([^"]*)" />'
    member_list_find = re.findall(member_pattern, index_page)
    for account, display_name in member_list_find:
        tool.print_msg("%s\t\t\t%s" % (account, display_name.replace(" ", "")), False)
    if member_list_find:
        tool.print_msg("复制以上内容到save.data中,删除不需要的行,即可开始运行", False)
    return None
def get_member_list():
    """Print the members found on the keyakizaka46.com member diary page.

    The '<ul class="thumb">' list is extracted from the page and, for every
    <li> entry, the value following "&ct=" and the text of the
    '<p class="name">' element (stripped, spaces removed) are printed
    separated by tabs, followed by a hint about copying the lines into
    save.data. Nothing is printed when the request result code is falsy or
    the list cannot be found. Always returns None.
    """
    index_url = "http://www.keyakizaka46.com/mob/news/diarShw.php?cd=member"
    index_return_code, index_page = tool.http_request(index_url)[:2]
    if not index_return_code:
        return None

    member_list_data = tool.find_sub_string(index_page, '<ul class="thumb">', "</ul>")
    if not member_list_data:
        return None

    # Raw string: "\S" and "\s" are regex classes, not string escapes.
    # The pattern itself is unchanged.
    member_list_find = re.findall(r"<li ([\S|\s]*?)</li>", member_list_data)
    for member_info in member_list_find:
        ct = tool.find_sub_string(member_info, "&ct=", '">')
        name = tool.find_sub_string(member_info, '<p class="name">', "</p>").strip().replace(" ", "")
        tool.print_msg("%s\t\t\t%s" % (ct, name), False)
    if member_list_find:
        tool.print_msg("复制以上内容到save.data中,删除不需要的行,即可开始运行", False)
    return None
def save_net_file(file_url, file_path, need_content_type=False, header_list=None, cookies_list=None):
    """Download file_url to file_path, trying up to five times.

    Args:
        file_url: address of the remote file.
        file_path: local destination; converted with
            tool.change_path_encoding before use.
        need_content_type: when true, the file extension is replaced by the
            subtype of the response Content-Type header (unless that subtype
            is "octet-stream").
        header_list: forwarded to http_request.
        cookies_list: forwarded to http_request.

    Returns:
        False when the destination directory cannot be created. Otherwise a
        dict: {"status": 1, "code": 0, "file_path": path} on success, or
        {"status": 0, "code": c} on failure, where c is -1 for
        HTTP_RETURN_CODE_URL_INVALID, -2 for HTTP_RETURN_CODE_RETRY, -3 when
        all attempts are used up, or the unexpected HTTP status itself.
    """
    file_path = tool.change_path_encoding(file_path)
    # Make sure the destination directory exists.
    if not tool.make_dir(os.path.dirname(file_path), 0):
        return False

    create_file = False
    for _ in range(5):
        response = http_request(file_url, header_list=header_list, cookies_list=cookies_list, read_timeout=60)
        if response.status == HTTP_RETURN_CODE_SUCCEED:
            # Use the Content-Type of the response as the file extension.
            if need_content_type:
                content_type = response.getheader("Content-Type")
                if content_type is not None:
                    # Drop parameters such as "; charset=..." and keep only
                    # the subtype, so "application/octet-stream" is
                    # recognised and never becomes an extension.
                    subtype = content_type.split(";")[0].split("/")[-1].strip()
                    if subtype and subtype != "octet-stream":
                        file_path = os.path.splitext(file_path)[0] + "." + subtype

            # Write the payload to disk.
            with open(file_path, "wb") as file_handle:
                file_handle.write(response.data)
            create_file = True

            # Compare the size on disk with the Content-Length header.
            content_length = response.getheader("Content-Length")
            if content_length is None:
                return {"status": 1, "code": 0, "file_path": file_path}
            file_size = os.path.getsize(file_path)
            if int(content_length) == file_size:
                return {"status": 1, "code": 0, "file_path": file_path}
            # Local file is reported with its size on disk, remote file
            # with its Content-Length; then the download is retried.
            tool.print_msg("本地文件%s:%s和网络文件%s:%s不一致" % (file_path, file_size, file_url, content_length))
        elif response.status == HTTP_RETURN_CODE_URL_INVALID:
            if create_file:
                os.remove(file_path)
            return {"status": 0, "code": -1}
        # http_request exceeded its retry limit: give up.
        elif response.status == HTTP_RETURN_CODE_RETRY:
            if create_file:
                os.remove(file_path)
            return {"status": 0, "code": -2}
        # 5xx server errors: try again.
        elif response.status in [500, 502, 503, 504]:
            pass
        # Any other HTTP status: give up.
        else:
            if create_file:
                os.remove(file_path)
            return {"status": 0, "code": response.status}

    # All attempts used; discard any partial or mismatching file.
    if create_file:
        os.remove(file_path)
    return {"status": 0, "code": -3}
def print_msg(self, msg):
    """Send msg to self.print_function, or to tool.print_msg when unset."""
    if self.print_function is not None:
        self.print_function(msg)
        return
    tool.print_msg(msg, True)
def http_request(url, method="GET", post_data=None, binary_data=None, header_list=None, cookies_list=None, connection_timeout=HTTP_CONNECTION_TIMEOUT, read_timeout=HTTP_CONNECTION_TIMEOUT, is_random_ip=True, json_decode=False, encode_multipart=False, redirect=True, exception_return=""):
    """Send an HTTP request through the shared HTTP_CONNECTION_POOL.

    Args:
        url: must start with "http://" or "https://".
        method: HTTP verb (upper-cased here); one of GET, POST, HEAD, PUT,
            DELETE, OPTIONS, TRACE.
        post_data: sent as ``fields`` for POST when binary_data is None.
        binary_data: sent as ``body`` for POST when given.
        header_list: dict of request headers. NOTE(review): a dict passed by
            the caller is modified in place (User-Agent, X-Forwarded-For,
            X-Real-Ip and Cookie entries are written into it).
        cookies_list: when truthy, converted with build_header_cookie_string
            into the Cookie header.
        connection_timeout: connect timeout; 0 leaves it unset.
        read_timeout: read timeout; 0 leaves it unset.
        is_random_ip: when true, a value from _random_ip_address() is sent
            as X-Forwarded-For and X-Real-Ip.
        json_decode: when true and the status equals
            HTTP_RETURN_CODE_SUCCEED, the body is parsed into
            response.json_data; if parsing fails the status is replaced by
            HTTP_RETURN_CODE_JSON_DECODE_ERROR.
        encode_multipart: forwarded to the pool for POST requests.
        redirect: forwarded to the pool.
        exception_return: when non-empty and contained in the text of an
            otherwise unhandled exception, the call returns
            ErrorResponse(HTTP_RETURN_CODE_EXCEPTION_CATCH).

    Returns:
        The response object from the pool, or an ErrorResponse carrying one
        of the HTTP_RETURN_CODE_* values.
    """
    # Reject anything that is not an absolute http(s) URL.
    if not (url.find("http://") == 0 or url.find("https://") == 0):
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    method = method.upper()
    # An unsupported verb is reported with the same code as an invalid URL.
    if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]:
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    # Create the connection pool on first use.
    if HTTP_CONNECTION_POOL is None:
        init_http_connection_pool()

    # NOTE(review): retry_count is initialised here but is neither
    # incremented nor checked anywhere in this span - confirm where the
    # retry limit is enforced.
    retry_count = 0
    while True:
        # Honour the global pause / stop switches before every attempt.
        while process.PROCESS_STATUS == process.PROCESS_STATUS_PAUSE:
            time.sleep(10)
        if process.PROCESS_STATUS == process.PROCESS_STATUS_STOP:
            tool.process_exit(0)

        if header_list is None:
            header_list = {}

        # Set the User-Agent unless the caller supplied one.
        if "User-Agent" not in header_list:
            header_list["User-Agent"] = _random_user_agent()

        # Set a random IP; a new one is generated on every attempt.
        if is_random_ip:
            random_ip = _random_ip_address()
            header_list["X-Forwarded-For"] = random_ip
            header_list["X-Real-Ip"] = random_ip

        # Set the cookie header.
        if cookies_list:
            header_list["Cookie"] = build_header_cookie_string(cookies_list)

        try:
            # A timeout of 0 means that part of the timeout is left unset.
            if connection_timeout == 0 and read_timeout == 0:
                timeout = None
            elif connection_timeout == 0:
                timeout = urllib3.Timeout(read=read_timeout)
            elif read_timeout == 0:
                timeout = urllib3.Timeout(connect=connection_timeout)
            else:
                timeout = urllib3.Timeout(connect=connection_timeout, read=read_timeout)

            if method == "POST":
                # Form fields and a raw body are mutually exclusive.
                if binary_data is None:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout, fields=post_data, encode_multipart=encode_multipart)
                else:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout, body=binary_data, encode_multipart=encode_multipart)
            else:
                response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout)
            if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode:
                try:
                    response.json_data = json.loads(response.data)
                except ValueError:
                    # Second attempt: decode the body with the charset named
                    # in the Content-Type header, then parse again.
                    is_error = True
                    content_type = response.getheader("Content-Type")
                    if content_type is not None:
                        charset = tool.find_sub_string(content_type, "charset=", None)
                        if charset:
                            # "gb2312" bodies are decoded as GBK.
                            if charset == "gb2312":
                                charset = "GBK"
                            try:
                                response.json_data = json.loads(response.data.decode(charset))
                            # NOTE(review): bare except - any failure here is
                            # treated as "still not decodable".
                            except:
                                pass
                            else:
                                is_error = False
                    if is_error:
                        response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR
            return response
        except urllib3.exceptions.ProxyError:
            # Ask the operator whether to continue or stop; any other answer
            # falls through and the loop runs again.
            notice = "无法访问代理服务器,请检查代理设置。检查完成后输入(C)ontinue继续程序或者(S)top退出程序:"
            input_str = tool.console_input(notice).lower()
            if input_str in ["c", "continue"]:
                pass
            elif input_str in ["s", "stop"]:
                tool.process_exit(0)
        except urllib3.exceptions.ReadTimeoutError:
            # Read timeout: run the loop again.
            pass
        except urllib3.exceptions.ConnectTimeoutError, e:
            # The domain name could not be resolved.
            if str(e).find("[Errno 11004] getaddrinfo failed") >= 0:
                return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
            pass
        # Disabled handlers kept for reference:
        # except urllib3.exceptions.MaxRetryError, e:
        #     print_msg(url)
        #     print_msg(str(e))
        #     # endless redirect
        #     # if str(e).find("Caused by ResponseError('too many redirects',)") >= 0:
        #     #     return ErrorResponse(-1)
        # except urllib3.exceptions.ConnectTimeoutError, e:
        #     print_msg(str(e))
        #     print_msg(url + " 访问超时,稍后重试")
        #     # the domain name could not be resolved
        #     # if str(e).find("[Errno 11004] getaddrinfo failed") >= 0:
        #     #     return ErrorResponse(-2)
        # except urllib3.exceptions.ProtocolError, e:
        #     print_msg(str(e))
        #     print_msg(url + " 访问超时,稍后重试")
        #     # the connection was aborted
        #     # if str(e).find("'Connection aborted.', error(10054,") >= 0:
        #     #     return ErrorResponse(-3)
        except Exception, e:
            # A caller-designated exception text ends the request at once.
            if exception_return and str(e).find(exception_return) >= 0:
                return ErrorResponse(HTTP_RETURN_CODE_EXCEPTION_CATCH)
            # Wait 30 seconds after this SSL error before going on.
            elif str(e).find("EOF occurred in violation of protocol") >=0:
                time.sleep(30)
            tool.print_msg(str(e))
            tool.print_msg(url + " 访问超时,稍后重试")
            traceback.print_exc()
def set_proxy(ip, port):
    """Replace the shared connection pool with an HTTP proxy manager.

    HTTP_CONNECTION_POOL is rebound to a urllib3.ProxyManager for
    http://<ip>:<port>, created with retries=False, and a confirmation
    message is printed.
    """
    global HTTP_CONNECTION_POOL
    proxy_url = "http://%s:%s" % (ip, port)
    HTTP_CONNECTION_POOL = urllib3.ProxyManager(proxy_url, retries=False)
    tool.print_msg("设置代理成功")
# print_msg(url + " 访问超时,稍后重试") # # 链接被终止 # # if str(e).find("'Connection aborted.', error(10054,") >= 0: # # return ErrorResponse(-3) except Exception, e: if exception_return and str(e).find(exception_return) >= 0: return ErrorResponse(HTTP_RETURN_CODE_EXCEPTION_CATCH) elif str(e).find("EOF occurred in violation of protocol") >=0: time.sleep(30) tool.print_msg(str(e)) tool.print_msg(url + " 访问超时,稍后重试") traceback.print_exc() retry_count += 1 if retry_count >= HTTP_REQUEST_RETRY_COUNT: tool.print_msg("无法访问页面:" + url) return ErrorResponse(HTTP_RETURN_CODE_RETRY) # 随机生成一个合法的user agent def _random_user_agent(): # "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0" # "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36" firefox_version_max = 55 # https://zh.wikipedia.org/zh-cn/Google_Chrome chrome_version_list = ["51.0.2704", "52.0.2743", "53.0.2785", "54.0.2840", "55.0.2883", "56.0.2924", "57.0.2987", "58.0.3029", "59.0.3071", "60.0.3080"] windows_version_list = ["6.1", "6.3", "10.0"] browser_type = random.choice(["firefox", "chrome"]) os_type = random.choice(windows_version_list) if browser_type == "firefox":