import time

import requests


def get_html(interval):
    # Walk the listing pages from 2055 down to page 2, parsing each one.
    for i in range(2055, 1, -1):
        try:
            new_url = Proxy_url + str(i) + ".html"
            lm.log_info("Fetching " + new_url)
            html = get_page_content(new_url, None)
            parse_html(html)
            time.sleep(interval)
        except Exception as e:
            lm.log_error(str(e))  # e.args may be empty; str(e) is always safe
            continue
def get_html(interval):
    # Variant: read the total page count from the site's pagination first.
    page_num = parse_page_num(Proxy_url)
    for i in range(page_num):
        try:
            new_url = Proxy_url + str(i + 1) + ".html"
            lm.log_info("Fetching " + new_url)
            html = get_page_content(new_url, None)
            parse_html(html)
            time.sleep(interval)
        except Exception as e:
            lm.log_error(str(e))
            continue
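# parse_page_num is called above but not defined in these snippets. A minimal
# sketch, assuming the listing's pagination links look like "...<n>.html" --
# the regex below is a guess and must be adapted to the target site's markup:
import re


def parse_page_num(base_url):
    # Fetch the first listing page and take the largest page number that
    # appears in it; fall back to a single page if nothing matches.
    html = get_page_content(base_url + "1.html", None) or ""
    nums = re.findall(r"(\d+)\.html", html)
    return max(int(n) for n in nums) if nums else 1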
def get_html(interval):
    # Variant: crawl every category suffix, up to 2000 pages per category.
    for suf in suffix:
        url_joint = Proxy_url + "/" + suf
        for i in range(2000):
            try:
                new_url = url_joint + "/" + str(i + 1)
                lm.log_info("Fetching " + new_url)
                html = get_page_content(new_url, None)
                parse_html(html)
                time.sleep(interval)
            except Exception as e:
                lm.log_error(str(e))
                continue
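# All three get_html variants above repeat the same fetch-parse-sleep-log
# loop and differ only in how page URLs are produced. A sketch of one way to
# collapse that duplication (not part of the original code): feed a single
# driver from a URL generator.
def crawl(urls, interval):
    for new_url in urls:
        try:
            lm.log_info("Fetching " + new_url)
            parse_html(get_page_content(new_url, None))
            time.sleep(interval)
        except Exception as e:
            lm.log_error(str(e))


# Example: the countdown variant from the first snippet becomes
# crawl((Proxy_url + str(i) + ".html" for i in range(2055, 1, -1)), interval)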
def get_page_content(url, proxy):
    try:
        if proxy:
            r = requests.get(url, headers=headers, timeout=10, proxies=proxy)
        else:
            r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            lm.log_info("Fetched page data from " + url)
            return r.text
        # Non-200 status: fetch a fresh proxy and retry. Return the retry's
        # result so the caller actually receives the page text.
        lm.log_warning("Failed to fetch " + url + ", retrying through a new proxy...")
        proxy_list = {
            "http": gip.GetProxyIP(),
        }
        return get_page_content(url, proxy_list)
    except requests.exceptions.RequestException:
        # Connection-level failure: retry without a proxy. Note this
        # recursion is unbounded if the URL never succeeds.
        lm.log_error("Request to " + url + " failed, retrying without a proxy")
        return get_page_content(url, None)
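# The snippets assume several module-level names defined elsewhere in the
# script: Proxy_url, headers, suffix, the lm logging helper, the gip proxy
# pool, and parse_html. A hedged sketch of that scaffolding, with placeholder
# values, just to make the functions above runnable end to end:
import logging

logging.basicConfig(level=logging.INFO)
_log = logging.getLogger("proxy-spider")


class lm:  # stand-in for the logging helper imported as `lm`
    log_info = staticmethod(_log.info)
    log_warning = staticmethod(_log.warning)
    log_error = staticmethod(_log.error)


class gip:  # stand-in for the proxy pool; GetProxyIP() should return "ip:port"
    @staticmethod
    def GetProxyIP():
        return "127.0.0.1:8888"  # placeholder address, not a real proxy


Proxy_url = "https://example.com/proxylist/"  # hypothetical listing URL prefix
headers = {"User-Agent": "Mozilla/5.0"}       # minimal request headers
suffix = ["nn", "wn"]                         # category suffixes (guesses)


def parse_html(html):
    pass  # the real parser extracts proxy entries from the page


if __name__ == "__main__":
    get_html(2)  # crawl with a 2-second pause between page requests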