def parse_html(html):
    """Parse a proxy-list page whose ``.cont`` text holds "ip:port@..." lines,
    storing each proxy as a JSON dict in Redis.

    :param html: raw HTML of the listing page.
    """
    doc = pq(html)
    info = doc(".cont").text()
    for entry in info.split("\n"):
        # Each entry looks like "ip:port@...". split("\n") can yield empty or
        # malformed lines — skip them instead of raising IndexError, and split
        # only once instead of re-splitting for every field.
        addr = entry.split("@")[0]
        ip, sep, port = addr.partition(":")
        if not sep or not ip:
            continue
        # NOTE: the key is "post" (not "port") — the Redis consumer
        # (GetProxyIP) reads ip_info["post"], so the misspelling is part of
        # the stored-data contract and must not be "fixed" here alone.
        result_dic = {"ip": ip, "post": port}
        result = json.dumps(result_dic)
        ro.setDic(rc.getRandomCode(), result)
        lm.log_info(str(result_dic) + "已经存到Redis中")
def parse_html(html):
    """Extract ip/port pairs from the ``#ip_list`` table's ``.odd`` rows and
    push each one into Redis as a JSON-encoded dict.

    :param html: raw HTML of the listing page.
    """
    doc = pq(html)
    for row in doc("#ip_list .odd"):
        cells = pq(row)("td")
        record = {"ip": cells.eq(1).text(), "port": cells.eq(2).text()}
        payload = json.dumps(record)
        ro.setDic(rc.getRandomCode(), payload)
        lm.log_info(str(record) + "已经存到Redis中")
def get_html(intervel):
    """Crawl proxy-list pages from page 2055 down to page 2, parsing each one
    and sleeping ``intervel`` seconds between requests.

    :param intervel: pause in seconds between page fetches. (Name keeps the
        original misspelling so keyword callers are unaffected.)
    """
    for page in range(2055, 1, -1):
        try:
            new_url = Proxy_url + str(page) + ".html"
            lm.log_info("正在获取" + new_url + "的信息")
            html = get_page_content(new_url, None)
            parse_html(html)
            time.sleep(intervel)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty, which would
            # raise IndexError inside the handler and abort the whole crawl.
            lm.log_error(str(e))
            continue
def parse_html(html):
    """Parse ``.fl-table`` rows whose first cell is "ip:port" and store each
    proxy in Redis as a JSON dict.

    :param html: raw HTML of the listing page.
    """
    doc = pq(html)
    for row in doc(".fl-table tbody tr"):
        cell = pq(row)("td").eq(0).text()
        # Split once with partition; skip rows that lack an "ip:port" shape
        # instead of crashing with IndexError on split(":")[1].
        ip, sep, port = cell.partition(":")
        if not sep:
            continue
        result_dic = {"ip": ip, "port": port}
        result = json.dumps(result_dic)
        ro.setDic(rc.getRandomCode(), result)
        lm.log_info(str(result_dic) + "已经存到Redis中")
def parse_html(html):
    """Re-decode a GBK page (fetched as latin-1 text) and store every
    non-header ip/port pair from the ``#main`` table into Redis.

    :param html: page text as decoded by requests (mojibake latin-1).
    """
    # The body was decoded as iso-8859-1 by the fetcher; round-trip the bytes
    # to recover the real GBK text before parsing.
    decoded = html.encode('iso-8859-1').decode('gbk')
    doc = pq(decoded)
    for row in doc("#main table tr"):
        cells = pq(row)("td")
        record = {"ip": cells.eq(0).text(), "port": cells.eq(1).text()}
        if record["ip"] == "ip":
            # Header row — skip it.
            continue
        payload = json.dumps(record)
        ro.setDic(rc.getRandomCode(), payload)
        lm.log_info(str(record) + "已经存到Redis中")
def get_html(interval):
    """Crawl every page of the proxy site (page count discovered via
    ``parse_page_num``), parsing each page and pausing between requests.

    :param interval: pause in seconds between page fetches.
    """
    page_num = parse_page_num(Proxy_url)
    for i in range(page_num):
        try:
            new_url = Proxy_url + str(i + 1) + ".html"
            lm.log_info("正在获取" + new_url + "的信息")
            html = get_page_content(new_url, None)
            parse_html(html)
            time.sleep(interval)
        except Exception as e:
            # str(e), not e.args[0]: exceptions with empty args would raise
            # IndexError here and kill the crawl loop.
            lm.log_error(str(e))
            continue
def get_html(interval):
    """For each path segment in ``suffix``, crawl pages 1..2000 under
    ``Proxy_url/<suffix>/<page>``, parsing each page with a pause in between.

    :param interval: pause in seconds between page fetches.
    """
    for suf in suffix:
        url_joint = Proxy_url + "/" + suf
        for i in range(2000):
            try:
                new_url = url_joint + "/" + str(i + 1)
                lm.log_info("正在获取" + new_url + "的信息")
                html = get_page_content(new_url, None)
                parse_html(html)
                time.sleep(interval)
            except Exception as e:
                # str(e) avoids IndexError when e.args is empty.
                lm.log_error(str(e))
                continue
def get_page_content(url, proxy, retries=3):
    """Fetch ``url`` and return its body text, optionally through ``proxy``.

    On a non-200 status the fetch is retried through a fresh proxy from the
    pool; on a request error it is retried without a proxy.

    :param url: page to fetch.
    :param proxy: requests-style proxies dict, or None for a direct fetch.
    :param retries: new keyword arg (default keeps old call sites working) —
        bounds the retry recursion, which was previously unbounded.
    :return: page text on success, None once retries are exhausted.
    """
    if retries <= 0:
        lm.log_error(url + "链接错误 取消此次链接")
        return None
    try:
        if proxy:
            r = requests.get(url, headers=headers, timeout=10, proxies=proxy)
        else:
            r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            lm.log_info("获取" + url + "的页面数据成功")
            return r.text
        lm.log_warning("获取" + url + "的页面数据失败 正在换代理获取重新获取。。。")
        proxy_list = {
            "http": gip.GetProxyIP(),
        }
        # The original dropped the recursive result, so a successful retry
        # still returned None to the caller — propagate it instead.
        return get_page_content(url, proxy_list, retries - 1)
    except requests.RequestException:
        # Narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit.
        lm.log_error(url + "链接错误 取消此次链接")
        return get_page_content(url, None, retries - 1)
def GetProxyIP():
    """Pick a random proxy from the Redis "proxy_ip" hash, health-check it via
    Telnet, and record the result.

    :return: the proxy as "ip:port" when reachable, otherwise None. (The
        original always fell through returning None, so get_page_content's
        retry received a useless ``{"http": None}`` proxy dict.)
    """
    # Get the Redis connection object.
    r = ro.getRedisObj()
    keys = ro.getDicKeys()
    if not keys:
        # Empty pool: random.sample would raise ValueError on an empty set.
        return None
    # Randomly pick one stored proxy key.
    i = random.sample(keys, 1)[0]
    if not i:
        return None
    # The stored value is a JSON string of {"ip": ..., "post": ...}
    # ("post" is the pool writer's spelling of "port").
    ip_info = json.loads(r.hget("proxy_ip", i))
    ip, port = ip_info["ip"], ip_info["post"]
    try:
        # Probe reachability with a short-lived Telnet connection.
        conn = telnetlib.Telnet(ip, port, timeout=2)
    except Exception:
        # Unreachable proxy: remove the field from the hash. The original
        # called r.delete(i), which deletes a *top-level* key named `i`,
        # while the entry actually lives inside the "proxy_ip" hash.
        r.hdel("proxy_ip", i)
        lm.log_info(ip + ":" + port + "监测为不可用代理,已删除")
        return None
    else:
        # Close the probe connection — the original leaked it.
        conn.close()
        lm.log_info(ip + ":" + port + "监测为可用代理")
        # Append the working proxy to a text file on the desktop.
        with open(get_desktop() + "/可用代理.txt", 'a', encoding='utf-8') as f:
            f.write(ip + " " + port + "\r\n")
        # NOTE(review): callers feed this into a requests proxies dict; a
        # scheme prefix (e.g. "http://ip:port") may be expected — confirm.
        return ip + ":" + port