Ejemplo n.º 1
0
def parse_html(html):
    """Parse ``ip:port@...`` lines from the ``.cont`` element and store each one.

    The text of the ``.cont`` selection is split on newlines. For every line
    the part before the first ``@`` is split on ``:``; the first piece is kept
    as the IP and the second as the port. Each pair is serialised to JSON,
    passed to ``ro.setDic`` under a key from ``rc.getRandomCode()``, and
    logged.

    Lines without an ``ip:port`` pair (blank lines, or the single empty line
    produced when ``.cont`` has no text) are skipped instead of raising
    ``IndexError`` and aborting the rest of the page.

    Args:
        html: Markup handed to ``pq``.
    """
    doc = pq(html)
    info = doc(".cont").text()
    for line in info.split("\n"):
        address = line.split("@")[0].split(":")
        if len(address) < 2:
            # No "ip:port" on this line; skip it and keep processing the rest.
            continue
        # NOTE(review): the port is stored under the key "post" (sic). It is
        # kept unchanged here because code reading the stored JSON may look
        # up that exact spelling -- confirm before renaming it to "port".
        result_dic = {"ip": address[0], "post": address[1]}
        result = json.dumps(result_dic)
        ro.setDic(rc.getRandomCode(), result)
        lm.log_info(str(result_dic) + "已经存到Redis中")
Ejemplo n.º 2
0
def parse_html(html):
    """Store the proxy found in each ``#ip_list .odd`` element of the page.

    For every matched element, the text of its second ``td`` is taken as the
    IP and the text of its third ``td`` as the port. The pair is serialised
    to JSON, handed to ``ro.setDic`` under a key from ``rc.getRandomCode()``,
    and then logged.

    Args:
        html: Markup handed to ``pq``.
    """
    rows = pq(html)("#ip_list .odd")
    for row in rows:
        cells = pq(row)("td")
        proxy = {"ip": cells.eq(1).text(), "port": cells.eq(2).text()}
        payload = json.dumps(proxy)
        ro.setDic(rc.getRandomCode(), payload)
        lm.log_info(str(proxy) + "已经存到Redis中")
Ejemplo n.º 3
0
def get_html(intervel):
    """Fetch and parse the numbered listing pages from 2055 down to 2.

    Each page URL is ``Proxy_url + <page> + ".html"``. The page is downloaded
    with ``get_page_content`` and handed to ``parse_html``. A failure on one
    page is logged and the crawl moves on to the next page.

    Args:
        intervel: Seconds passed to ``time.sleep`` after each page that was
            fetched and parsed without raising.
    """
    # NOTE(review): the range stops before 1, so page 1 is never requested --
    # confirm that this is intended.
    for page in range(2055, 1, -1):
        try:
            new_url = Proxy_url + str(page) + ".html"
            lm.log_info("正在获取" + new_url + "的信息")
            html = get_page_content(new_url, None)
            parse_html(html)
            time.sleep(intervel)
        except Exception as e:
            # e.args can be empty; indexing it unguarded would raise
            # IndexError from inside this handler and end the whole crawl.
            lm.log_error(e.args[0] if e.args else repr(e))
Ejemplo n.º 4
0
def parse_html(html):
    """Store the proxy found in each ``.fl-table tbody tr`` row of the page.

    The text of the row's first ``td`` is split on ``:``; the first piece is
    kept as the IP and the second as the port. Each pair is serialised to
    JSON, passed to ``ro.setDic`` under a key from ``rc.getRandomCode()``,
    and logged.

    Rows whose first cell does not contain ``:`` are skipped instead of
    raising ``IndexError`` and aborting the remaining rows of the page.

    Args:
        html: Markup handed to ``pq``.
    """
    doc = pq(html)
    for row in doc(".fl-table tbody tr"):
        # Read the cell text once rather than querying it for each field.
        address = pq(row)("td").eq(0).text().split(":")
        if len(address) < 2:
            # No "ip:port" in this row; skip it and keep processing the rest.
            continue
        result_dic = {
            "ip": address[0],
            "port": address[1],
        }
        result = json.dumps(result_dic)
        ro.setDic(rc.getRandomCode(), result)
        lm.log_info(str(result_dic) + "已经存到Redis中")
Ejemplo n.º 5
0
def parse_html(html):
    """Store the proxy found in each ``#main table tr`` row of the page.

    The incoming text is first encoded as ISO-8859-1 and decoded as GBK
    (presumably to undo a wrong charset guess made when the page was
    downloaded -- confirm against the fetcher). For every row, the text of
    the first ``td`` is the IP and the text of the second ``td`` is the port.
    Rows whose first cell reads exactly ``"ip"`` are ignored; every other row
    is serialised to JSON, passed to ``ro.setDic`` under a key from
    ``rc.getRandomCode()``, and logged.

    Args:
        html: Page text; must be encodable as ISO-8859-1.
    """
    decoded = html.encode('iso-8859-1').decode('gbk')
    for row in pq(decoded)("#main table tr"):
        cells = pq(row)("td")
        entry = {"ip": cells.eq(0).text(), "port": cells.eq(1).text()}
        if entry["ip"] == "ip":
            # Row whose first cell is the literal column title.
            continue
        payload = json.dumps(entry)
        ro.setDic(rc.getRandomCode(), payload)
        lm.log_info(str(entry) + "已经存到Redis中")
Ejemplo n.º 6
0
def get_html(interval):
    """Fetch and parse listing pages 1 through ``parse_page_num(Proxy_url)``.

    Each page URL is ``Proxy_url + <page> + ".html"``. The page is downloaded
    with ``get_page_content`` and handed to ``parse_html``. A failure on one
    page is logged and the crawl moves on to the next page. The call to
    ``parse_page_num`` happens before the loop, so an exception from it
    propagates to the caller.

    Args:
        interval: Seconds passed to ``time.sleep`` after each page that was
            fetched and parsed without raising.
    """
    page_num = parse_page_num(Proxy_url)
    for page in range(1, page_num + 1):
        try:
            new_url = Proxy_url + str(page) + ".html"
            lm.log_info("正在获取" + new_url + "的信息")
            html = get_page_content(new_url, None)
            parse_html(html)
            time.sleep(interval)
        except Exception as e:
            # e.args can be empty; indexing it unguarded would raise
            # IndexError from inside this handler and end the whole crawl.
            lm.log_error(e.args[0] if e.args else repr(e))
Ejemplo n.º 7
0
def get_html(interval):
    """Fetch and parse pages 1..2000 under every entry of ``suffix``.

    For each ``suf`` in ``suffix`` the page URL is
    ``Proxy_url + "/" + suf + "/" + <page>``. The page is downloaded with
    ``get_page_content`` and handed to ``parse_html``. A failure on one page
    is logged and the crawl moves on to the next page.

    Args:
        interval: Seconds passed to ``time.sleep`` after each page that was
            fetched and parsed without raising.
    """
    for suf in suffix:
        url_joint = Proxy_url + "/" + suf
        for page in range(1, 2001):
            try:
                new_url = url_joint + "/" + str(page)
                lm.log_info("正在获取" + new_url + "的信息")
                html = get_page_content(new_url, None)
                parse_html(html)
                time.sleep(interval)
            except Exception as e:
                # e.args can be empty; indexing it unguarded would raise
                # IndexError from inside this handler and end the crawl.
                lm.log_error(e.args[0] if e.args else repr(e))
Ejemplo n.º 8
0
def get_page_content(url, proxy, max_retries=3):
    """Download ``url`` and return the response text, retrying on failure.

    The request is sent with the module-level ``headers`` and a 10 second
    timeout. On a non-200 status the next attempt uses a fresh proxy from
    ``gip.GetProxyIP()``; when an exception is raised the next attempt is
    made without a proxy.

    Previously the retries were recursive calls whose results were thrown
    away (so a successful retry still returned ``None``) and the exception
    path recursed without any bound. Retries are now a bounded loop and the
    first successful response is returned.

    Args:
        url: Address to download.
        proxy: Mapping passed as ``proxies=`` to ``requests.get``, or a falsy
            value to send the request directly.
        max_retries: Number of additional attempts after the first one.

    Returns:
        The response text of the first attempt that answers with status 200,
        or ``None`` when every attempt failed.
    """
    attempts_left = max_retries + 1
    while attempts_left > 0:
        attempts_left -= 1
        try:
            if proxy:
                r = requests.get(url, headers=headers, timeout=10, proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=10)
            if r.status_code == 200:
                lm.log_info("获取" + url + "的页面数据成功")
                return r.text
            lm.log_warning("获取" + url + "的页面数据失败 正在换代理获取重新获取。。。")
            # Non-200 answer: switch to another proxy for the next attempt.
            proxy = {
                "http": gip.GetProxyIP(),
            }
        except Exception:
            # Not a bare ``except:`` so KeyboardInterrupt/SystemExit still
            # stop the program.
            lm.log_error(url + "链接错误 取消此次链接")
            # Request (or proxy lookup) failed: next attempt goes direct.
            proxy = None
    return None
Ejemplo n.º 9
0
def GetProxyIP():
    """Pick a random stored proxy, probe it, and return it when reachable.

    A random field is chosen from ``ro.getDicKeys()`` and its JSON value is
    read from the ``proxy_ip`` hash. The proxy is probed with a 2 second
    Telnet connection attempt. A reachable proxy is logged, appended to
    ``可用代理.txt`` under ``get_desktop()``, and returned; an unreachable
    one is removed from the hash and logged.

    Changes from the previous version:
      * an unreachable proxy is removed with ``hdel`` on the ``proxy_ip``
        hash (``delete`` targeted a top-level key, so the entry stayed);
      * an empty key list returns ``None`` instead of raising ``ValueError``
        from ``random.sample``;
      * the port is read from ``"port"`` when present, else ``"post"``,
        because both spellings are written by the page parsers;
      * the probe connection is closed;
      * a failure to write the text file no longer marks the proxy as dead;
      * the proxy address is returned so callers can use it.

    Returns:
        ``"http://<ip>:<port>"`` for a reachable proxy, otherwise ``None``.
    """
    # Get the Redis client object.
    r = ro.getRedisObj()
    # Materialise the keys so emptiness can be checked before choosing.
    keys = list(ro.getDicKeys())
    if not keys:
        return None
    # Pick one entry at random.
    i = random.choice(keys)
    if not i:
        return None
    raw = r.hget("proxy_ip", i)
    if raw is None:
        # The field vanished between listing the keys and reading it.
        return None
    # The stored value is a JSON string describing the proxy.
    ip_info = json.loads(raw)
    ip = ip_info["ip"]
    port = ip_info["port"] if "port" in ip_info else ip_info["post"]
    try:
        # Probe the proxy; close the connection straight away.
        telnetlib.Telnet(ip, port, timeout=2).close()
    except Exception:
        # Proxy is unreachable: drop its field from the hash.
        r.hdel("proxy_ip", i)
        lm.log_info(ip + ":" + port + "监测为不可用代理,已删除")
        return None
    # Record the successful probe.
    lm.log_info(ip + ":" + port + "监测为可用代理")
    try:
        # Append the usable proxy to the text file on the desktop.
        with open(get_desktop() + "/可用代理.txt", 'a', encoding='utf-8') as f:
            f.write(ip + " " + port + "\r\n")
    except OSError as e:
        # The file is only a convenience copy; keep the working proxy.
        lm.log_info("可用代理写入文件失败: " + str(e))
    return "http://" + ip + ":" + port