import random

import requests
from lxml import etree

# IpProxyPool and wrap() are assumed to be defined elsewhere in this project
# (comma() is defined in Example #2 below).


def parse_list(base_url):

    # Load the proxy pool and pick one proxy at random for this request
    proxies = IpProxyPool.get_proxy()
    proxy = random.choice(proxies)
    print(proxy)

    req = requests.get(base_url, proxies=proxy)
    html = req.content.decode('gbk')  # the listing pages are GBK-encoded
    html = etree.HTML(html)
    print(html)

    # The XPath pattern is incomplete on its own; build the full selector str0
    # for each listing. The dl ids run list_D03_01 .. list_D03_30, so the
    # index must be zero-padded to two digits.
    for i in range(30):
        str0 = '//div[@class="houseList"]/dl[@id="list_D03_%02d"]' % (i + 1)

        # Sub-pattern 1: the listing title
        house_description = html.xpath(str0 + '//p[@class="title"]//text()')[0]
        house_description = comma(house_description)

        # Sub-pattern 2: basic house information
        house = html.xpath(str0 + '//p[@class="mt12"]//text()')
        print(house, len(house))

        if len(house) == 7:
            # All four fields present: rooms, floor, orientation, building age
            room_num = comma(wrap(house[0]))
            house_floor = comma(wrap(house[2]))
            house_orientation = comma(wrap(house[4]))
            architectural_age = wrap(house[6]).split(':')[1]
            print(room_num, house_floor, house_orientation, architectural_age)

        elif len(house) == 5:
            # One field is missing; detect which one by its marker character:
            # '室' (rooms), '层' (floor), '向' (orientation)
            if u'室' not in house[0]:
                room_num = u'无'  # u'无' marks a missing value
                house_floor = wrap(house[0])
                house_orientation = wrap(house[2])
                architectural_age = wrap(house[4]).split(':')[1]
            elif u'层' not in house[2]:
                room_num = wrap(house[0])
                house_floor = u'无'
                house_orientation = wrap(house[2])
                architectural_age = wrap(house[4]).split(':')[1]
            elif u'向' not in house[4]:
                room_num = wrap(house[0])
                house_floor = wrap(house[2])
                house_orientation = u'无'
                architectural_age = wrap(house[4]).split(':')[1]
            else:
                room_num = wrap(house[0])
                house_floor = wrap(house[2])
                house_orientation = wrap(house[4])
                architectural_age = u'无'
            print(room_num, house_floor, house_orientation, architectural_age)

        else:
            # Fallback: assume only rooms and floor are present
            room_num = wrap(house[0])
            house_floor = wrap(house[2])
            house_orientation = u'无'
            architectural_age = u'无'
            print(room_num, house_floor, house_orientation, architectural_age)

        # Sub-pattern 3: location information
        addr = html.xpath(str0 + '//p[@class="mt10"]//text()')
        print(addr, len(addr))

        if len(addr) == 5:
            community_name = addr[1]
            house_address = addr[3]
            print(community_name, house_address)
            # The address has the form "district-street"; split the two parts
            house_area = house_address.split("-")[0]
            house_addr = house_address.split("-")[1]
            print(house_area, house_addr)

        # Sub-pattern 4: nearby subway station
        subway = html.xpath(str0 + '//div[contains(@class,"mt8")]//text()')
        print(subway, len(subway))

        if len(subway) == 8:
            subway_name = subway[4]
        elif len(subway) == 7:
            # a '满' entry appears to be an ownership-age tag ('满N年'),
            # not a station name
            subway_name = u'无' if u'满' in subway[3] else subway[3]
        elif len(subway) == 6:
            subway_name = u'无' if u'满' in subway[2] else subway[2]
        else:
            subway_name = u'无'
        print(subway_name)
        subway_name = comma(subway_name)

        # Sub-pattern 5: building area
        area = html.xpath(str0 + '//div[contains(@class,"area")]//text()')
        print(area, len(area))

        if len(area) == 5:
            building_area = area[1].replace('㎡', '')
            print('%s:' % area[3], building_area)

        # Sub-pattern 6: price information
        price = html.xpath(str0 + '//div[@class="moreInfo"]//text()')
        print(price, len(price))

        if len(price) == 8:
            setof_price = price[1]
            avg_price = price[4].replace(u'元', '')  # strip the '元' (yuan) unit
            print(setof_price, avg_price)
            # Append the record as one CSV row; GBK matches the site encoding
            with open('house_list1.csv', 'a+', encoding='gbk') as f:
                f.write(','.join([
                    house_description, room_num, house_floor,
                    house_orientation, architectural_age, community_name,
                    house_area, house_addr, subway_name, building_area,
                    setof_price, avg_price
                ]) + '\n')
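
# A minimal driver sketch for parse_list() above. The list-page URL pattern
# below is an assumption for illustration; the real Fang.com paging URLs may
# differ.
if __name__ == '__main__':
    for page in range(1, 4):
        base_url = 'https://esf.fang.com/house/i3%d/' % page  # assumed scheme
        parse_list(base_url)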
Example #2


# Replace both ASCII and full-width commas in a field with spaces so the value
# cannot break the comma-separated CSV rows.
def comma(data):
    return data.replace(',', ' ').replace('，', ' ')
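
# A quick sanity check for comma(); the sample string is made up:
#   comma('2室1厅，南向')  ->  '2室1厅 南向'
# Stripping commas keeps each field inside a single CSV column when
# parse_list() writes its rows.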


Example #3
    # Method from a proxy-crawler class; assumes re, random, requests and
    # IpProxyPool are imported at module level.
    def parse_list(self, ip_url='http://www.xicidaili.com/nn'):
        # Pick a random proxy from the existing pool for this request
        proxies = IpProxyPool.get_proxy()
        proxy = random.choice(proxies)
        print(proxy)
        # Fetch the xicidaili page that lists free proxies, rotating the
        # User-Agent header
        headers = {
            'User-Agent': random.choice(self.user_agent_list),
        }
        req = requests.get(ip_url, headers=headers, proxies=proxy)
        html = req.content.decode('utf-8')
        # re.S lets '.' match newlines, so each <tr>...</tr> row matches whole
        tr_pattern = re.compile(r'<tr.*?>.*?</tr>', re.S)
        tr_list = tr_pattern.findall(html)[1:]  # skip the table header row

        td_pattern = re.compile('<td>(.*?)</td>')
        info_pattern = re.compile(r'title="(.*?)".*?title="(.*?)"', re.S)
        for tr in tr_list:
            td_list = td_pattern.findall(tr)
            info_list = info_pattern.findall(tr)[0]

            speed = info_list[0].replace('秒', '')    # speed ('秒' = seconds)
            contime = info_list[1].replace('秒', '')  # connection time

            ip = td_list[0]
            port = td_list[1]
            contype = td_list[2]  # protocol type, e.g. 'HTTP' or 'HTTPS'
            alive = td_list[3]    # how long the proxy has stayed alive
            # Filter on speed, connection time, and alive time
            if float(speed) < 1 and float(contime) < 1 and '天' in alive:
                alive = alive.replace('天', '')  # '天' = days
                if int(alive) > 10:
                    print(ip, port, contype, alive, speed, contime)
                    # xicidaili lists the type as 'HTTP'/'HTTPS'; lower-case it
                    # for use as a requests proxies key and URL scheme
                    contype = contype.lower()
                    proxy = {contype: contype + '://' + ip + ':' + port}
                    print(proxy)
                    # Verify the proxy by requesting Baidu and checking the
                    # status code (urllib exposes .code, requests .status_code)
                    base_url = 'https://www.baidu.com/'
                    code = requests.get(
                        base_url, proxies=proxy,
                        headers=headers).status_code
                    print(code)
                    if code == 200:
                        # The proxy works: record it unless already in the file
                        with open('get_ip_list', 'r+') as f:
                            ips = f.readlines()
                            ip = ' '.join([ip, port, contype]) + '\n'
                            if ip in ips:
                                print('already recorded')
                            else:
                                print('new proxy')
                                f.write(ip)
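
    # A minimal usage sketch for the method above, left as comments because
    # the enclosing class is not shown here. The class name 'ProxyCrawler' and
    # its 'user_agent_list' attribute are assumptions for illustration only:
    #
    #     crawler = ProxyCrawler()  # hypothetical class holding this method
    #     crawler.parse_list()      # scrape xicidaili, record working proxies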