import random

import requests
from lxml import etree

# IpProxyPool and wrap() are assumed to be defined elsewhere in this project
# (comma() is defined in Example #2 below).


def parse_list(base_url):

    # Load the proxy pool and pick one proxy at random for this request
    proxies = IpProxyPool.get_proxy()
    proxy = random.choice(proxies)
    print(proxy)

    req = requests.get(base_url, proxies=proxy)
    html = req.content.decode('gbk')  # the listing pages are GBK-encoded
    html = etree.HTML(html)
    print(html)

    # The XPath pattern is incomplete on its own; build the full selector str0
    # for each listing. The dl ids run list_D03_01 .. list_D03_30, so the
    # index must be zero-padded to two digits.
    for i in range(30):
        str0 = '//div[@class="houseList"]/dl[@id="list_D03_%02d"]' % (i + 1)

        # Sub-pattern 1: the listing title
        house_description = html.xpath(str0 + '//p[@class="title"]//text()')[0]
        house_description = comma(house_description)

        # Sub-pattern 2: basic house information
        house = html.xpath(str0 + '//p[@class="mt12"]//text()')
        print(house, len(house))

        if len(house) == 7:
            # All four fields present: rooms, floor, orientation, building age
            room_num = comma(wrap(house[0]))
            house_floor = comma(wrap(house[2]))
            house_orientation = comma(wrap(house[4]))
            architectural_age = wrap(house[6]).split(':')[1]
            print(room_num, house_floor, house_orientation, architectural_age)

        elif len(house) == 5:
            # One field is missing; detect which one by its marker character:
            # '室' (rooms), '层' (floor), '向' (orientation)
            if u'室' not in house[0]:
                room_num = u'无'  # u'无' marks a missing value
                house_floor = wrap(house[0])
                house_orientation = wrap(house[2])
                architectural_age = wrap(house[4]).split(':')[1]
            elif u'层' not in house[2]:
                room_num = wrap(house[0])
                house_floor = u'无'
                house_orientation = wrap(house[2])
                architectural_age = wrap(house[4]).split(':')[1]
            elif u'向' not in house[4]:
                room_num = wrap(house[0])
                house_floor = wrap(house[2])
                house_orientation = u'无'
                architectural_age = wrap(house[4]).split(':')[1]
            else:
                room_num = wrap(house[0])
                house_floor = wrap(house[2])
                house_orientation = wrap(house[4])
                architectural_age = u'无'
            print(room_num, house_floor, house_orientation, architectural_age)

        else:
            # Fallback: assume only rooms and floor are present
            room_num = wrap(house[0])
            house_floor = wrap(house[2])
            house_orientation = u'无'
            architectural_age = u'无'
            print(room_num, house_floor, house_orientation, architectural_age)

        # Sub-pattern 3: location information
        addr = html.xpath(str0 + '//p[@class="mt10"]//text()')
        print(addr, len(addr))

        if len(addr) == 5:
            community_name = addr[1]
            house_address = addr[3]
            print(community_name, house_address)
            # The address has the form "district-street"; split the two parts
            house_area = house_address.split("-")[0]
            house_addr = house_address.split("-")[1]
            print(house_area, house_addr)

        # Sub-pattern 4: nearby subway station
        subway = html.xpath(str0 + '//div[contains(@class,"mt8")]//text()')
        print(subway, len(subway))

        if len(subway) == 8:
            subway_name = subway[4]
        elif len(subway) == 7:
            # a '满' entry appears to be an ownership-age tag ('满N年'),
            # not a station name
            subway_name = u'无' if u'满' in subway[3] else subway[3]
        elif len(subway) == 6:
            subway_name = u'无' if u'满' in subway[2] else subway[2]
        else:
            subway_name = u'无'
        print(subway_name)
        subway_name = comma(subway_name)

        # Sub-pattern 5: building area
        area = html.xpath(str0 + '//div[contains(@class,"area")]//text()')
        print(area, len(area))

        if len(area) == 5:
            building_area = area[1].replace('㎡', '')
            print('%s:' % area[3], building_area)

        # Sub-pattern 6: price information
        price = html.xpath(str0 + '//div[@class="moreInfo"]//text()')
        print(price, len(price))

        if len(price) == 8:
            setof_price = price[1]
            avg_price = price[4].replace(u'元', '')  # strip the '元' (yuan) unit
            print(setof_price, avg_price)
            # Append the record as one CSV row; GBK matches the site encoding
            with open('house_list1.csv', 'a+', encoding='gbk') as f:
                f.write(','.join([
                    house_description, room_num, house_floor,
                    house_orientation, architectural_age, community_name,
                    house_area, house_addr, subway_name, building_area,
                    setof_price, avg_price
                ]) + '\n')
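
# A minimal driver sketch for parse_list() above. The list-page URL pattern
# below is an assumption for illustration; the real Fang.com paging URLs may
# differ.
if __name__ == '__main__':
    for page in range(1, 4):
        base_url = 'https://esf.fang.com/house/i3%d/' % page  # assumed scheme
        parse_list(base_url)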
Example #2


# Replace both ASCII and full-width commas in a field with spaces so the value
# cannot break the comma-separated CSV rows.
def comma(data):
    return data.replace(',', ' ').replace('，', ' ')
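
# A quick sanity check for comma(); the sample string is made up:
#   comma('2室1厅，南向')  ->  '2室1厅 南向'
# Stripping commas keeps each field inside a single CSV column when
# parse_list() writes its rows.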


Example #3
    # Method from a proxy-crawler class; assumes re, random, requests and
    # IpProxyPool are imported at module level.
    def parse_list(self, ip_url='http://www.xicidaili.com/nn'):
        # Pick a random proxy from the existing pool for this request
        proxies = IpProxyPool.get_proxy()
        proxy = random.choice(proxies)
        print(proxy)
        # Fetch the xicidaili page that lists free proxies, rotating the
        # User-Agent header
        headers = {
            'User-Agent': random.choice(self.user_agent_list),
        }
        req = requests.get(ip_url, headers=headers, proxies=proxy)
        html = req.content.decode('utf-8')
        # re.S lets '.' match newlines, so each <tr>...</tr> row matches whole
        tr_pattern = re.compile(r'<tr.*?>.*?</tr>', re.S)
        tr_list = tr_pattern.findall(html)[1:]  # skip the table header row

        td_pattern = re.compile('<td>(.*?)</td>')
        info_pattern = re.compile(r'title="(.*?)".*?title="(.*?)"', re.S)
        for tr in tr_list:
            td_list = td_pattern.findall(tr)
            info_list = info_pattern.findall(tr)[0]

            speed = info_list[0].replace('秒', '')    # speed ('秒' = seconds)
            contime = info_list[1].replace('秒', '')  # connection time

            ip = td_list[0]
            port = td_list[1]
            contype = td_list[2]  # protocol type, e.g. 'HTTP' or 'HTTPS'
            alive = td_list[3]    # how long the proxy has stayed alive
            # Filter on speed, connection time, and alive time
            if float(speed) < 1 and float(contime) < 1 and '天' in alive:
                alive = alive.replace('天', '')  # '天' = days
                if int(alive) > 10:
                    print(ip, port, contype, alive, speed, contime)
                    # xicidaili lists the type as 'HTTP'/'HTTPS'; lower-case it
                    # for use as a requests proxies key and URL scheme
                    contype = contype.lower()
                    proxy = {contype: contype + '://' + ip + ':' + port}
                    print(proxy)
                    # Verify the proxy by requesting Baidu and checking the
                    # status code (urllib exposes .code, requests .status_code)
                    base_url = 'https://www.baidu.com/'
                    code = requests.get(
                        base_url, proxies=proxy,
                        headers=headers).status_code
                    print(code)
                    if code == 200:
                        # The proxy works: record it unless already in the file
                        with open('get_ip_list', 'r+') as f:
                            ips = f.readlines()
                            ip = ' '.join([ip, port, contype]) + '\n'
                            if ip in ips:
                                print('already recorded')
                            else:
                                print('new proxy')
                                f.write(ip)
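
    # A minimal usage sketch for the method above, left as comments because
    # the enclosing class is not shown here. The class name 'ProxyCrawler' and
    # its 'user_agent_list' attribute are assumptions for illustration only:
    #
    #     crawler = ProxyCrawler()  # hypothetical class holding this method
    #     crawler.parse_list()      # scrape xicidaili, record working proxies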