import random
import re

import requests
from lxml import etree

import IpProxyPool  # project-local proxy pool module


def parse_list(base_url):
    # Load the proxy pool and pick one proxy at random
    proxies = IpProxyPool.get_proxy()
    proxy = random.choice(proxies)
    print(proxy)
    req = requests.get(base_url, proxies=proxy)
    html = req.content.decode('gbk')
    html = etree.HTML(html)
    print(html)
    # Each listing needs its own XPath base rule; str0 is the assembled
    # rule for the i-th entry. IDs run list_D03_01 .. list_D03_30,
    # zero-padded to two digits.
    for i in range(30):
        str0 = '//div[@class="houseList"]/dl[@id="list_D03_%02d"]' % (i + 1)
        # Defaults, so the CSV row below never hits an undefined name
        community_name = house_area = house_addr = u'无'
        building_area = setof_price = avg_price = u'无'
        # Sub-rule 1: the listing title
        house_description = html.xpath(str0 + '//p[@class="title"]//text()')[0]
        house_description = comma(house_description)
        # Sub-rule 2: house info (rooms, floor, orientation, building age)
        house = html.xpath(str0 + '//p[@class="mt12"]//text()')
        print(house, len(house))
        if len(house) == 7:
            room_num = comma(wrap(house[0]))
            house_floor = comma(wrap(house[2]))
            house_orientation = comma(wrap(house[4]))
            architectural_age = wrap(house[6]).split(':')[1]
        elif len(house) == 5:
            # One of the four fields is missing; detect which by keyword
            if u'室' not in house[0]:
                room_num = u'无'
                house_floor = wrap(house[0])
                house_orientation = wrap(house[2])
                architectural_age = wrap(house[4]).split(':')[1]
            elif u'层' not in house[2]:
                room_num = wrap(house[0])
                house_floor = u'无'
                house_orientation = wrap(house[2])
                architectural_age = wrap(house[4]).split(':')[1]
            elif u'向' not in house[4]:
                room_num = wrap(house[0])
                house_floor = wrap(house[2])
                house_orientation = u'无'
                architectural_age = wrap(house[4]).split(':')[1]
            else:
                room_num = wrap(house[0])
                house_floor = wrap(house[2])
                house_orientation = wrap(house[4])
                architectural_age = u'无'
        else:
            room_num = wrap(house[0])
            house_floor = wrap(house[2])
            house_orientation = u'无'
            architectural_age = u'无'
        print(room_num, house_floor, house_orientation, architectural_age)
        # Sub-rule 3: listing location (community, district, street)
        addr = html.xpath(str0 + '//p[@class="mt10"]//text()')
        print(addr, len(addr))
        if len(addr) == 5:
            community_name = addr[1]
            house_address = addr[3]
            print(community_name, house_address)
            # District and street the house belongs to
            house_area = house_address.split('-')[0]
            house_addr = house_address.split('-')[1]
            print(house_area, house_addr)
        # Sub-rule 4: nearby subway station; an entry containing '满'
        # is a tax tag rather than a station name
        subway = html.xpath(str0 + '//div[contains(@class,"mt8")]//text()')
        print(subway, len(subway))
        if len(subway) == 8:
            subway_name = subway[4]
        elif len(subway) == 7:
            subway_name = u'无' if u'满' in subway[3] else subway[3]
        elif len(subway) == 6:
            subway_name = u'无' if u'满' in subway[2] else subway[2]
        else:
            subway_name = u'无'
        print(subway_name)
        subway_name = comma(subway_name)
        # Sub-rule 5: building area in square metres
        area = html.xpath(str0 + '//div[contains(@class,"area")]//text()')
        print(area, len(area))
        if len(area) == 5:
            building_area = area[1].replace(u'㎡', '')
            print('%s:' % area[3], building_area)
        # Sub-rule 6: total and per-square-metre prices
        price = html.xpath(str0 + '//div[@class="moreInfo"]//text()')
        print(price, len(price))
        if len(price) == 8:
            setof_price = price[1]
            avg_price = price[4].replace(u'元', '')
            print(setof_price, avg_price)
        # Append one CSV row per listing; the with block closes the file
        with open('house_list1.csv', 'a+', encoding='gbk') as f:
            f.write(','.join([
                house_description, room_num, house_floor,
                house_orientation, architectural_age, community_name,
                house_area, house_addr, subway_name, building_area,
                setof_price, avg_price
            ]) + '\n')
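# parse_list handles a single listing page, so a small driver is needed to
# walk the paginated results. A minimal sketch: the listing root URL and the
# 'i3<n>/' page-suffix scheme below are assumptions about the target site,
# not something taken from the code above.
import time

if __name__ == '__main__':
    base = 'https://esf.fang.com/house/'    # hypothetical listing root
    for page in range(1, 4):
        parse_list(base + 'i3%d/' % page)   # hypothetical page suffix
        time.sleep(2)                       # pause between requests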
# Strip whitespace and line breaks around a matched text node
# (the body is elided in the source; strip() is one plausible implementation)
def wrap(data):
    data = data.strip()
    return data


# Replace commas (full-width and ASCII) so they cannot break the CSV columns
def comma(data):
    if u',' in data:
        data = data.replace(u',', ' ')
    elif ',' in data:
        data = data.replace(',', ' ')
    return data
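# The comma() helper exists only because rows are assembled with ','.join();
# the stdlib csv module quotes embedded commas by itself, which would make
# the helper unnecessary. A sketch of that alternative, not the script's
# actual approach (write_row is a hypothetical name):
import csv

def write_row(row):
    with open('house_list1.csv', 'a+', encoding='gbk', newline='') as f:
        csv.writer(f).writerow(row)

# Fields with embedded commas survive intact because csv quotes them:
# write_row([u'示例标题, 带逗号', u'3室2厅', u'高层', u'南北', '2010'])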
def parse_list(self, ip_url='http://www.xicidaili.com/nn'):
    # Use a proxy from the existing pool to fetch the xicidaili
    # free-proxy listing, with a randomly chosen User-Agent
    proxies = IpProxyPool.get_proxy()
    proxy = random.choice(proxies)
    print(proxy)
    headers = {
        'User-Agent': random.choice(self.user_agent_list),
    }
    req = requests.get(ip_url, headers=headers, proxies=proxy)
    html = req.content.decode('utf-8')
    tr_pattern = re.compile(r'<tr.*?>.*?</tr>', re.S)  # re.S lets . match newlines
    tr_list = tr_pattern.findall(html)[1:]  # skip the table header row
    candidates = []  # candidates that pass the filters below
    for tr in tr_list:
        td_pattern = re.compile('<td>(.*?)</td>')
        info_pattern = re.compile(r'title="(.*?)".*?title="(.*?)"', re.S)
        td_list = td_pattern.findall(tr)
        info_list = info_pattern.findall(tr)[0]
        speed = info_list[0].replace(u'秒', '')    # connection speed
        contime = info_list[1].replace(u'秒', '')  # connection time
        ip = td_list[0]
        port = td_list[1]
        contype = td_list[2]
        alive = td_list[3]
        # Filter on speed, connection time, and days alive
        if float(speed) < 1 and float(contime) < 1 and u'天' in alive:
            alive = alive.replace(u'天', '')
            if int(alive) > 10:
                print(ip, port, contype, alive, speed, contime)
                candidates.append((ip, port, contype))
    # Verify each candidate by requesting Baidu and checking the status code
    # (with urllib the attribute is .code; with requests it is .status_code)
    base_url = 'https://www.baidu.com/'
    valid_proxies = []
    for ip, port, contype in candidates:
        scheme = contype.lower()  # requests expects lowercase scheme keys
        proxy = {scheme: scheme + '://' + ip + ':' + port}
        print(proxy)
        code = requests.get(base_url, proxies=proxy, headers=headers).status_code
        print(code)
        if code == 200:
            # Usable proxy: keep it, and persist it unless already stored
            valid_proxies.append(proxy)
            with open('get_ip_list', 'r+') as f:  # file must already exist
                ips = f.readlines()
                line = ' '.join([ip, port, contype]) + '\n'
                if line in ips:
                    print(u'重复')    # already on file
                else:
                    print(u'不重复')  # new entry, append it
                    f.write(line)
        # otherwise the proxy failed verification; skip it
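# Sketch of the consuming side: a reader that turns the "ip port contype"
# lines written to get_ip_list above back into requests-style proxy dicts.
# This is a guess at what IpProxyPool.get_proxy() does, based only on the
# file format above, not its actual implementation.
def get_proxy():
    proxies = []
    with open('get_ip_list', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) != 3:
                continue  # skip malformed lines
            ip, port, contype = parts
            scheme = contype.lower()  # requests expects lowercase scheme keys
            proxies.append({scheme: scheme + '://' + ip + ':' + port})
    return proxies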