# Module-level dependencies assumed by the crawler methods below:
#   import re, requests
#   from lxml import etree
# plus the project's own helpers: check_auction, Auction, log and the
# per-spider constant `source`.

def get_all_url(self):
    # Walk every configured city and auction type, then page through the
    # list pages and hand each new detail URL to get_detail().
    for i in self.city_code:
        city, region_id = i.code.split(',')
        city_name = i.city
        province = i.province
        region = i.region
        for type_ in self.map:  # `type_` avoids shadowing the builtin
            url = 'https://' + city + '.51zhupai.com/' + type_.code + '/' + region_id
            auction_type = type_.auction_type
            html_type = type_.html_type
            response = requests.get(url, headers=self.headers)
            tree = etree.HTML(response.text)
            # The fourth "pageTotle" anchor carries the total page count.
            page = tree.xpath('//a[@class="pageTotle"][4]/text()')[0]
            for p in range(1, int(page) + 1):
                page_url = url + 'n' + str(p)
                res = requests.get(page_url, headers=self.headers)
                tree_ = etree.HTML(res.text)
                url_list_ = tree_.xpath('//ul[contains(@class,"list_content_ul")]/li/a/@href')
                for url_ in url_list_:
                    url_real = 'https://' + city + '.51zhupai.com' + url_
                    id_ = url_real.split('/')[-1]
                    if check_auction(source, id_):
                        log.info('id already exists, id="{}"'.format(id_))
                        continue
                    self.get_detail(url_real, city_name, auction_type, html_type, id_, province, region)
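# `check_auction` only appears at call sites here; it is imported from
# elsewhere in the project. A minimal sketch of the dedup contract the
# callers assume, NOT the real implementation (table and column names
# are hypothetical):
def check_auction_sketch(conn, source, auction_id):
    """Return True when (source, auction_id) is already stored."""
    row = conn.execute(
        'SELECT 1 FROM auction WHERE source = ? AND auction_id = ? LIMIT 1',
        (source, str(auction_id)),
    ).fetchone()
    return row is not None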
def parse(self, html):
    # Extract every auction detail link from the list page, fetch it and
    # scrape the fields out of the raw HTML with non-greedy regexes.
    auction_list = html.xpath("//dl/dd/a/@href")
    for auction_url in auction_list:
        try:
            url = 'http://www.shjiapai.cn' + auction_url
            auction_res = requests.get(url, headers=self.headers)
            con = auction_res.text
            auction_id = re.search(r'id/(\d+)\.html', auction_url).group(1)
            if not check_auction(source=source, auction_id=auction_id):
                # `source` and `auction_type` come from the enclosing module scope.
                auction = Auction(source=source, auction_type=auction_type)
                auction.source_html = con
                auction.auction_id = auction_id
                auction.auction_name = re.search('楼盘名称.*?">(.*?)</td', con, re.S | re.M).group(1)
                auction.city = '上海'
                auction.html_type = '房产'
                auction.start_auction_price = re.search('预计售价.*?">(.*?)</td', con, re.S | re.M).group(1)
                auction.floor = re.search('层.*?">(.*?)楼</td', con, re.S | re.M).group(1)
                auction.area = re.search('户型面积.*?">(.*?)</td', con, re.S | re.M).group(1)
                auction.build_type = re.search('物业类型.*?">(.*?)</td', con, re.S | re.M).group(1)
                auction.info = re.search('其它.*?>(.*?)</div', con, re.S | re.M).group(1)
                auction.insert_db()
            else:
                log.info("record already exists")
        except Exception as e:
            log.error("failed to parse {}: {}".format(auction_url, e))
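# parse() calls .group(1) on every re.search result, so one missing field
# aborts the whole record via the outer try/except. A hedged alternative
# that degrades per field instead (helper name is mine, not from the
# original code):
def search_group(pattern, text, default=''):
    """Return group(1) of the first match of `pattern`, else `default`."""
    m = re.search(pattern, text, re.S | re.M)
    return m.group(1).strip() if m else default

# e.g. auction.area = search_group('户型面积.*?">(.*?)</td', con)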
def html_fetch(self, max_page, province_name, city_name, type_name, auction_type):
    # Page through the POST-driven list endpoint; each page embeds the
    # auction ids in its detail links.
    if max_page is not None:
        for i in range(1, max_page + 1):
            self.data['page'] = i
            res = requests.post(self.start_url, data=self.data, headers=self.headers)
            url_list = re.findall('index/index/info/biao_id/(.*?)"', res.text, re.S | re.M)
            for auction_id in url_list:
                if not check_auction(source=source, auction_id=auction_id):
                    self.crawler_detail_page(auction_id, province_name, city_name, type_name, auction_type)
                else:
                    log.info('record already exists in the database')
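# html_fetch() expects the caller to supply max_page. A hedged sketch of
# deriving it from the first list response; the pagination marker regex
# is an assumption, not taken from the real site:
def get_max_page(self):
    res = requests.post(self.start_url, data=self.data, headers=self.headers)
    m = re.search(r'共(\d+)页', res.text)  # hypothetical "N pages total" marker
    return int(m.group(1)) if m else None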
def id_check(self, auction_type, html_type, res):
    # Walk the JSON object list and fetch the detail API for every id
    # that is not stored yet.
    for i in res.json()['object']:
        try:
            auction_id = i['djlsh']
            auction_url = ('http://api.faepai.com/index.php/Web/InterfaceV2/'
                           'getObjectDetail?object_id=' + str(auction_id))
            try:
                auction_res = requests.get(auction_url, headers=self.headers)
            except requests.RequestException:
                log.error("request failed: {}".format(auction_url))
                continue
            if not check_auction(source=source, auction_id=auction_id):
                self.detail_parse(auction_res, auction_type, html_type, auction_id)
            else:
                log.info("record already exists")
        except Exception as e:
            log.error("failed to parse: {}".format(e))
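# All four crawlers issue bare requests.get/post calls with self.headers
# and no retry policy. A hedged sketch of a shared session with retries
# that such calls could be routed through (illustrative only; the project
# may already handle retries elsewhere):
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def build_session(headers):
    session = requests.Session()
    session.headers.update(headers)
    retry = Retry(total=3, backoff_factor=0.5,
                  status_forcelist=(500, 502, 503, 504))
    session.mount('http://', HTTPAdapter(max_retries=retry))
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session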