Example #1
 # Needs at module level: import requests; from lxml import etree
 def get_all_url(self):
     for i in self.city_code:
         city, region_id = i.code.split(',')
         city_name = i.city
         province = i.province
         region = i.region
         # `map_item` was originally named `type`, which shadows the builtin.
         for map_item in self.map:
             url = 'https://' + city + '.51zhupai.com/' + map_item.code + '/' + region_id
             auction_type = map_item.auction_type
             html_type = map_item.html_type
             response = requests.get(url, headers=self.headers)
             html = response.text
             tree = etree.HTML(html)
             # The fourth "pageTotle" link carries the total page count.
             page = tree.xpath('//a[@class="pageTotle"][4]/text()')[0]
             for p in range(1, int(page) + 1):
                 page_url = url + 'n' + str(p)
                 res = requests.get(page_url, headers=self.headers)
                 html_ = res.text
                 tree_ = etree.HTML(html_)
                 url_list_ = tree_.xpath('//ul[contains(@class,"list_content_ul")]/li/a/@href')
                 for url_ in url_list_:
                     url_real = 'https://' + city + '.51zhupai.com' + url_
                     id_ = url_real.split('/')[-1]
                     exists = check_auction(source, id_)  # was misspelled `is_exies`
                     if exists:
                         log.info('id already exists, id="{}"'.format(id_))
                         continue
                     self.get_detail(url_real, city_name, auction_type, html_type, id_, province, region)
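
All four examples gate on check_auction(source, auction_id) before crawling a detail page, but that helper is not shown anywhere in the listing. A minimal sketch of what it could look like, assuming a SQLite table named auction with source and auction_id columns (the schema and file name are hypothetical):

 import sqlite3

 _conn = sqlite3.connect('auctions.db')  # hypothetical store

 def check_auction(source, auction_id):
     # True when this (source, auction_id) pair was already crawled.
     cur = _conn.execute(
         'SELECT 1 FROM auction WHERE source = ? AND auction_id = ? LIMIT 1',
         (source, str(auction_id)))
     return cur.fetchone() is not None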
Example #2
 # Needs at module level: import re, requests; `source` and `auction_type`
 # are assumed to be defined at module scope.
 def parse(self, html):
     auction_list = html.xpath("//dl/dd/a/@href")
     for auction_url in auction_list:
         try:
             url = 'http://www.shjiapai.cn' + auction_url
             auction_res = requests.get(url, headers=self.headers)
             con = auction_res.text
             auction_id = re.search(r'id/(\d+)\.html', auction_url).group(1)
             if not check_auction(source=source, auction_id=auction_id):
                 auction = Auction(source=source, auction_type=auction_type)
                 auction.source_html = con
                 auction.auction_id = auction_id
                 # The patterns below match the Chinese field labels on the page,
                 # e.g. 楼盘名称 = "property name", 预计售价 = "expected price".
                 auction.auction_name = re.search('楼盘名称.*?">(.*?)</td', con,
                                                  re.S | re.M).group(1)
                 auction.city = '上海'       # Shanghai
                 auction.html_type = '房产'  # real estate
                 auction.start_auction_price = re.search(
                     '预计售价.*?">(.*?)</td', con, re.S | re.M).group(1)
                 auction.floor = re.search('层.*?">(.*?)楼</td', con,
                                           re.S | re.M).group(1)
                 auction.area = re.search('户型面积.*?">(.*?)</td', con,
                                          re.S | re.M).group(1)
                 auction.build_type = re.search('物业类型.*?">(.*?)</td', con,
                                                re.S | re.M).group(1)
                 auction.info = re.search('其它.*?>(.*?)</div', con,
                                          re.S | re.M).group(1)
                 auction.insert_db()
             else:
                 log.info("record already exists")
         except Exception as e:
             log.error("failed to parse {}: {}".format(auction_url, e))
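
Every field in Example #2 is pulled with re.search(...).group(1), so a single missing label raises AttributeError and the outer except discards the whole record. A hedged alternative, using a hypothetical extract() helper that falls back to a default per field:

 import re

 def extract(pattern, text, default=''):
     # First capture group, or `default` when the label is absent,
     # so one missing field does not abort the record.
     match = re.search(pattern, text, re.S | re.M)
     return match.group(1).strip() if match else default

 # e.g. auction.floor = extract('层.*?">(.*?)楼</td', con)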
Example #3
 def html_fetch(self, max_page, province_name, city_name, type_name, auction_type):
     if max_page is not None:
         for i in range(1, max_page + 1):
             self.data['page'] = i  # the listing endpoint is paginated via a POST field
             res = requests.post(self.start_url, data=self.data, headers=self.headers)
             url_list = re.findall(r'index/index/info/biao_id/(.*?)"', res.text, re.S | re.M)
             for auction_id in url_list:
                 if not check_auction(source=source, auction_id=auction_id):
                     self.crawler_detail_page(auction_id, province_name, city_name, type_name, auction_type)
                 else:
                     log.info('already in the database')
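
html_fetch expects max_page to be computed elsewhere and skips the crawl when it is None. The original derivation is not shown; one plausible sketch, assuming the first listing page prints a total such as 共12页 ("12 pages in total"); both the pattern and the function are guesses:

 import re
 import requests

 def get_max_page(start_url, data, headers):
     # Hypothetical: read the total page count off the first listing page.
     res = requests.post(start_url, data=data, headers=headers)
     match = re.search(r'共(\d+)页', res.text)
     return int(match.group(1)) if match else None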
Example #4
 def id_check(self, auction_type, html_type, res):
     for i in res.json()['object']:
         try:
             auction_id = i['djlsh']  # the API's record-id field
             auction_url = 'http://api.faepai.com/index.php/Web/InterfaceV2/getObjectDetail?object_id=' + str(
                 auction_id)
             try:
                 auction_res = requests.get(auction_url,
                                            headers=self.headers)
             except requests.RequestException:  # was a bare except
                 log.error("request failed for {}".format(auction_url))
                 continue
             if not check_auction(source=source, auction_id=auction_id):
                 self.detail_parse(auction_res, auction_type, html_type,
                                   auction_id)
             else:
                 log.info("record already exists")
         except Exception as e:
             log.error("parse failed: {}".format(e))
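
Example #4 logs and skips a detail page as soon as one GET raises. If transient network errors matter, a small retry wrapper (not part of the original code) could stand in for the bare requests.get:

 import time
 import requests

 def get_with_retry(url, headers=None, retries=3, timeout=10):
     # Retry with exponential backoff; returns None once all attempts fail.
     for attempt in range(retries):
         try:
             return requests.get(url, headers=headers, timeout=timeout)
         except requests.RequestException:
             if attempt == retries - 1:
                 return None
             time.sleep(2 ** attempt)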