def parse(self, html):
    """Scrape the auctions linked from a listing page and store the new ones.

    For every ``//dl/dd/a/@href`` link found in *html* the numeric id is
    extracted from the link. Ids for which ``check_auction`` returns a
    truthy value are skipped with an info log entry; for the others the
    detail page is downloaded from ``http://www.shjiapai.cn``, its fields
    are pulled out with regular expressions and the record is saved with
    ``Auction.insert_db``.

    A failure on one link (bad link format, request error, missing field,
    insert error) is logged and does not stop the remaining links.

    ``source`` and ``auction_type`` are not parameters of this method; they
    are looked up in the enclosing (module) scope.

    Args:
        html: Parsed listing page; only its ``xpath()`` method is used.
    """
    # Without a timeout a stalled server would block the crawl forever.
    request_timeout = 30  # seconds

    def capture(pattern, text):
        """Return group 1 of the first match of *pattern* in *text*.

        Raises AttributeError when the pattern does not match, which the
        per-link handler below turns into a log entry.
        """
        return re.search(pattern, text, re.S | re.M).group(1)

    for auction_url in html.xpath("//dl/dd/a/@href"):
        try:
            auction_id = re.search(r'id/(\d+)\.html', auction_url).group(1)

            # Check for an existing record before downloading the page, so
            # already-stored auctions cost no HTTP request.
            if check_auction(source=source, auction_id=auction_id):
                log.info("数据已存在")
                continue

            url = 'http://www.shjiapai.cn' + auction_url
            auction_res = requests.get(url, headers=self.headers,
                                       timeout=request_timeout)
            con = auction_res.text

            auction = Auction(source=source, auction_type=auction_type)
            auction.source_html = con
            auction.auction_id = auction_id
            auction.auction_name = capture(r'楼盘名称.*?">(.*?)</td', con)
            auction.city = '上海'
            auction.html_type = '房产'
            auction.start_auction_price = capture(r'预计售价.*?">(.*?)</td', con)
            auction.floor = capture(r'层.*?">(.*?)楼</td', con)
            auction.area = capture(r'户型面积.*?">(.*?)</td', con)
            auction.build_type = capture(r'物业类型.*?">(.*?)</td', con)
            auction.info = capture(r'其它.*?>(.*?)</div', con)
            auction.insert_db()
        except Exception as e:
            # Broad on purpose: one broken link must not abort the listing.
            # The cause is included so the failure can be diagnosed.
            log.error('{}解析失败,e="{}"'.format(auction_url, e))
def get_detail(self, id_, auction_time, html_type, auction_type, province, city, region):
    """Download one detail page, parse it and store the resulting auction.

    The page ``http://www1.rmfysszc.gov.cn/Handle/<id_>.shtml`` is parsed
    according to one of three layouts, selected by a marker string in the
    raw HTML:

    * the text ``GetRecord()`` is present,
    * otherwise the text ``bmnumber()`` is present,
    * otherwise a third layout addressed by absolute XPaths.

    Errors while requesting the page are logged as a request error; errors
    while parsing or inserting are logged as a parse error. In both cases
    nothing is inserted and nothing is raised to the caller.

    Args:
        id_: Page identifier (str); used in the URL and stored as
            ``auction.auction_id``.
        auction_time: Raw value passed to ``self.get_date(date=...)``.
        html_type: Stored unchanged on the record.
        auction_type: Passed to ``Auction(...)`` and stored on the record.
        province: Stored unchanged on the record.
        city: Stored unchanged on the record.
        region: Stored unchanged on the record.
    """
    # Without a timeout a stalled server would block the crawl forever.
    request_timeout = 30  # seconds
    # Cells of the two "bid" layouts look like "<label>: <value>".
    label_pattern = ': (.*?)$'

    auction = Auction(source=source, auction_type=auction_type)
    auction.html_type = html_type
    auction.auction_type = auction_type
    auction.province = province
    auction.city = city
    auction.region = region
    detail_url = 'http://www1.rmfysszc.gov.cn/Handle/' + id_ + '.shtml'

    def first_text(tree, path):
        """Return the first result of *path*; IndexError when there is none."""
        return tree.xpath(path)[0]

    def labelled_value(tree, path):
        """Return the part after ``': '`` of the first result of *path*."""
        return re.search(label_pattern, first_text(tree, path),
                         re.S | re.M).group(1)

    def flat_text(tree, path):
        """Evaluate a ``string(...)`` XPath and return its text."""
        # NOTE(review): the encode/decode round trip is kept from the
        # original; presumably it turns lxml's str subclass into a plain
        # str before storage -- confirm before removing.
        return tree.xpath(path).encode().decode()

    def optional_text(tree, path):
        """Best-effort variant of first_text: any failure yields None."""
        try:
            return first_text(tree, path)
        except Exception:
            return None

    def fill_bid_layout(tree, date_format, info_paths, lenient_assess):
        """Fill the record from the layout shared by both marker pages.

        date_format is the strptime format of the announcement date,
        info_paths the ``string(...)`` XPaths collected into ``info``.
        lenient_assess says whether an unparsable assess value becomes
        None instead of failing the whole record.
        """
        auction.auction_name = first_text(tree, '//div[@id="Title"]/h1/text()')
        auction.start_auction_price = self.get_float(
            first_text(tree, '//*[@id="price"]/div[1]/span/text()'))

        assess_text = first_text(
            tree, '//*[@id="bg1"]/div[1]/table/tr[1]/td/span[2]/text()')
        if lenient_assess:
            try:
                auction.assess_value = self.get_float(assess_text)
            except Exception:
                auction.assess_value = None
        else:
            auction.assess_value = self.get_float(assess_text)

        auction.earnest_money = self.get_float(first_text(
            tree, '//*[@id="bg1"]/div[1]/table/tr[2]/td/span[2]/text()'))
        auction.announcement_date = datetime.datetime.strptime(
            labelled_value(
                tree, '//*[@id="bg1"]/div[1]/table/tr[3]/td/span/text()'),
            date_format)
        auction.auction_level = labelled_value(
            tree, '//*[@id="bg1"]/div[1]/table/tr[4]/td/span/text()')
        auction.court = labelled_value(
            tree, '//*[@id="bg1"]/div[2]/table/tr[1]/td/span/text()')
        info_list = [flat_text(tree, path) for path in info_paths]
        auction.contacts = labelled_value(
            tree, '//*[@id="bg1"]/div[2]/table/tr[2]/td/span/text()')
        auction.phone_number = labelled_value(
            tree, '//*[@id="bg1"]/div[2]/table/tr[3]/td/span/text()')
        auction.info = info_list
        auction.build_type = optional_text(
            tree, '//*[@id="bdjs11"]/table[1]/tr[2]/td[4]/text()')

    def fill_notice_layout(tree):
        """Fill the record from the layout used when no marker is present."""
        auction.auction_name = first_text(
            tree, '//*[@id="xmgg"]/div/div[1]/text()')
        auction.assess_value = self.get_float(first_text(
            tree, '/html/body/div[6]/table/tr/td/ul/li[3]/span/text()'))

        date_text = first_text(
            tree, '/html/body/div[6]/table/tr/td/ul/li[2]/span/text()')
        try:
            auction.announcement_date = datetime.datetime.strptime(
                date_text, "%Y-%m-%d")
        except ValueError:
            # Some pages write the date with slashes instead of dashes.
            auction.announcement_date = datetime.datetime.strptime(
                date_text, "%Y/%m/%d")

        auction.court = first_text(
            tree, '/html/body/div[6]/table/tr/td/ul/li[1]/span/text()')
        info_list = [
            flat_text(tree, 'string(//*[@id="bdxx"]/div)'),
            flat_text(tree, 'string(//*[@id="tjzl"]/div/div[2])'),
        ]
        auction.contacts = first_text(
            tree, '/html/body/div[6]/table/tr/td/ul/li[4]/span/text()')
        auction.phone_number = first_text(
            tree, '/html/body/div[6]/table/tr/td/ul/li[5]/span/text()')
        auction.info = info_list
        auction.build_type = optional_text(
            tree, '//*[@id="bdxx"]/div/div[2]/table/tr[2]/td[3]/text()')

    # Step 1: download. Anything going wrong here is a request error.
    try:
        response = requests.get(detail_url, headers=self.headers,
                                timeout=request_timeout)
        html = response.content.decode()
        auction.source_html = html
    except Exception as e:
        log.error('详情页请求错误,url="{}",e="{}"'.format(detail_url, e))
        return

    # Step 2: parse and store. Any failure skips the record with a log entry.
    try:
        tree = etree.HTML(html)
        if 'GetRecord()' in html:
            fill_bid_layout(
                tree, "%Y.%m.%d",
                ('string(//*[@id="bdjs11"])', 'string(//*[@id="jjjl"])'),
                True)
        elif 'bmnumber()' in html:
            fill_bid_layout(
                tree, "%Y-%m-%d",
                ('string(//*[@id="bdjs"])',),
                False)
        else:
            fill_notice_layout(tree)

        auction.auction_id = id_
        auction.auction_time = self.get_date(date=auction_time)
        auction.insert_db()
    except Exception as e:
        log.error('解析错误,url="{}",e="{}"'.format(detail_url, e))