def detail_parse(auction_res, auction_type, html_type, auction_id):
    """Parse one auction-detail JSON response and persist it to the DB.

    Args:
        auction_res: HTTP response whose body is the auction detail JSON.
        auction_type: auction category tag stored on the record.
        html_type: object kind ('房产' = housing, '土地' = land) selecting
            which detail fields are extracted.
        auction_id: site-local identifier of the auction lot.
    """
    con = auction_res.json()
    auction = Auction(source=source, auction_type=auction_type)
    auction.source_html = con
    auction.html_type = html_type
    auction.auction_id = auction_id
    auction.auction_name = con['object_title']
    auction.start_auction_price = con['start_price']
    auction.assess_value = con['appraise_price']
    auction.earnest_money = con['bond_price']
    auction.court = con['court_name']
    auction_time = con['start_time']
    location = con['location']
    auction.auction_time = datetime.datetime.strptime(
        auction_time, "%Y-%m-%d %H:%M:%S")
    # BUG FIX: strict 3-way unpacking raised ValueError whenever the site
    # emitted fewer or more than 3 space-separated location levels.
    # Pad missing levels with '' and ignore any extras.
    parts = location.split(' ')
    parts += [''] * (3 - len(parts))
    province, city, region = parts[:3]
    auction.province = province
    auction.city = city
    auction.region = region
    if html_type == '房产':
        auction.floor = con['detail']['house_floor']
        auction.area = con['detail']['gross_floor_area']
    elif html_type == '土地':
        auction.area = con['detail']['l_land_area']
    auction.insert_db()
def parse(self, html):
    """Walk a listing page and store every auction lot not yet in the DB.

    Args:
        html: lxml-parsed listing page; lot links are taken from
            ``//dl/dd/a/@href``.
    """
    auction_list = html.xpath("//dl/dd/a/@href")
    for auction_url in auction_list:
        try:
            url = 'http://www.shjiapai.cn' + auction_url
            auction_res = requests.get(url, headers=self.headers)
            con = auction_res.text
            # Raw string: '\d' in a plain literal is an invalid escape.
            auction_id = re.search(r'id/(\d+).html', auction_url).group(1)
            if not check_auction(source=source, auction_id=auction_id):
                # NOTE(review): auction_type is not defined in this scope;
                # presumably a module-level constant — confirm.
                auction = Auction(source=source, auction_type=auction_type)
                auction.source_html = con
                auction.auction_id = auction_id
                auction.auction_name = re.search('楼盘名称.*?">(.*?)</td', con,
                                                 re.S | re.M).group(1)
                auction.city = '上海'
                auction.html_type = '房产'
                auction.start_auction_price = re.search(
                    '预计售价.*?">(.*?)</td', con, re.S | re.M).group(1)
                auction.floor = re.search('层.*?">(.*?)楼</td', con,
                                          re.S | re.M).group(1)
                auction.area = re.search('户型面积.*?">(.*?)</td', con,
                                         re.S | re.M).group(1)
                auction.build_type = re.search('物业类型.*?">(.*?)</td', con,
                                               re.S | re.M).group(1)
                auction.info = re.search('其它.*?>(.*?)</div', con,
                                         re.S | re.M).group(1)
                auction.insert_db()
            else:
                log.info("数据已存在")
        except Exception as e:
            # BUG FIX: the exception detail was bound but never logged,
            # making parse failures undiagnosable.
            log.error("{}解析失败,e={}".format(auction_url, e))
def get_list_info(self, url_page, html_type, auction_type):
    """Scrape one listing page and persist each not-yet-seen auction lot.

    Args:
        url_page: absolute URL of the listing page.
        html_type: object kind tag stored on each record.
        auction_type: auction category tag stored on each record.
    """
    response = requests.get(url_page, headers=self.headers)
    html = response.text
    tree = etree.HTML(html)
    div_list = tree.xpath('//div[@class="sflistdiv"]')
    for item in div_list:
        info = []
        auction = Auction(source, auction_type)
        auction.province = '上海'
        auction.city = '上海'
        auction.html_type = html_type
        auction.source_html = html
        auction_id = item.xpath(
            'div[@class="sflistdivn2"]/div[@class="f20hei"]/a/@href'
        )[0].split('/')[-1]
        # Skip lots already stored for this source.
        is_exist = coll.find_one({
            'auction_id': str(auction_id),
            'source': source
        })
        if is_exist:
            log.info('id已存在,id="{}"'.format(str(auction_id)))
            continue
        auction.auction_id = auction_id
        try:
            auction_name_ = item.xpath(
                'div[@class="sflistdivn2"]/div[@class="f20hei"]/a/text()'
            )[0]
        except Exception:
            auction_name_ = ''
        region = item.xpath(
            'div[@class="sflistdivn2"]/div[@class="sflistban"]/text()')[0]
        auction.region = re.search(r' - (.*?)$', region,
                                   re.S | re.M).group(1)
        auction_time_ = item.xpath(
            'div[@class="sflistdivn2"]/div[@class="sflisttime"]/text()')[0]
        address = item.xpath(
            'div[@class="sflistdivn2"]/div[@class="sflistcan"]/text()'
        )[3].encode().decode()
        auction.auction_name = auction_name_ + address
        try:
            auction_time = re.search('拍卖时间:(.*?)$', auction_time_,
                                     re.S | re.M).group(1)
            # NOTE(review): '%y' expects a two-digit year (e.g. '19.05.20');
            # if the site prints a four-digit year this always raises and
            # auction_time stays None — confirm against a live page.
            auction.auction_time = datetime.datetime.strptime(
                auction_time, "%y.%m.%d")
        except Exception:
            auction.auction_time = None
        info.append(item.xpath('string(div[@class="sflistdivn2"])'))
        area_ = item.xpath(
            'div[@class="sflistdivn2"]/div[@class="sflistcan"]/span[1]/text()'
        )[0]
        auction.area = re.search('面积:(.*?)$', area_, re.S | re.M).group(1)
        floor = item.xpath(
            'div[@class="sflistdivn2"]/div[@class="sflistcan"]/span[3]/text()'
        )[0]
        auction.floor = re.search('楼层:(.*?)$', floor, re.S | re.M).group(1)
        # BUG FIX: the original '//div[...]' path was absolute, searching the
        # whole document from every list item, so every lot received the
        # page's FIRST price; './/' scopes the lookup to the current item.
        start_auction_price = item.xpath(
            './/div[@class="f34hong"]/text()')[0]
        # BUG FIX: the original pattern '(\d+),?(\d+)' captured only the
        # digits before the thousands separator (e.g. '1,234' -> 1).
        # Capture the whole comma-grouped number, then strip the commas.
        auction.start_auction_price = float(
            re.search(r'\d[\d,]*', start_auction_price)
            .group().replace(',', '')) * 10000
        auction.insert_db()
def get_detail(self, aution_url, aution_id, aution_time, region_name,
               city_name, html_type, auction_type):
    """Fetch one auction detail page and persist the parsed record.

    Two page layouts exist: URLs containing 'item2' use the d-m-* layout,
    everything else uses the DivItemName layout with absolute XPaths.

    Args:
        aution_url: absolute URL of the detail page.
        aution_id: site-local lot identifier.
        aution_time: auction datetime string '%Y-%m-%d %H:%M:%S', or falsy.
        region_name / city_name: location tags stored on the record.
        html_type / auction_type: category tags stored on the record.
    """
    def _grouped_number(text):
        # BUG FIX: the original pattern '(\d+),?(\d+)' captured only the
        # digits before the thousands separator (e.g. '1,234' -> 1.0).
        # Grab the whole comma-grouped number and strip the commas.
        return float(
            re.search(r'\d[\d,]*', text).group().replace(',', ''))

    info = []
    aution = Auction(source, auction_type)
    response = requests.get(aution_url, headers=self.headers)
    try:
        html = response.text
        tree = etree.HTML(html)
        aution.auction_id = aution_id
        aution.region = region_name
        aution.city = city_name
        aution.source_html = html
        aution.html_type = html_type
        try:
            aution.start_auction_price = float(
                tree.xpath('//*[@id="Price_Start"]/text()')[0].replace(
                    ',', ''))
        except Exception:
            aution.start_auction_price = None
        if 'item2' in aution_url:
            aution.auction_name = tree.xpath(
                '//div[@class="d-m-title"]/b/text()')[0]
            aution.auction_level = tree.xpath(
                '//div[@class="d-m-tb"]/table[1]/tr[1]/td[2]/text()')[0]
            try:
                assess_value = tree.xpath(
                    '//div[@class="d-m-tb"]/table[1]/tr[4]/td[1]/text()'
                )[0]
                aution.assess_value = _grouped_number(assess_value)
            except Exception:
                aution.assess_value = None
            earnest_money = tree.xpath(
                '//div[@class="d-m-tb"]/table[1]/tr[3]/td[2]/text()')[0]
            aution.earnest_money = _grouped_number(earnest_money)
            court = tree.xpath('//td[@class="pr7"]/text()')[0]
            aution.court = re.search('法院:(.*?)$', court,
                                     re.S | re.M).group(1)
            aution.contacts = tree.xpath('//td[@valign="top"]/text()')[0]
            phone_number = tree.xpath('//td[@colspan="2"]/text()')[0]
            try:
                aution.phone_number = re.search('联系电话:(.*?)$',
                                                phone_number,
                                                re.S | re.M).group(1)
            except Exception:
                aution.phone_number = None
            info.append(
                tree.xpath(
                    'string(//div[@class="panel-con"]/div[@class="d-block"][2])'
                ))
            info.append(
                tree.xpath(
                    'string(//div[@class="panel-con"]/div[@class="d-article d-article2"][3])'
                ))
            aution.info = info
            if aution_time:
                aution.auction_time = datetime.datetime.strptime(
                    aution_time, "%Y-%m-%d %H:%M:%S")
        else:
            aution.auction_name = tree.xpath(
                '//div[@class="DivItemName"]/text()')[0]
            aution.auction_level = tree.xpath(
                '/html/body/div[1]/div[7]/div[2]/div[1]/div[2]/div[4]/li[4]/text()'
            )[0]
            try:
                assess_value = tree.xpath(
                    '/html/body/div[1]/div[7]/div[2]/div[1]/div[2]/div[4]/li[5]/text()'
                )[0]
                aution.assess_value = _grouped_number(assess_value)
            except Exception:
                aution.assess_value = None
            earnest_money = tree.xpath(
                '/html/body/div[1]/div[7]/div[2]/div[1]/div[2]/div[4]/li[6]/text()'
            )[0]
            aution.earnest_money = _grouped_number(earnest_money)
            court = tree.xpath(
                '/html/body/div[1]/div[7]/div[2]/div[1]/div[2]/div[4]/li[8]/text()'
            )[0]
            aution.court = re.search('法院:(.*?)$', court,
                                     re.S | re.M).group(1)
            area = tree.xpath(
                '/html/body/div[1]/div[7]/div[2]/div[1]/div[2]/div[4]/li[2]/text()'
            )[0]
            # BUG FIX: '(\d+)\.(\d+)' then float(group(1)) silently dropped
            # the decimal part of the area (e.g. '123.45' -> 123.0).
            aution.area = float(
                re.search(r'\d+(?:\.\d+)?', area).group())
            info.append(tree.xpath('string(//div[@id="Tab1"])'))
            info.append(
                tree.xpath('string(//div[@class="bootstrap-table"])'))
            aution.info = info
            if aution_time:
                aution.auction_time = datetime.datetime.strptime(
                    aution_time, "%Y-%m-%d %H:%M:%S")
        aution.insert_db()
    except Exception as e:
        log.error('解析错误,url="{}",e="{}"'.format(aution_url, e))
def detail_parse(self, **kwargs):
    """Build an 'ali' (Taobao judicial sale) record from list-page kwargs
    plus the GBK-encoded detail page, then upsert it.

    Expected kwargs: auction_name, auction_id, status, current_price,
    evalprice, start_time, end_time, partnumber, visitCount, url.
    """
    auction = Auction(source='ali')
    auction.auction_name = kwargs['auction_name']
    auction.auctionId = kwargs['auction_id']
    auction.biddingState = kwargs['status']
    auction.curPrice = kwargs['current_price']
    auction.evalPrice = kwargs['evalprice']
    auction.startShootingDate = kwargs['start_time']
    auction.endShootingDate = kwargs['end_time']
    auction.participantsNumber = kwargs['partnumber']
    auction.visitCount = kwargs['visitCount']
    detail_url = kwargs['url']
    auction.url = detail_url
    try:
        # Cut the raw name into matched city / region / estate / address
        # pieces and geocode them (external matcher).
        cut_info = CutMatch.to_match('上海', kwargs['auction_name'])
        auction.matchCity = cut_info['matchCity']
        auction.matchRegion = cut_info['matchRegion']
        auction.matchName = cut_info['matchName']
        auction.matchAddress = cut_info['matchAddress']
        auction.roomNum = cut_info['cutRoomnum']
        auction.houseNum = cut_info['cutHousenum']
        auction.cutCity = cut_info['cutCity']
        auction.cutRegion = cut_info['cutRegion']
        auction.cutName = cut_info['cutName']
        auction.cutAddress = cut_info['cutAddress']
        # Lat/lng come from the AMap (高德) lookup inside the matcher.
        auction.lat = cut_info['mapLat']
        auction.lng = cut_info['mapLng']
    except Exception as e:
        log.error(e)
        return
    try:
        detail_res = requests.get(url=detail_url, headers=self.headers)
    except Exception as e:
        log.error('url={}, e={}'.format(detail_url, e))
        return
    # PERF FIX: the page was decoded from GBK three separate times;
    # decode once and reuse the text.
    page_text = detail_res.content.decode('gbk')
    html = etree.HTML(page_text)
    try:
        title = html.xpath(
            '//div[contains(@class,"pm-main clearfix")]/h1/text()'
        )[0].strip()
    except Exception:  # BUG FIX: bare 'except:' also caught SystemExit etc.
        log.error('没有标题 url={}'.format(detail_url))
        return
    auctionStage = re.search('【(.*?)】', title).group(1)
    auction.auctionStage = auctionStage
    auction.auctionCount = self.get_auctionCount(auctionStage)
    startPrice = re.search('起拍价¥(.*?) ,', page_text).group(1)
    bond = re.search('保 证 金.*?J_Price">(.*?)</span', page_text,
                     re.S | re.M).group(1)
    # Separate JSON endpoint carries the floor area (in hundredths of m²).
    comm_url = 'http://sf.taobao.com/json/getGovItemSummary.htm?itemId={}'.format(
        kwargs['auction_id'])
    res = requests.get(comm_url, headers=self.headers)
    try:
        auction.area = float(
            int(res.json()['props']['area']['value']) / 100)
    except Exception:  # best-effort: area stays unset when the key is absent
        pass
    images = html.xpath("//div[@class='pm-pic pm-s80 ']/a/img/@src")
    image_list = []
    for image_url in images:
        # Drop the thumbnail suffix to get the full-size image, then
        # re-host it via qiniu.
        big_img = image_url.replace('_80x80.jpg', '')
        image_list.append(qiniufetch(big_img, big_img))
    auction.houseImgUrls = image_list
    # Prices on the page are in yuan; store in 万 (10,000s).
    auction.startPrice = float(float(startPrice.replace(',', '')) / 10000)
    auction.bond = float(float(bond.replace(',', '').strip()) / 10000)
    if kwargs['status'] == '已成交':
        # A '失败' / '流拍' banner means the deal actually fell through.
        if re.search(
                '失败|流拍',
                html.xpath('//h1[@class="bid-fail"]/text()')[0]) is None:
            auction.update()
        else:
            auction.biddingState = '流拍'
            auction.update()
    else:
        auction.update()