def parse_esf(self, response):
    """Parse a second-hand-house ("esf") list page: yield one detail-page
    Request per listing and follow pagination."""
    province, city = response.meta.get('info')
    dls = response.xpath('//div[@class="shop_list shop_list_4"]/dl')
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        item['name'] = dl.xpath('.//p[@class="add_shop"]/a/@title').get()
        infos = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
        # Strip all whitespace; the last entry is dropped (agent info).
        # Raw string for the regex (non-raw '\s' emits DeprecationWarning).
        infos = list(map(lambda x: re.sub(r'\s', '', x), infos))[0:-1]
        for info in infos:
            if '厅' in info:
                item['rooms'] = info
            elif '㎡' in info:
                item['area'] = info
            elif '层' in info:
                item['floor'] = info
            elif '向' in info:
                item['toward'] = info
            else:
                item['year'] = info
        item['address'] = dl.xpath(
            './/p[@class="add_shop"]/span/text()').get()
        origin_url = response.urljoin(dl.xpath('.//a/@href').get())
        item['origin_url'] = origin_url
        yield scrapy.Request(url=origin_url,
                             callback=self.parse_detail,
                             meta={'info': item})
    next_url = response.xpath('//div[@class="page_al"]/p[1]/a/@href').get()
    # BUGFIX: the last page has no "next" link; urljoin(None) raises.
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={'info': (province, city)})
def parse_esf(self, response):
    """Parse an old-layout ('houseList') esf page and follow pagination."""
    province, city = response.meta.get('info')
    dls = response.xpath("//div[@class='houseList']/dl")
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        item['name'] = dl.xpath(".//p[@class='mt10']/a/span/text()").get()
        infos = dl.xpath(".//p[@class='mt12']/text()").getall()
        infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
        for info in infos:
            if "厅" in info:
                item['rooms'] = info
            elif '层' in info:
                item['floor'] = info
            elif '向' in info:
                item['toward'] = info
            else:
                # Remaining field carries the build year prefix.
                item['year'] = info.replace("建筑年代:", "")
        item['address'] = dl.xpath(".//p[@class='mt10']/span/@title").get()
        item['area'] = dl.xpath(
            ".//div[contains(@class,'area')]/p/text()").get()
        item['price'] = "".join(
            dl.xpath(".//div[@class='moreInfo']/p[1]//text()").getall())
        item['unit'] = "".join(
            dl.xpath(".//div[@class='moreInfo']/p[2]//text()").getall())
        detail_url = dl.xpath(".//p[@class='title']/a/@href").get()
        item['origin_url'] = response.urljoin(detail_url)
        yield item
    next_url = response.xpath("//a[@id='PageControl1_hlk_next']/@href").get()
    # BUGFIX: the last page has no next anchor; urljoin(None) raises.
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse esf listings (shop_list layout) and follow pagination."""
    province, city = response.meta.get('info')
    dls = response.xpath("//div[contains(@class,'shop_list')]/dl")
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        item['name'] = dl.xpath(".//p[@class='add_shop']/a/@title").get()
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        infos = list(map(lambda x: re.sub(r'\s', '', x), infos))
        for info in infos:
            if "厅" in info:
                item['rooms'] = info
            elif "层" in info:
                item['floor'] = info
            elif "向" in info:
                item['toward'] = info
            elif "m" in info:  # '㎡' contains 'm' only via this check
                item['area'] = info
            else:
                item['year'] = info
        item['address'] = dl.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        prices = ''.join(
            dl.xpath(".//dd[@class='price_right']/span[1]//text()").getall())
        item['price'] = re.sub(r'\s', '', prices)
        item['unit'] = ''.join(
            dl.xpath(".//dd[@class='price_right']/span[2]//text()").getall())
        detail_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
        item['origin_url'] = response.urljoin(detail_url)
        yield item
    next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
    # BUGFIX: guard — no next link exists on the final page.
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse second-hand listings; paginate by computing /house/i3{page}/.

    BUGFIXES: the title and price XPaths started with '//' which searches
    the WHOLE document from inside the per-<dl> loop, so every item carried
    the first listing's title/price — made relative with './/'. Also guard
    the price concatenation (None + None raised TypeError).
    """
    province, city_name = response.meta.get('info')
    dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
    for dl in dls:
        item = ESFHouseItem()
        house_title = dl.xpath(
            './/h4[@class="clearfix"]/a/@title').extract_first()
        if house_title:
            infos = dl.xpath(".//p[@class='tel_shop']/text()").extract()
            infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
            for info in infos:
                if "厅" in info:
                    item["rooms"] = info
                elif '层' in info:
                    item["floor"] = info
                elif '向' in info:
                    item['toward'] = info
                elif '㎡' in info:
                    item['area'] = info
                elif '年建' in info:
                    item['build_year'] = re.sub("年建", "", info)
            item['province'] = province
            item['city'] = city_name
            item['house_title'] = house_title
            # Community name.
            item['house_name'] = dl.xpath(
                './/p[@class="add_shop"]/a/@title').extract_first()
            # Contact person; fall back to the "no contact" placeholder.
            item['contacts'] = dl.xpath(
                './/p[@class="tel_shop"]/span[@class="people_name"]/a/text()'
            ).extract_first() or '暂无联系人'
            item['address'] = dl.xpath(
                './/p[@class="add_shop"]/span/text()').extract_first()
            # Selling points; placeholder when absent.
            item['tags'] = '/'.join(
                dl.xpath('.//dd/p[3]/span/text()').extract()) or '暂无卖点'
            # Total price = number + unit; either part may be missing.
            price = dl.xpath(
                './/dd[@class="price_right"]/span[1]/b/text()'
            ).extract_first() or ''
            price_unit = dl.xpath(
                './/dd[@class="price_right"]/span[1]/text()'
            ).extract_first() or ''
            item['price'] = price + price_unit
            # Per-square-meter price.
            item['unit'] = dl.xpath(
                ".//dd[@class='price_right']/span[2]/text()").extract_first()
            detail_url = dl.xpath(
                ".//h4[@class='clearfix']/a/@href").extract_first()
            item['origin_url'] = response.urljoin(detail_url)
            yield item
    # "末页" (last page) link, e.g. '/house/i3100/'; one-page cities omit it.
    last_url = response.xpath(
        '//div[@class="page_al"]/p/a[contains(.,"末页")]/@href').extract_first()
    if last_url:
        last_page = last_url.split('/')[-2].replace('i3', '')
        for i in range(1, int(last_page) + 1):
            next_url = urljoin(response.url,
                               '/house/i3{page}/'.format(page=i))
            yield scrapy.Request(url=next_url,
                                 callback=self.parse_esf,
                                 meta={'info': (province, city_name)})
def parse_esf(self, response):
    """Parse esf rows flagged dataflag='bg'; the info string is stored raw.

    BUGFIX: the cleanup pattern was r"'|\|\r|\n|/s| " — alternation made
    '\|\r' one branch, so '|' was only removed when immediately followed
    by CR, and '/s' was a literal slash-s typo for '\s'.
    """
    province, city = response.meta.get("info")
    dls = response.xpath(
        "//div[contains(@class, 'shop_list')]/dl[@dataflag = 'bg']")
    for dl in dls:
        item = ESFHouseItem(province = province, city = city)
        # Community name.
        item["name"] = dl.xpath(".//p[@class = 'add_shop']/a/@title").get()
        # Combined info: rooms / area / floor / toward / year.
        infos = dl.xpath(".//p[@class = 'tel_shop']/text()").getall()
        infos = "".join(infos).strip()
        infos = re.sub(r"'|\||\s", "", infos)
        item['infos'] = infos
        item['address'] = dl.xpath(
            ".//p[@class = 'add_shop']/span/text()").get()
        # Total price.
        item['price'] = "".join(
            dl.xpath(".//dd[@class = 'price_right']/span[1]//text()").getall())
        # Price per square meter.
        item['unit'] = dl.xpath(
            ".//dd[@class = 'price_right']/span[2]/text()").get()
        origin_url = "".join(
            dl.xpath(".//h4[@class = 'clearfix']/a/@href").getall())
        item['origin_url'] = response.urljoin(origin_url)
        yield item
    next_url = response.xpath(
        "//div[@class = 'page_al']/p[last()-1]/a/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Extract second-hand house items from the current listing page."""
    province, city = response.meta.get('info')
    for dl in response.xpath("//div[@class='shop_list shop_list_4']/dl"):
        item = ESFHouseItem(province=province, city=city)
        name = dl.xpath(".//span[@class='tit_shop']/text()").get()
        # Rows without a title span are ads/placeholders — skip them.
        if not name:
            continue
        raw = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        for info in [re.sub(r"\s", "", piece) for piece in raw]:
            if "厅" in info:
                item["rooms"] = info
            elif '层' in info:
                item["floor"] = info
            elif '向' in info:
                item['toward'] = info
            elif '㎡' in info:
                item['area'] = info
            elif '年建' in info:
                item['year'] = re.sub("年建", "", info)
        item['address'] = dl.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        # Total price (red span) and per-㎡ price.
        item['price'] = "".join(
            dl.xpath(".//span[@class='red']//text()").getall())
        item['unit'] = dl.xpath(
            ".//dd[@class='price_right']/span[2]/text()").get()
        item['name'] = name
        href = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
        item['origin_url'] = response.urljoin(href)
        yield item
def parse_esf(self, response):
    """Parse esf listings; follow the pager only when its first link reads
    '下一页' (next page).

    Cleanup: removed a large commented-out captcha-retry block and the
    `== None` identity comparison (now `is not None`, same behavior).
    """
    province, city = response.meta.get('info')  # unpack (province, city)
    dls = response.xpath('//div[contains(@class,"shop_list")]/dl')
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        name = dl.xpath('.//p[@class="add_shop"]/a/text()').get()
        if name is not None:
            item['name'] = re.sub(r'\s', '', name)
        infos = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
        infos = list(map(lambda x: re.sub(r'\s', '', x), infos))
        for info in infos:
            if '厅' in info:
                item['rooms'] = info
            elif '层' in info:
                item['floor'] = info
            elif '向' in info:
                item['toward'] = info
            elif '建' in info:
                item['year'] = info
            elif '㎡' in info:
                item['area'] = info
        item['address'] = dl.xpath(
            './/p[@class="add_shop"]/span/text()').get()
        # Per-㎡ price is the span with no class; total is the red span.
        item['unit'] = dl.xpath(
            './/dd[@class="price_right"]/span[not(@class)]/text()').get()
        item['price'] = "".join(
            dl.xpath(
                './/dd[@class="price_right"]/span[@class="red"]//text()'
            ).getall())
        detail_url = dl.xpath('.//h4[@class="clearfix"]/a/@href').get()
        item['origin_url'] = response.urljoin(detail_url)
        yield item
    next_url = response.xpath('//div[@class="page_al"]/p/a/@href').get()
    next_text = response.xpath('//div[@class="page_al"]/p/a/text()').get()
    if next_text == '下一页':
        next_page = response.urljoin(next_url)
        yield scrapy.Request(url=next_page,
                             callback=self.parse_esf,
                             meta={'info': (province, city)})
def parse_esf(self, response):
    """Parse every esf listing on the page and follow the '下一页' link.

    BUGFIX: a stray `break` exited the <dl> loop after the first valid row,
    so only ONE listing per page was ever processed; each row is now fully
    handled inside the loop. Also guard the pager anchor text before
    .strip() (it can be None).
    """
    province, city = response.meta.get('info')
    dls = response.xpath("//div[contains(@class,'shop_list')]//dl")
    for dl in dls:
        name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
        if not name:
            continue  # rows without a community name are not listings
        item = ESFHouseItem(province=province, city=city)
        item['name'] = name.strip()
        item['province'] = province
        item['city'] = city
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        for info in infos:
            if '厅' in info:
                item['rooms'] = info.strip()
            elif '层' in info:
                item['floor'] = info.strip()
            elif '向' in info:
                item['toward'] = info.strip()
            elif '年' in info:
                item['year'] = info.replace("建", "")
            else:
                item['area'] = info
        item['address'] = dl.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        item['price'] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span[@class='red']//text()"
            ).getall())
        item['unit'] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span[not(@class)]/text()"
            ).getall())
        detail_url = dl.xpath(".//h4/a/@href").get()
        item['origin_url'] = response.urljoin(detail_url)
        yield item
    next_url = None
    for al in response.xpath("//div[@class='page_al']/p//a"):
        text = al.xpath(".//text()").get()
        if text and text.strip() == '下一页':
            next_url = al.xpath("./@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)},
                             dont_filter=True)
def parse_esf(self, response):
    """Parse esf listings; per-listing fields default to '' when absent.

    BUGFIX: several `.get().strip()` chains crashed with AttributeError when
    the node was missing (address, unit, and the pager on the last page);
    all are now None-guarded.
    """
    province, city = response.meta.get('info')
    dls = response.xpath("//div[contains(@class,'shop_list')]/dl")
    for dl in dls:
        name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
        if name is None:
            continue  # skip ad rows without a community name
        name = name.strip()
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        infos = [re.sub(r"\s", "", x) for x in infos]
        rooms = floor = toward = year = area = ''
        for info in infos:
            # Villas use 拼/栋/排 instead of N室N厅.
            if "厅" in info or "拼" in info or "栋" in info or "排" in info:
                rooms = info
            elif "层" in info:
                floor = info
            elif "向" in info:
                toward = info
            elif "年" in info:
                year = info.replace("年建", "")
            elif "㎡" in info:
                area = info
        address = (dl.xpath(
            ".//p[@class='add_shop']/span/text()").get() or '').strip()
        price = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span[1]//text()").getall()
        ).strip()
        unit = (dl.xpath(
            ".//dd[@class='price_right']/span[2]//text()").get() or '').strip()
        detail_url = dl.xpath(".//h4/a/@href").get()
        origin_url = response.urljoin(detail_url)
        item = ESFHouseItem(province=province, city=city, name=name,
                            rooms=rooms, floor=floor, toward=toward,
                            year=year, address=address, area=area,
                            price=price, unit=unit, origin_url=origin_url)
        yield item
    next_url = response.xpath(
        "//div[@class='page_al']/p[last()-2]/a/@href").get()
    next_text = response.xpath(
        "//div[@class='page_al']/p[last()-2]/a/text()").get()
    if next_url and next_text and "下一页" in next_text:
        yield scrapy.Request(url=response.urljoin(next_url.strip()),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse esf listings; info fields missing from the page default to ''.

    BUGFIX: four bare `try/except` blocks (which swallowed ANY error) were
    replaced with explicit setdefault calls for optional fields.
    """
    province, city = response.meta.get('info')
    dls = response.xpath("//div[contains(@class,'shop_list')]/dl")
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
        if name:
            name = name.strip()
        item['name'] = name
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
        for info in infos:
            if "厅" in info:
                item['rooms'] = info
            elif "层" in info:
                item['floor'] = info
            elif "向" in info:
                item['toward'] = info
            elif "㎡" in info:
                item['area'] = info
            elif "年" in info:
                item['year'] = info
        item['address'] = dl.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        price = dl.xpath(
            ".//dd[@class='price_right']/span[@class='red']//text()"
        ).getall()
        item['price'] = "".join(re.sub(r"\s", "", p) for p in price)
        item['unit'] = dl.xpath(
            ".//dd[@class='price_right']/span[2]/text()").get()
        suffix_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
        item['origin_url'] = response.urljoin(suffix_url)
        # Fields that may not appear in `infos` default to ''.
        for key in ('year', 'rooms', 'floor', 'toward'):
            item.setdefault(key, '')
        yield item
    next_url = response.xpath("//div[@class='page_box']/p[3]/a/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={'info': (province, city)})
def parse_esf(self, response):
    """Yield one ESFHouseItem per listing on this page, then paginate."""
    province, city = response.meta.get('info')
    for dl in response.xpath('//div[contains(@class,"hop_list")]/dl'):
        raw_name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
        # Rows without a community name are not listings.
        if not raw_name:
            continue
        item = ESFHouseItem(province=province, city=city)
        item['name'] = raw_name.strip()
        cleaned = [re.sub(r'\s', "", piece)
                   for piece in dl.xpath(
                       ".//p[@class='tel_shop']/text()").getall()]
        for info in cleaned:
            if "厅" in info:
                item['rooms'] = info
            elif "层" in info:
                item['floor'] = info
            elif "向" in info:
                item['toward'] = info
            elif "年建" in info:
                item['year'] = info.replace("年建", "")
            elif "㎡" in info:
                item['area'] = info
        item['address'] = dl.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        item['price'] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span[1]//text()").getall())
        item['unit'] = dl.xpath(
            ".//dd[@class='price_right']/span[2]/text()").get()
        detail_href = dl.xpath('.//h4[@class="clearfix"]/a/@href').get()
        item['origin_url'] = response.urljoin(detail_href)
        yield item
    # Prefer the explicit next anchor; otherwise fall back to the first
    # pager anchor unless it points back to the listing root.
    next_url = response.xpath("//div[@id='list_D10_15']/p[3]/a/@href").get()
    if not next_url:
        url = response.xpath("//div[@id='list_D10_15']/p[1]/a/@href").get()
        if not url == '/house/':
            next_url = url
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse esf listings; follow pagination.

    BUGFIXES: removed a dead `ESFHouseItem()` created before the loop and
    immediately shadowed on every iteration; guard the next-page link —
    the final page has none and the unconditional Request crashed.
    """
    province, city = response.meta.get("info")
    dls = response.xpath("//div[contains(@class,'shop_list')]/dl")
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
        if name is not None:
            name = name.strip()
        item['name'] = name
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        infos = list(map(lambda x: re.sub(r'\s', "", x), infos))
        for info in infos:
            if "厅" in info:
                item['rooms'] = info
            elif '层' in info:
                item['floor'] = info
            elif '向' in info:
                item['toward'] = info
            elif '年' in info:
                item['year'] = info.replace('年建', "")
            elif '㎡' in info:
                item["area"] = info
        item['address'] = dl.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        item['price'] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span[1]//text()").getall())
        item['unit'] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span[2]//text()").getall())
        item['origin_url'] = response.urljoin(
            dl.xpath(".//h4[@class='clearfix']/a/@href").get())
        yield item
    next_url = response.xpath("//div[@class='page_al']/p[1]/a/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={'info': (province, city)})
def parse_esf(self, response):
    """Parse esf listings by splitting the '|'-separated info line."""
    province, city = response.meta.get("info")
    print(province + " " + city)  # debug trace of the city being crawled
    dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
    for dl in dls:
        name = dl.xpath(".//p[@class='add_shop']/a/@title").get()
        if name == None:
            continue  # ad/placeholder rows carry no community name
        address = dl.xpath(".//p[@class='add_shop']/span/text()").get()
        shops = "".join(
            dl.xpath(".//p[@class='tel_shop']//text()").getall())
        shops = re.sub(r"\s", "", shops)
        shops = shops.split("|")
        toward = None
        # Positional fields: rooms|area|floor[|toward]|year|...
        # NOTE(review): a row with fewer than 4 '|'-fields raises
        # IndexError here — confirm every row matches this shape.
        rooms = shops[0]
        area = shops[1]
        floor = shops[2]
        if len(shops) > 5:
            toward = shops[3]
            year = shops[4]
        else:
            year = shops[3]
        url = response.urljoin(
            dl.xpath(".//h4[@class='clearfix']/a/@href").get())
        price = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span[@class='red']//text()").
            getall())
        # NOTE(review): .get() may return None on listings without a unit
        # price, so .strip() would raise AttributeError — confirm.
        unit = dl.xpath(
            ".//dd[@class='price_right']/span[not(@class='red')]//text()"
        ).get().strip()
        item = ESFHouseItem(name=name,
                            address=address,
                            toward=toward,
                            rooms=rooms,
                            area=area,
                            floor=floor,
                            year=year,
                            url=url,
                            price=price,
                            unit=unit,
                            province=province,
                            city=city)
        yield item
    next_url = response.xpath(
        "//div[@class='page_al']/p[last()-2]/a/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse esf listings and follow pagination.

    BUGFIX: the item was constructed ONCE before the <dl> loop, so field
    values from one listing leaked into the next (a listing missing e.g.
    its price silently kept the previous listing's value). A fresh item is
    now built per listing; debug prints removed.
    """
    provinces, city = response.meta.get("info")
    dls = response.xpath('//div[contains(@class,"shop_list")]/dl')
    for dl in dls:
        item = ESFHouseItem(provinces=provinces, city=city)
        item["name"] = dl.xpath('.//p[@class="add_shop"]/a/@title').get()
        infos = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
        infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
        for info in infos:
            if '室' in info:
                item["rooms"] = info
            elif '层' in info:
                item["floor"] = info
            elif '向' in info:
                item["toward"] = info
            elif '㎡' in info:
                item['area'] = info
            else:
                item["year"] = info.replace("建筑年代", "")
        # Address.
        item['address'] = dl.xpath(
            './/p[@class="add_shop"]/span/text()').get()
        # Total price = number (bold) + unit text.
        price_s = dl.xpath(
            './/dd[@class="price_right"]/span/b/text()').get()
        price_w = dl.xpath(
            './/dd[@class="price_right"]/span[1]/text()').get()
        if price_s and price_w:
            item['price'] = ''.join(price_s) + ''.join(price_w)
        else:
            item['price'] = ' '
        # Price per square meter.
        item['unit'] = dl.xpath(
            './/dd[@class="price_right"]/span[2]/text()').get()
        item['origin_url'] = response.urljoin(
            dl.xpath('.//h4/a/@href').get())
        yield item
    next_url = response.xpath('//div[@class="page_al"]/p[1]/a/@href').get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (provinces, city)})
def parse_esf(self, response):
    """Parse esf listings via fixed positional info slots; paginate.

    BUGFIXES: `infos is not None` was always true (getall() returns a
    list), so short/empty rows crashed on indexing — now requires >= 4
    fields; `re.sub` on a None unit crashed; the next-page href was
    concatenated BEFORE the None check, crashing on the last page.
    """
    province, city = response.meta.get('info')
    dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
    for dl in dls:
        item = {}
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
        if len(infos) >= 4:  # rooms / area / floor / toward at fixed slots
            name = dl.xpath(".//span[@class='tit_shop']/text()").get()
            address = dl.xpath(
                ".//p[@class='add_shop']/span/text()").get()
            price = dl.xpath(
                ".//dd[@class='price_right']/span[@class='red']//text()"
            ).getall()
            price = re.sub(r"\s", "", "".join(price))
            unit = dl.xpath(
                ".//dd[@class='price_right']/span[last()]/text()").get()
            unit = re.sub(r"\s", "", unit or "")
            origin_url = dl.xpath(
                ".//h4[@class='clearfix']/a/@href").get()
            # NOTE(review): assumes response.url ends with a 2-char suffix
            # to strip before appending the relative href — confirm;
            # response.urljoin would be the usual approach.
            origin_url = response.url[:-2] + origin_url
            item['rooms'] = infos[0]
            item['area'] = infos[1]
            item['floor'] = infos[2]
            item['toward'] = infos[3]
            item['year'] = re.sub("年建", "", infos[4]) if len(infos) > 4 else ''
            item['address'] = address
            item['price'] = price
            item['unit'] = unit
            item['origin_url'] = origin_url
            item['province'] = province
            item['city'] = city
            item['name'] = name
            yield ESFHouseItem(**item)
    next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
    if next_url:
        next_url = response.url[:-2] + next_url
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Yield an ESFHouseItem per listing, then follow the next-page link."""
    province, city = response.meta.get('info')
    for dl in response.xpath('//div[@class="shop_list shop_list_4"]/dl'):
        item = ESFHouseItem()
        item['province'], item['city'] = province, city
        item['name'] = dl.xpath('.//p[@class="add_shop"]/a/@title').get()
        cleaned = [re.sub(r'\s', '', text)
                   for text in dl.xpath(
                       './/p[@class="tel_shop"]/text()').getall()]
        for info in cleaned:
            if '厅' in info:
                item['rooms'] = info
            elif '层' in info:
                item['floor'] = info
            elif '向' in info:
                item['toward'] = info
            elif '年' in info:
                item['year'] = info
            elif '㎡' in info:
                item['area'] = info
        item['address'] = dl.xpath(
            './/p[@class="add_shop"]/span/text()').get()
        # Total price (span 1) and per-㎡ price (span 2).
        item['price'] = ''.join(
            dl.xpath(
                './/dd[@class="price_right"]/span[1]//text()').getall())
        item['unit'] = ''.join(
            dl.xpath(
                './/dd[@class="price_right"]/span[2]//text()').getall())
        detail_href = dl.xpath('.//h4[@class="clearfix"]/a/@href').get()
        item['origin_url'] = response.urljoin(detail_href)
        yield item
        print(item)
        print('==' * 40)
    next_url = response.xpath('//div[@id="list_D10_15"]/p[1]/a/@href').get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={'info': (province, city)})
def parse_esf(self, response):
    """Parse esf listing rows and follow the '下一页' (next page) link.

    BUGFIXES: pagination requests called back into self.parse_newhourse,
    so pages 2+ were parsed with the wrong handler — now parse_esf; and
    response.urljoin ran on the href BEFORE the None check, crashing on
    the last page.
    """
    province, city = response.meta.get("info")
    print("esf response:", response.url)
    dl_list = response.xpath(
        "//div[contains(@class,'shop_list')]//dl[@class='clearfix']")
    for dl in dl_list:
        item = ESFHouseItem(province=province, city=city)
        item['name'] = dl.xpath(".//p[@class='add_shop']/a/@title").get()
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
        # Default every optional field so the item always has the keys.
        item['rooms'] = None
        item['floor'] = None
        item['toward'] = None
        item['area'] = None
        item['year'] = None
        item['origin_url'] = None
        for info in infos:
            if '厅' in info:
                item['rooms'] = info
            elif '层' in info:
                item['floor'] = info
            elif '向' in info:
                item['toward'] = info
            elif '㎡' in info:
                item['area'] = info
            elif '年' in info:
                item['year'] = info
        item["address"] = dl.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        item["price"] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span[1]//text()").getall())
        item["unit"] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span[2]/text()").getall())
        detail = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
        if detail is not None:
            item['origin_url'] = response.urljoin(detail)
        yield item
    next_url = response.xpath("//a[text()='下一页']/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse esf rows flagged dataflag='bg'; paginate.

    BUGFIX: the pagination meta carried {province, city} — a SET literal,
    whose unpacking order is arbitrary, so province and city could swap on
    subsequent pages. It is now the (province, city) tuple, and the next
    link is guarded (urljoin(None) raised on the last page).
    """
    province, city = response.meta.get('info')
    dls = response.xpath("//dl[contains(@dataflag,'bg')]")
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        name = ''.join(
            dl.xpath(".//dd//p[@class='add_shop']/a/@title").getall())
        item['name'] = re.sub(r"\s", "", name)
        # Info fields: rooms / floor / toward / year / area.
        infos = dl.xpath(".//dd//p[@class='tel_shop']//text()").getall()
        infos = list(map(lambda x: re.sub(r"\s|\|", '', x), infos))
        infos = list(filter(None, infos))  # drop entries that became empty
        for info in infos:
            if "厅" in info:
                item['rooms'] = info
            elif '层' in info:
                item['floor'] = info
            elif '年' in info:
                item['year'] = info
            elif '向' in info:
                item['toward'] = info
            elif '㎡' in info:
                item['area'] = info
        item['address'] = "".join(
            dl.xpath(
                ".//dd//p[@class='add_shop']//span//text()").getall())
        item['price'] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']//span[@class='red']//text()"
            ).getall())
        item['unit'] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']//span[2]//text()").getall())
        detail_url = "".join(
            dl.xpath(".//h4[@class='clearfix']/a/@href").getall())
        item['origin_url'] = response.urljoin(detail_url)
        yield item
    next_url = response.xpath(
        "//div[@class='page_al']//p[1]/a/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse esf listings whose info line is '|'-separated positional fields."""
    province, city = response.meta.get("info")
    dls = response.xpath("//div[contains(@class, 'shop_list')]/dl")
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        name = dl.xpath(".//span[@class='tit_shop']/text()").get()
        if not name:
            continue  # ad rows have no title span
        infos = "".join(
            dl.xpath(".//p[@class='tel_shop']//text()").getall()).strip()
        infos = infos.split("|")
        # "独栋" (detached villa) rows carry an extra leading type field.
        # NOTE(review): infos[0] is compared unstripped — spaces around '|'
        # would break this match, and rows with fewer than 4/5 fields raise
        # IndexError below — confirm against live markup.
        if infos[0] == "独栋":
            item['rooms'] = infos[1].strip() + "[别墅]"
            item['area'] = infos[3].strip()
            item['floor'] = infos[2].strip()
            item['toward'] = infos[4].strip()
        else:
            item['rooms'] = infos[0].strip()
            item['area'] = infos[1].strip()
            item['floor'] = infos[2].strip()
            item['toward'] = infos[3].strip()
        try:
            item['year'] = infos[4].strip()
        except IndexError:
            print("没有年份记录")
        address = dl.xpath(".//p[@class='add_shop']/span/text()").get()
        item['address'] = address
        # Total price ends at "万"; the remainder is the per-㎡ unit price.
        price_text = "".join(
            dl.xpath(
                ".//dd[@class='price_right']//text()").getall()).strip()
        price_text = re.sub(r'\s', '', price_text)
        price = price_text.split("万")[0].strip() + "万"
        unit = price_text.split("万")[1].strip()
        item['price'] = price
        item['unit'] = unit
        origin_url_text = dl.xpath(
            ".//h4[@class='clearfix']/a/@href").get()
        origin_url = response.urljoin(origin_url_text)
        item['origin_url'] = origin_url
        yield item
    next_url = response.xpath(
        ".//div[@class='page_al']/p[1]/a/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse esf listings; paginate.

    BUGFIXES: the community name was stored with .getall() as a ONE-ELEMENT
    LIST instead of a string — now .get(); the last page has no next link,
    so urljoin(None) raised — now guarded. The try/except around plain
    `in` checks (which cannot raise) was removed; the unit indexing keeps
    an explicit length guard instead of a bare except.
    """
    province, city = response.meta.get("info")
    dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        item['name'] = dl.xpath(".//p[@class='add_shop']/a/@title").get()
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
        for info in infos:
            if "厅" in info or "卧室" in info:
                item['rooms'] = info
            elif "㎡" in info:
                item['area'] = info
            elif ("层" in info or "叠加" in info or "双拼" in info
                  or "独栋" in info):
                # Villas report their type where apartments report a floor.
                item['floor'] = info
            elif '向' in info:
                item['toward'] = info
            elif '年建' in info:
                item['year'] = info.replace('年建', "")
        item['address'] = dl.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        item['price'] = dl.xpath(
            ".//dd[@class='price_right']/span/b/text()").get()
        unit = dl.xpath(
            ".//dd[@class='price_right']//span/text()").getall()
        if len(unit) > 1:
            item['unit'] = unit[1]
        origin_url = response.urljoin(
            dl.xpath(".//h4[@class='clearfix']/a/@href").get())
        item['origin_url'] = origin_url
        yield item
    next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse esf listing rows (<dl class="clearfix">) and paginate."""
    province, city = response.meta.get("info")
    dls = response.xpath("//dl[@class='clearfix']")
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
        if name is not None:
            name = name.strip()
        item['name'] = name
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
        for info in infos:
            if "厅" in info:
                item['rooms'] = info
            elif "层" in info:
                item['floor'] = info
            elif "向" in info:
                item['toward'] = info
            elif "年建" in info:
                item['year'] = info.replace("年建", "")
            elif "㎡" in info:
                item['area'] = info
        item['address'] = dl.xpath(
            ".//p[@class='add_shop']//span/text()").get()
        # First two text nodes are the number and its "万" unit.
        item['price'] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span//text()").getall()[:2])
        item['unit'] = dl.xpath(
            ".//dd[@class='price_right']/span[2]//text()").get()
        item['origin_url'] = response.urljoin(
            dl.xpath(".//h4[@class='clearfix']/a/@href").get())
        yield item
    next_url = response.xpath("//div[@class='page_al']/p[2]/a/@href").get()
    # BUGFIX: the last page has no next link; urljoin(None) raises.
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse esf listings; paginate.

    BUGFIXES: `item['price'] is not ""` compared identity with a string
    literal (SyntaxWarning, unreliable) — now !=; the cleanup regex
    r"\s|" had a trailing EMPTY alternative matching at every position —
    now plain r"\s"; two no-op self-assignments removed; the next-page
    link is guarded (urljoin(None) raised on the last page).
    """
    province, city = response.meta.get('info')
    dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        name = dl.xpath(".//p[contains(@class,'add_shop')]/a/text()").get()
        item['name'] = name.strip() if name is not None else None
        infos = dl.xpath(".//p[@class='tel_shop']//text()").getall()
        infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
        for info in infos:
            if "厅" in info:
                item['rooms'] = info
            elif "㎡" in info:
                item['area'] = info
            elif "层" in info:
                item['floor'] = info
            elif "向" in info:
                item['toward'] = info
            elif "建" in info:
                item['year'] = info
        item['address'] = dl.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        item['price'] = "".join(
            dl.xpath(".//span[contains(@class,'red')]//text()").getall())
        if item['price'] != "":
            # Drop whitespace and the "热搜" (hot-search) badge text.
            item['price'] = re.sub(r"\s|热搜", "", item['price'])
        item['unit'] = dl.xpath(
            ".//dd[contains(@class,'price_right')]/span[2]/text()").get()
        ori_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
        if ori_url is not None:
            item['origin_url'] = response.urljoin(ori_url)
        yield item
    next_url = response.xpath("//div[@class='page_al']/p[1]/a/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse esf listings using the per-city base URL; paginate.

    BUGFIXES: items were fully built but NEVER yielded — `yield item` was
    missing; the pagination call passed TWO arguments to
    response.urljoin(), which takes one (TypeError on every page) — now
    urljoin(temp_url, next_url); and the next link is guarded.
    """
    province, city, cityabbr = response.meta.get("info")
    # Base URL for resolving listing hrefs, e.g. https://bj.esf.fang.com.
    temp_url = "https://" + cityabbr + ".esf.fang.com"
    dls = response.xpath("//div[contains(@class,'shop_list')]/dl")
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        item['name'] = dl.xpath(".//p[@class='add_shop']/a/@title").get()
        item['address'] = dl.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        item['price'] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span[1]//text()").getall())
        item['unit'] = dl.xpath(
            ".//dd[@class='price_right']/span[2]//text()").get()
        origin_url = dl.xpath(".//h4/a/@href").get()
        item["origin_url"] = urljoin(temp_url, origin_url)
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
        for info in infos:
            if "厅" in info:
                item['rooms'] = info
            elif "层" in info:
                item["floor"] = info
            elif '向' in info:
                item["toward"] = info
            elif '建' in info:
                item["year"] = info.replace("年建", "")
            elif '㎡' in info:
                item["area"] = info
        yield item
    next_url = response.xpath("//a[text()='下一页']/@href").get()
    if next_url:
        yield scrapy.Request(url=urljoin(temp_url, next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city, cityabbr)})
def parse_esf_house(self, response):
    """Parse a page of second-hand listings and yield one item per <dl>.

    ``response.meta['info']`` must be a ``(province, city)`` tuple; the
    crawl continues through the "next page" anchor when present.
    """
    province, city = response.meta.get('info')

    def field(parts, idx, transform=None):
        # parts[idx], stripped (and optionally transformed); "" on any failure.
        try:
            value = parts[idx].strip()
            return transform(value) if transform else value
        except Exception:
            return ""

    for dl in response.xpath('//div[@class="houseList"]/dl'):
        describe = dl.xpath('.//p[@class="mt12"]/text()').getall()
        yield ESFHouseItem(
            province=province,
            city=city,
            name=dl.xpath('.//p[@class="title"]/a/@title').get(),
            rooms=field(describe, 0),
            floor=field(describe, 1),
            toward=field(describe, 2),
            year=field(describe, 3, lambda v: v.split(":")[1]),
            address=dl.xpath('.//p[@class="mt10"]/span/@title').get(),
            area=dl.xpath('.//div[contains(@class,"area")]/p/text()').get(),
            price="".join(
                dl.xpath('.//div[@class="moreInfo"]/p/span/text()').getall()[0:2]),
            unit="".join(
                dl.xpath('.//div[@class="moreInfo"]/p[last()]//text()').getall()),
            origin_url=response.url,
        )

    next_page = response.xpath('//a[@id="PageControl1_hlk_next"]/@href').get()
    if next_page:
        yield scrapy.Request(url=response.urljoin(next_page),
                             callback=self.parse_esf_house,
                             meta={"info": (province, city)})
def parse_esfhoust(self, response):
    """Parse a listing page and yield one ``ESFHouseItem`` per <dl>.

    Expects ``response.meta['info']`` to be a ``(province, city)`` tuple and
    follows the "next page" link with the same meta.
    """
    province, city = response.meta.get("info")
    dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
    for dl in dls:
        # BUG FIX: create a fresh item per <dl>. The original built ONE item
        # outside the loop, so fields absent from a listing (e.g. 'year')
        # silently leaked in from the previous listing, and the same mutable
        # object was yielded repeatedly.
        item = ESFHouseItem(province=province, city=city)
        item['name'] = dl.xpath(".//p[@class='add_shop']/a/@title").get()
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        infos = [re.sub(r"\s", "", x) for x in infos]
        for info in infos:
            if "厅" in info:
                item['rooms'] = info
            elif "层" in info:
                item["floor"] = info
            elif "向" in info:
                item['toward'] = info
            elif "年" in info:
                item['year'] = re.sub(r"建", "", info)
            elif "㎡" in info:
                item['area'] = info
        item['address'] = dl.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        item['price'] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span[@class='red']//text()").
            getall())
        item['unit'] = "".join(
            dl.xpath(
                ".//dd[@class='price_right']/span[2]/text()").getall())
        origin_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
        item['origin_url'] = response.urljoin(origin_url)
        yield item
    next_url = response.xpath("//div[@class='page_al']/p[1]/a/@href").get()
    # BUG FIX: check for None BEFORE joining; the original joined first and
    # checked afterwards, which fails on the last page.
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esfhoust,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse one listing page, yielding an ``ESFHouseItem`` per <dl>.

    Expects ``response.meta['info']`` to be a ``(province, city)`` tuple.
    """
    province, city = response.meta.get("info")
    dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        # BUG FIX: use relative './/' selectors. The original's absolute '//'
        # searched the whole document, so every dl repeated the FIRST
        # listing's name and infos.
        item['name'] = dl.xpath(".//p[@class='add_shop']/a/text()").get()
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        infos = [re.sub(r"\s", "", x) for x in infos]
        for info in infos:
            if "厅" in info:
                item['rooms'] = info
            elif '层' in info:
                # BUG FIX: the floor description was overwriting 'rooms'.
                item['floor'] = info
            elif '向' in info:
                item['toward'] = info
            elif '㎡' in info:
                item['area'] = info
            else:
                item['year'] = info.replace("建筑年代:", "")
        item['address'] = dl.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        item['price'] = dl.xpath(
            ".//dd[@class='price_right']/span[@class='red']//text()"
        ).getall()
        item['unit'] = dl.xpath(
            ".//dd[@class='price_right']/span[last()]/text()").getall()
        item['origin_url'] = response.urljoin(
            dl.xpath(".//dd/h4[@class='clearfix']/a/@href").get())
        yield item
    next_url = response.xpath(
        "//div[@id='list_D10_15']/p[1]/a/@href").get()
    if next_url:  # guard: no next page on the last listing page
        # BUG FIX: the original passed the builtin `property` instead of
        # `province`, corrupting the meta for every subsequent page.
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Yield an ``ESFHouseItem`` for every listing <dl> on this page.

    ``response.meta['info']`` supplies the ``(province, city)`` pair; the
    next page, when present, is scheduled with the same callback and meta.
    """
    province, city = response.meta.get('info')
    for listing in response.xpath("//div[@class='shop_list shop_list_4']/dl"):
        item = ESFHouseItem(province=province, city=city)
        item['name'] = listing.xpath(".//p[@class='add_shop']/a/text()").get()
        raw_parts = listing.xpath(".//p[@class='tel_shop']/text()").getall()
        # Classify each whitespace-stripped fragment by its marker character.
        for text in (re.sub(r'\s', '', part) for part in raw_parts):
            if '厅' in text:
                key = 'rooms'
            elif '层' in text:
                key = 'floor'
            elif '向' in text:
                key = 'toward'
            elif '㎡' in text:
                key = 'area'
            else:
                key, text = 'year', text.replace('年建', '')
            item[key] = text
        item['address'] = listing.xpath(
            ".//p[@class='add_shop']/span/text()").get()
        item['price'] = "".join(listing.xpath(
            ".//dd[@class='price_right']/span[1]//text()").getall())
        item['unit'] = listing.xpath(
            ".//dd[@class='price_right']/span[2]//text()").get()
        item['origin_url'] = response.urljoin(
            listing.xpath(".//dd/h4/a/@href").get())
        yield item

    next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})
def parse_esf(self, response):
    """Parse one page of listings and yield an ``ESFHouseItem`` per <dl>.

    ``response.meta['info']`` is a ``(province, city)`` tuple. Fields that a
    listing does not provide are simply left unset on the item.
    """
    province, city = response.meta.get('info')
    dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
    for dl in dls:
        item = ESFHouseItem(province=province, city=city)
        # Community name (idiom fix: `is None` guards replace the original's
        # `if x == None: pass else:` pyramids throughout this method).
        name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
        if name is not None:
            item['name'] = name.strip()
        # Combined description fragments, classified by marker character.
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        for info in (re.sub(r"\s", "", x) for x in infos):
            if "厅" in info:
                item['rooms'] = info
            elif '层' in info:
                item['floor'] = info
            elif '向' in info:
                item['toward'] = info
            elif '年' in info:
                item['year'] = info
            elif '㎡' in info:
                item['area'] = info
        address = dl.xpath(".//p[@class='add_shop']/span/text()").get()
        if address is not None:
            item['address'] = address
        price = dl.xpath(
            "./dd[@class='price_right']/span[1]/b/text()").getall()
        if price:
            item['price'] = "".join(price)
        unit = dl.xpath("./dd[@class='price_right']/span[2]/text()").get()
        if unit is not None:
            item['unit'] = unit
        detail_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
        if detail_url is not None:
            item['origin_url'] = response.urljoin(detail_url)
        yield item
    next_url = response.xpath(".//div[@class='page_al']/p/a/@href").get()
    # BUG FIX: guard against None — the original unconditionally joined and
    # requested, which breaks on the last page.
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})