def parse(self, response):
    """Parse a listing page and yield one ZufangItem per rental row.

    Bug fix: the original created a single ZufangItem before the loop and
    yielded the same mutable instance for every row, so every yielded
    reference aliased one object and downstream consumers holding them
    would see the last row's values repeated. A fresh item is now created
    per row; dead debug prints removed.
    """
    row = ".//div[@class='f-list-item ershoufang-list']/dl"
    title_list = response.xpath(row + "/dd[1]/a/text()").extract()
    lease_type_list = response.xpath(
        row + "/dd[2]/span[@class='first js-huxing']/text()").extract()
    # The address spans several child nodes; string(.) flattens them and
    # split()/join strips all internal whitespace.
    address_list = [
        ''.join(sel.xpath('string(.)').extract()[0].split())
        for sel in response.xpath(row + "/dd[3]/span")
    ]
    price_list = response.xpath(
        row + "/dd[5]/div[1]/span[1]/text()").extract()
    for title, lease_type, address, price in zip(
            title_list, lease_type_list, address_list, price_list):
        zf = ZufangItem()  # one item per row -- never share an instance
        zf['title'] = title
        zf['lease_type'] = lease_type
        zf['address'] = address
        zf['price'] = price
        yield zf
def parse(self, response):
    """Parse a 58.com-style rental list page; one ZufangItem per <li> row.

    Bug fix: the original indexed extract()[0] on every field, so a single
    row missing one field (e.g. no agent line) raised IndexError and
    aborted parsing of the entire page. extract_first() returns None for a
    missing field instead, letting the remaining rows through.
    """
    # Real listing rows carry a @logr attribute; the "apartments" class
    # variant matched nothing on this layout (left out, as in the original).
    for node in response.xpath('//ul[@class="listUl"]/li[@logr]'):
        item1 = ZufangItem()
        item1['title'] = node.xpath('.//h2/a/text()').extract_first()
        item1['info'] = node.xpath(
            './/p[contains(@class, "room")]/text()').extract_first()
        item1['address'] = node.xpath(
            './/p[contains(@class, "add")]/text()').extract_first()
        # Source of the listing: agency block (jjr) or private landlord
        # paragraph (geren), whichever the row has.
        item1['from_'] = node.xpath(
            './/div[@class="jjr"]/text()|.//p[@class="geren"]/text()'
        ).extract_first()
        item1['price'] = node.xpath(
            './/div[@class="money"]/b/text()').extract_first()
        yield item1
def parse(self, response):
    """Parse an Anjuke rental list page and follow pagination.

    Bug fixes:
      * One ZufangItem was created outside the loop and yielded repeatedly,
        so all yielded references aliased a single mutable object. A fresh
        item is now built per row.
      * On IndexError the original printed an empty line and left 'size'
        holding the previous row's value; it now stores None explicitly.
    """
    # NOTE: the trailing space in "zu-itemmod " matches the site's markup.
    for box in response.xpath('//div[@class="zu-itemmod "]'):
        item = ZufangItem()
        info = box.xpath('./div[@class="zu-info"]')
        item['title'] = info.xpath('./h3/a/text()').extract()
        size = info.xpath('./p[1]/text()').extract()
        # Index 1 is the area figure when present; None when the row is short.
        item['size'] = size[1] if len(size) > 1 else None
        item['price'] = box.xpath(
            './div[@class="zu-side"]/p/strong/text()').extract()
        place = info.xpath('./address/text()').extract()
        # Strip padding spaces and the trailing nbsp/newline artifacts.
        item['place'] = place[1].replace(' ', '').replace('\xa0\xa0\n', '')
        yield item
    next_page_url = response.xpath('//a[@class="aNxt"]/@href').extract_first()
    if next_page_url:
        yield Request(next_page_url, callback=self.parse)
def parse_item(self, response):
    """Grab each listing's title and follow its detail link."""
    for home in response.xpath("//div[@class='content']//li"):
        item = ZufangItem()
        item['title'] = home.xpath(".//h2/a/text()").extract_first()
        # The href is protocol-relative; prefix the scheme before following.
        detail_url = 'http:' + home.xpath(
            ".//div[@class='des']/h2/a/@href").extract_first()
        yield scrapy.Request(
            url=detail_url, callback=self.parse_next, meta={'item': item})
def parse(self, response):
    """Yield a (region, city) item plus a follow-up request per city link."""
    for region_sel in response.xpath('//div[@class="city-list"]/dl'):
        # The <dt> names the region; each <dd>/<a> is one city under it.
        region_name = region_sel.xpath("./dt/text()").extract()[0]
        for city_link in region_sel.xpath("./dd/a"):
            item = ZufangItem()
            item['region'] = region_name
            item['city'] = city_link.xpath('./text()').extract()[0]
            yield scrapy.Request(
                city_link.xpath('./@href').extract()[0],
                meta={"meta_1": item},
                callback=self.parse_city)
def parse_topic(self, response):
    """Build one ZufangItem for a Douban group topic page."""
    item = ZufangItem()
    item['url'] = response.url
    item['group_type'] = 'douban'
    # Field extraction is delegated to per-field helpers on this spider.
    item['title'] = self.get_title(response)
    item['author'] = self.get_author(response)
    item['description'] = self.get_description(response)
    item['create_time'] = self.get_create_time(response)
    yield item
def parse(self, response):
    """Parse a Ganji rental list page and follow pagination.

    Bug fix: a single ZufangItem was shared across every iteration (and
    also accumulated into an unused `items` list), so all yielded
    references aliased one mutable object. Each listing now gets its own
    item; the dead list, dead `scrapy.Selector` wrapper, and debug prints
    are removed.
    """
    for info in response.xpath('//div[@class="f-list-item ershoufang-list"]'):
        item = ZufangItem()
        item['title'] = info.xpath('./dl/dd[1]/a/text()').extract()[0]
        item['price'] = info.xpath(
            './dl/dd[5]/div[1]/span[1]/text()').extract()[0]
        # address: [district, sub-area, ...]; bare text node 3 carries the
        # remainder for two-part addresses.
        address = info.xpath('./dl/dd[3]/span/a/text()').extract()
        address_0 = address[0] + '区'
        address_else = info.xpath(
            'normalize-space(./dl/dd[3]/span/text()[3])').extract()
        # NOTE(review): `' ' not in address` tests list membership (an
        # element equal to a single space), not substrings -- preserved
        # as-is from the original.
        if len(address) > 2 and ' ' not in address:
            item['address'] = address_0 + '-'.join(address[1:3])
        elif len(address) == 2 and ' ' not in address:
            item['address'] = address_0 + ''.join(
                address[1]) + '-'.join(address_else)
        else:
            continue  # malformed address: skip this listing
        description = info.xpath(
            './dl/dd[2]/span[position()>1]/text()').extract()
        item['pattern'] = ','.join(description)
        item['type'] = info.xpath('./dl/dd[2]/span[1]/text()').extract()[0]
        # Lazy-loaded images keep the real URL in @data-original; fall back
        # to @src when absent.
        img_url = info.xpath('./dl/dt/div/a/img/@data-original').extract()
        if img_url:
            item['img'] = "".join(img_url)
        else:
            item['img'] = info.xpath('./dl/dt/div/a/img/@src').extract()[0]
        yield item
    # Pagination
    next_page = response.xpath(
        ".//div[@class='pageBox']/ul/li/a[@class='next']/@href"
    ).extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), self.parse)
def parse(self, response):
    """Yield (title, money) items from a listing page.

    Bug fix: the single shared ZufangItem is replaced with a fresh item
    per row, so yielded references no longer alias one mutable object.
    """
    titles = response.xpath(
        ".//div[@class='f-list-item ']/dl/dd[1]/a/text()").extract()
    prices = response.xpath(
        ".//div[@class='f-list-item ']/dl/dd[5]/div[1]/span[1]/text()"
    ).extract()
    for title, money in zip(titles, prices):
        zf = ZufangItem()
        zf['title'] = title
        zf['money'] = money
        yield zf
def parse_item(self, response):
    """Populate a ZufangItem from a single room detail page."""
    item = ZufangItem()
    item['name'] = response.xpath(
        '//*[contains(@class,"room-name")]/h1/text()').extract()
    item['url'] = response.url
    item['url_id'] = self.get_md5(response.url)
    item['ifhezu'] = response.xpath(
        '//*[contains(@class,"methodroom-rent")]/text()').extract()
    item['zhuangtai'] = self.remove_kongge(
        response.xpath(
            '//*[contains(@class,"room-title")]//text()').extract())
    item['money'] = response.xpath(
        '//*[contains(@class,"room-price-num")]/text()')[0].extract()
    item['one_money'] = self.remove_kongge(
        response.xpath(
            '//*[contains(@class,"room-price-sale")]//text()')[0].extract())
    # Detail box 1 labels, in document order: size, room number, type.
    box1 = response.xpath(
        '//*[contains(@class,"room-detail-box")] [1]//label[1]/text()')
    item['size'] = box1[0].extract()
    item['number'] = box1[1].extract()
    item['type'] = self.remove_kongge(box1[2].extract())
    # Detail box 2 labels: floor, orientation, ..., subway (index 6).
    box2 = response.xpath(
        '//*[contains(@class,"room-detail-box")] [2]//label[1]/text()')
    item['orientation'] = box2[1].extract()
    item['floor'] = box2[0].extract()
    item['location'] = response.xpath(
        '//*[contains(@class,"detail-roombox")]/@title').extract()
    item['subway'] = box2[6].extract()
    item['deploy'] = self.remove_kongge(
        response.xpath(
            '//*[contains(@class,"room-info-list")]/table/tr[2]//text()'
        ).extract())
    if self.if_roomie(response):
        item['roomie'] = self.get_roomie(response)
    return item
def parse_content(self, response):
    """Populate a ZufangItem from a detail page and yield it.

    Bug fix: the price XPath started with '///', which is not valid XPath
    syntax (libxml2 rejects the expression with an evaluation error), so
    this callback raised before yielding anything. Corrected to '//'.
    """
    item = ZufangItem()
    item['name'] = response.xpath(
        '//div[@class="wrapper"]/div/h3/text()').extract()[0]
    item['address'] = response.xpath(
        '//div[@class="box"]//dl[5]/dd/a/text()').extract()
    item['price'] = response.xpath(
        '//div[@class="box"]//dd/strong/span/text()').extract()[0]
    item['typehouse'] = response.xpath(
        '//div[@class="box"]//dl[4]/dd/text()').extract()[0]
    item['region'] = response.xpath(
        '//div[@class="box"]//dl[6]/dd/a/text()').extract()
    item['contacts'] = response.xpath(
        '//div[@class="rbox"]/div/div/h2/text()').extract()[0]
    item['phone'] = response.xpath(
        '//div[@class="rbox"]/div/p/text()').extract()[0]
    yield item
def parse_chuzu(self, response):
    """Parse a rental list page with best-effort per-field extraction.

    Bug fixes:
      * One ZufangItem was created before the loop and yielded for every
        row, so all yields aliased a single mutable object and a failed
        field silently kept the previous row's value. Fresh item per row.
      * Bare `except:` (which also swallows KeyboardInterrupt/SystemExit)
        narrowed to `except Exception`; the try/excepts around plain
        extract_first()/re_first() calls were dead (those return None
        rather than raise) and are removed.
      * Regex made a raw string to avoid the invalid-escape warning.
    """
    headers = self.headers.copy()
    next_page = response.xpath(
        '//*[@id="bottom_ad_li"]/div[2]/a[@class="next"]/@href'
    ).extract_first()
    # The first 10 <li> entries are ads/placeholders -- skip them.
    messages = response.xpath(
        '/html/body/div[3]/div[1]/div[5]/div[2]/ul/li')[10:]
    for message in messages:
        item = ZufangItem()
        item['href'] = message.xpath('./div[2]/h2/a/@href').extract_first()
        try:
            item['describe'] = message.xpath(
                './div[2]/h2/a/text()').extract_first().strip()
        except Exception:  # extract_first() was None -> .strip() failed
            item['describe'] = None
        # First whitespace-delimited token of the room description.
        item['room'] = message.xpath(
            './div[2]/p[1]/text()').re_first(r'(.*?)\s')
        try:
            location = message.xpath('./div[2]/p[2]/a').re('>(.*?)</a>')
            item['location'] = ' '.join(location)
        except Exception:
            item['location'] = None
        item['price'] = message.xpath(
            './div[3]/div[2]/b/text()').extract_first()
        yield item
    if next_page:
        time.sleep(30)  # throttle before fetching the next page
        yield Request(next_page, headers=headers, callback=self.parse_chuzu)
def parse_content(self, response):
    """Fill a ZufangItem from a house detail page and yield it."""
    item = ZufangItem()
    # Most fields live under the same description container.
    desc = '//div[@class="house-desc-item fl c_333"]'
    item['name'] = response.xpath(
        '//div[@class="house-title"]/h1/text()').extract()[0]
    item['address'] = response.xpath(
        desc + '/ul/li[3]/span/text()').extract()
    item['price'] = response.xpath(
        desc + '/div/span/b/text()').extract()[0]
    item['typehouse'] = response.xpath(
        desc + '/ul/li[1]/span/text()').extract()
    item['region'] = response.xpath(
        desc + '/ul/li[4]/span/a/text()').extract()[0]
    item['contacts'] = response.xpath(
        '//div[@class="house-agent-info fr"]/p/a/text()').extract()[0]
    item['phone'] = response.xpath(
        '//div[@class="house-chat-phone"]/span/text()').extract()[0]
    yield item
def parse(self, response):
    """Parse a list page via lxml and yield one ZufangItem per row.

    Cleanup: removed the debug print statements, the docstring that was
    just a paste of raw XPaths, and the dead trailing `pass`. The
    extraction XPaths themselves are unchanged (whitespace around '/' is
    legal XPath).
    """
    tree = etree.HTML(response.text)
    rows = tree.xpath(
        '/ html / body / div[5] / div / div[5] / div[2] / ul / li')
    for row in rows:
        # Each field is a (possibly empty) list of matched text nodes.
        name = row.xpath('./ div[2] / h2 / a/text()')
        price = row.xpath('./ div[3] / div[2] / b/text()')
        community = row.xpath('./ div[2] / p[2] / text()')
        decorate = row.xpath('./ div[2] / p[1]/text()')
        yield ZufangItem(name=name, price=price,
                         community=community, decorate=decorate)
def detail_page(self, response):
    """Scrape one Lianjia rental detail page into a ZufangItem.

    Bug fix: the item was populated with `item[field] = eval(field)`,
    executing every field name as code -- an eval smell, and fragile. A
    snapshot of locals() is indexed instead, resolving the same local
    names without evaluating anything. The over-broad `except Exception`
    around the image lookup is narrowed to the IndexError that
    extract()[0] actually raises.
    """
    city = response.xpath(
        '//div[@class="fl l-txt"]/a[2]/text()').extract()[0][:-2]
    address_temp = response.xpath(
        '//div[@class="zf-room"]/p[7]/a/text()').extract()
    address = ''.join(address_temp)
    name = response.xpath(
        '//div[@class="zf-room"]/p[6]/a/text()').extract()[0]
    price_temp = response.xpath(
        '//div[@class="price "]//span[1]/text()').extract()
    price = ''.join(price_temp)
    price_num = price_temp[0]
    area = response.xpath(
        '//div[@class="zf-room"]/p[1]/text()').extract()[0]
    area_num = area[:-2]  # drop the unit suffix
    # Local deliberately named `type` (shadows the builtin) so it matches
    # the item's field name for the locals() lookup below.
    type = response.xpath(
        '//div[@class="zf-room"]/p[2]/text()').extract()[0].replace(' ', '')
    floor = response.xpath(
        '//div[@class="zf-room"]/p[3]/text()').extract()[0]
    direction = response.xpath(
        '//div[@class="zf-room"]/p[4]/text()').extract()[0]
    date_temp = response.xpath(
        '//div[@class="zf-room"]/p[8]/text()').extract()[0]
    insert_time = self.detail_date(date_temp)
    detail_page = response.url
    try:
        img = response.xpath(
            '//div[@class="thumbnail"]/ul/li[1]/@data-src').extract()[0]
    except IndexError:  # no thumbnail on this listing
        img = 'none'
    source = 'lianjia'
    # Every ZufangItem field is expected to have a same-named local above;
    # a missing one now raises KeyError instead of eval's NameError.
    values = locals()
    item = ZufangItem()
    for field in item.fields:
        item[field] = values[field]
    yield item